
[v5,3/4] KVM: Fix races in irqfd using new eventfd_kref_get interface

Message ID 20090625132826.26748.15607.stgit@dev.haskins.net (mailing list archive)
State New, archived

Commit Message

Gregory Haskins June 25, 2009, 1:28 p.m. UTC
eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
"release" callback.  This lets eventfd clients know if the eventfd is about
to go away and is very useful particularly for in-kernel clients.  However,
until recently it was not possible to use this feature of eventfd in a
race-free way.  This patch utilizes a new eventfd interface to rectify
the problem.

Note that one final race is known to exist: the slow-work thread may race
with module removal.  We are currently working with slow-work upstream
to fix this issue as well.  Since the code prior to this patch also
races with module_put(), we are not making anything worse, but rather
shifting the cause of the race.  Once the slow-work code is patched we
will be fixing the last remaining issue.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 include/linux/kvm_host.h |    7 +-
 virt/kvm/Kconfig         |    1 
 virt/kvm/eventfd.c       |  199 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 179 insertions(+), 28 deletions(-)



Comments

Gregory Haskins June 26, 2009, 2:05 p.m. UTC | #1
Gregory Haskins wrote:
> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
> "release" callback.  This lets eventfd clients know if the eventfd is about
> to go away and is very useful particularly for in-kernel clients.  However,
> until recently it is not possible to use this feature of eventfd in a
> race-free way.  This patch utilizes a new eventfd interface to rectify
> the problem.
>
> Note that one final race is known to exist: the slow-work thread may race
> with module removal.  We are currently working with slow-work upstream
> to fix this issue as well.  Since the code prior to this patch also
> races with module_put(), we are not making anything worse, but rather
> shifting the cause of the race.  Once the slow-work code is patched we
> will be fixing the last remaining issue.
>
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
>
>  include/linux/kvm_host.h |    7 +-
>  virt/kvm/Kconfig         |    1 
>  virt/kvm/eventfd.c       |  199 ++++++++++++++++++++++++++++++++++++++++------
>  3 files changed, 179 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 2451f48..d94ee72 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -141,7 +141,12 @@ struct kvm {
>  	struct kvm_io_bus mmio_bus;
>  	struct kvm_io_bus pio_bus;
>  #ifdef CONFIG_HAVE_KVM_EVENTFD
> -	struct list_head irqfds;
> +	struct {
> +		spinlock_t        lock;
> +		struct list_head  items;
> +		atomic_t          outstanding;
> +		wait_queue_head_t wqh;
> +	} irqfds;
>  #endif
>  	struct kvm_vm_stat stat;
>  	struct kvm_arch arch;
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index daece36..ab7848a 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -9,6 +9,7 @@ config HAVE_KVM_IRQCHIP
>  config HAVE_KVM_EVENTFD
>         bool
>         select EVENTFD
> +       select SLOW_WORK
>  
>  config KVM_APIC_ARCHITECTURE
>         bool
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 9656027..ca21e8a 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -28,6 +28,7 @@
>  #include <linux/file.h>
>  #include <linux/list.h>
>  #include <linux/eventfd.h>
> +#include <linux/slow-work.h>
>  
>  /*
>   * --------------------------------------------------------------------
> @@ -36,17 +37,36 @@
>   * Credit goes to Avi Kivity for the original idea.
>   * --------------------------------------------------------------------
>   */
> +
>  struct _irqfd {
>  	struct kvm               *kvm;
> +	struct eventfd_ctx       *eventfd;
>  	int                       gsi;
>  	struct list_head          list;
>  	poll_table                pt;
>  	wait_queue_head_t        *wqh;
>  	wait_queue_t              wait;
>  	struct work_struct        inject;
> +	struct slow_work          shutdown;
> +	int                       active:1;
>  };
>  
>  static void
> +irqfd_release(struct _irqfd *irqfd)
> +{
> +	eventfd_ctx_put(irqfd->eventfd);
> +	kfree(irqfd);
> +}
> +
> +/* assumes kvm->irqfds.lock is held */
> +static void
> +irqfd_deactivate(struct _irqfd *irqfd)
> +{
> +	irqfd->active = false;
> +	list_del(&irqfd->list);
> +}
> +
> +static void
>  irqfd_inject(struct work_struct *work)
>  {
>  	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> @@ -58,6 +78,55 @@ irqfd_inject(struct work_struct *work)
>  	mutex_unlock(&kvm->irq_lock);
>  }
>  
> +static struct _irqfd *
> +work_to_irqfd(struct slow_work *work)
> +{
> +	return container_of(work, struct _irqfd, shutdown);
> +}
> +
> +static int
> +irqfd_shutdown_get_ref(struct slow_work *work)
> +{
> +	struct _irqfd *irqfd = work_to_irqfd(work);
> +	struct kvm *kvm = irqfd->kvm;
> +
> +	atomic_inc(&kvm->irqfds.outstanding);
> +
> +	return 0;
> +}
> +
> +static void
> +irqfd_shutdown_put_ref(struct slow_work *work)
> +{
> +	struct _irqfd *irqfd = work_to_irqfd(work);
> +	struct kvm *kvm = irqfd->kvm;
> +
> +	irqfd_release(irqfd);
> +
> +	if (atomic_dec_and_test(&kvm->irqfds.outstanding))
> +		wake_up(&kvm->irqfds.wqh);
> +}
> +
> +static void
> +irqfd_shutdown_execute(struct slow_work *work)
> +{
> +	struct _irqfd *irqfd = work_to_irqfd(work);
> +
> +	/*
> +	 * Ensure there are no outstanding "inject" work-items before we blow
> +	 * away our state.  Once this job completes, the slow_work
> +	 * infrastructure will drop the irqfd object completely via put_ref
> +	 */
> +	flush_work(&irqfd->inject);
> +}
> +
> +const static struct slow_work_ops irqfd_shutdown_work_ops = {
> +	.get_ref = irqfd_shutdown_get_ref,
> +	.put_ref = irqfd_shutdown_put_ref,
> +	.execute = irqfd_shutdown_execute,
> +};
> +
> +
>  static int
>  irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
>  {
> @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
>  	unsigned long flags = (unsigned long)key;
>  
>  	/*
> -	 * Assume we will be called with interrupts disabled
> +	 * Called with interrupts disabled
>  	 */
>  	if (flags & POLLIN)
> -		/*
> -		 * Defer the IRQ injection until later since we need to
> -		 * acquire the kvm->lock to do so.
> -		 */
> +		/* An event has been signaled, inject an interrupt */
>  		schedule_work(&irqfd->inject);
>  
>  	if (flags & POLLHUP) {
> -		/*
> -		 * for now, just remove ourselves from the list and let
> -		 * the rest dangle.  We will fix this up later once
> -		 * the races in eventfd are fixed
> -		 */
> +		/* The eventfd is closing, detach from KVM */
> +		struct kvm *kvm = irqfd->kvm;
> +		unsigned long flags;
> +
>  		__remove_wait_queue(irqfd->wqh, &irqfd->wait);
> -		irqfd->wqh = NULL;
> +
> +		spin_lock_irqsave(&kvm->irqfds.lock, flags);
> +
> +		if (irqfd->active) {
> +			/*
> +			 * If the item is still active we can be sure that
> +			 * no-one else is trying to shutdown this object at
> +			 * the same time.
> +			 *
> +			 * Defer the shutdown to a thread so we can flush
> +			 * all remaining inject jobs.  We use a slow-work
> +			 * item to prevent a deadlock against the work-queue
> +			 */
> +			irqfd_deactivate(irqfd);
> +			slow_work_enqueue(&irqfd->shutdown);
> +		}
> +
> +		spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
>  	}
>  
> +
>  	return 0;
>  }
>  
> @@ -102,6 +185,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>  {
>  	struct _irqfd *irqfd;
>  	struct file *file = NULL;
> +	struct eventfd_ctx *eventfd = NULL;
>  	int ret;
>  	unsigned int events;
>  
> @@ -113,6 +197,8 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>  	irqfd->gsi = gsi;
>  	INIT_LIST_HEAD(&irqfd->list);
>  	INIT_WORK(&irqfd->inject, irqfd_inject);
> +	slow_work_init(&irqfd->shutdown, &irqfd_shutdown_work_ops);
> +	irqfd->active = true;
>  
>  	file = eventfd_fget(fd);
>  	if (IS_ERR(file)) {
> @@ -129,12 +215,21 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>  
>  	events = file->f_op->poll(file, &irqfd->pt);
>  
> -	mutex_lock(&kvm->lock);
> -	list_add_tail(&irqfd->list, &kvm->irqfds);
> -	mutex_unlock(&kvm->lock);
> +	spin_lock_irq(&kvm->irqfds.lock);
> +	list_add_tail(&irqfd->list, &kvm->irqfds.items);
> +	spin_unlock_irq(&kvm->irqfds.lock);
> +
> +	eventfd = eventfd_ctx_fileget(file);
> +	if (IS_ERR(file)) {
> +		ret = PTR_ERR(file);
> +		goto fail;
> +	}
> +
> +	irqfd->eventfd = eventfd;
>   

<sigh> I just noticed this while doing a self review:  I need to assign
the eventfd context *before* putting the item on the list.  Not sure why
I even did that.  I suspect I re-arranged the code at the last minute
and didn't notice what a dumb thing I was doing.
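
Roughly what I had intended is to grab the eventfd context first and only
then publish the irqfd, i.e. something like this (untested sketch, just to
illustrate the ordering -- and while I'm at it, the error check should be
against the ctx, not the file):

eventfd = eventfd_ctx_fileget(file);
if (IS_ERR(eventfd)) {
	ret = PTR_ERR(eventfd);
	goto fail;
}

irqfd->eventfd = eventfd;

/* only now make the irqfd visible to the rest of KVM */
spin_lock_irq(&kvm->irqfds.lock);
list_add_tail(&irqfd->list, &kvm->irqfds.items);
spin_unlock_irq(&kvm->irqfds.lock);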

So this will need at least a v6, but I will wait to hear if there are
any other comments from Michael et al.

-Greg

>  
>  	/*
> -	 * Check if there was an event already queued
> +	 * Check if there was an event already pending on the eventfd
> +	 * before we registered, and trigger it as if we didn't miss it.
>  	 */
>  	if (events & POLLIN)
>  		schedule_work(&irqfd->inject);
> @@ -148,6 +243,9 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>  	return 0;
>  
>  fail:
> +	if (eventfd && !IS_ERR(eventfd))
> +		eventfd_ctx_put(eventfd);
> +
>  	if (file && !IS_ERR(file))
>  		fput(file);
>  
> @@ -158,24 +256,71 @@ fail:
>  void
>  kvm_irqfd_init(struct kvm *kvm)
>  {
> -	INIT_LIST_HEAD(&kvm->irqfds);
> +	slow_work_register_user();
> +
> +	spin_lock_init(&kvm->irqfds.lock);
> +	INIT_LIST_HEAD(&kvm->irqfds.items);
> +	atomic_set(&kvm->irqfds.outstanding, 0);
> +	init_waitqueue_head(&kvm->irqfds.wqh);
> +}
> +
> +static struct _irqfd *
> +irqfd_pop(struct kvm *kvm)
> +{
> +	struct _irqfd *irqfd = NULL;
> +
> +	spin_lock_irq(&kvm->irqfds.lock);
> +
> +	if (!list_empty(&kvm->irqfds.items)) {
> +		irqfd = list_first_entry(&kvm->irqfds.items,
> +					 struct _irqfd, list);
> +		irqfd_deactivate(irqfd);
> +	}
> +
> +	spin_unlock_irq(&kvm->irqfds.lock);
> +
> +	return irqfd;
> +}
> +
> +/*
> + * locally releases the irqfd
> + *
> + * This function is called when KVM won the race with eventfd (signalled by
> + * finding the item active on the kvm->irqfds.item list). We are now guaranteed
> + * that we will never schedule a deferred shutdown task against this object,
> + * so we take steps to perform the shutdown ourselves.
> + *
> + * 1) We must remove ourselves from the wait-queue to prevent further events,
> + *    which will simultaneously act to sync us with eventfd (via wqh->lock)
> + * 2) Flush any outstanding inject-tasks to ensure its safe to free memory
> + * 3) Delete the object
> + */
> +static void
> +irqfd_shutdown(struct _irqfd *irqfd)
> +{
> +	remove_wait_queue(irqfd->wqh, &irqfd->wait);
> +	flush_work(&irqfd->inject);
> +	irqfd_release(irqfd);
>  }
>  
>  void
>  kvm_irqfd_release(struct kvm *kvm)
>  {
> -	struct _irqfd *irqfd, *tmp;
> -
> -	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
> -		if (irqfd->wqh)
> -			remove_wait_queue(irqfd->wqh, &irqfd->wait);
> +	struct _irqfd *irqfd;
>  
> -		flush_work(&irqfd->inject);
> +	/*
> +	 * Shutdown all irqfds that still remain
> +	 */
> +	while ((irqfd = irqfd_pop(kvm)))
> +		irqfd_shutdown(irqfd);
>  
> -		mutex_lock(&kvm->lock);
> -		list_del(&irqfd->list);
> -		mutex_unlock(&kvm->lock);
> +	/*
> +	 * irqfds.outstanding tracks the number of outstanding "shutdown"
> +	 * jobs pending at any given time.  Once we get here, we know that
> +	 * no more jobs will get scheduled, so go ahead and block until all
> +	 * of them complete
> +	 */
> +	wait_event(kvm->irqfds.wqh, (!atomic_read(&kvm->irqfds.outstanding)));
>  
> -		kfree(irqfd);
> -	}
> +	slow_work_unregister_user();
>  }
>
Michael S. Tsirkin June 28, 2009, 11:06 a.m. UTC | #2
On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
> @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
>  	unsigned long flags = (unsigned long)key;
>  
>  	/*
> -	 * Assume we will be called with interrupts disabled
> +	 * Called with interrupts disabled
>  	 */
>  	if (flags & POLLIN)
> -		/*
> -		 * Defer the IRQ injection until later since we need to
> -		 * acquire the kvm->lock to do so.
> -		 */
> +		/* An event has been signaled, inject an interrupt */
>  		schedule_work(&irqfd->inject);
>  
>  	if (flags & POLLHUP) {
> -		/*
> -		 * for now, just remove ourselves from the list and let
> -		 * the rest dangle.  We will fix this up later once
> -		 * the races in eventfd are fixed
> -		 */
> +		/* The eventfd is closing, detach from KVM */
> +		struct kvm *kvm = irqfd->kvm;
> +		unsigned long flags;
> +
>  		__remove_wait_queue(irqfd->wqh, &irqfd->wait);
> -		irqfd->wqh = NULL;
> +
> +		spin_lock_irqsave(&kvm->irqfds.lock, flags);
> +
> +		if (irqfd->active) {
> +			/*
> +			 * If the item is still active we can be sure that
> +			 * no-one else is trying to shutdown this object at
> +			 * the same time.
> +			 *
> +			 * Defer the shutdown to a thread so we can flush
> +			 * all remaining inject jobs.  We use a slow-work
> +			 * item to prevent a deadlock against the work-queue
> +			 */
> +			irqfd_deactivate(irqfd);
> +			slow_work_enqueue(&irqfd->shutdown);

Greg, in your patch for slow-work module removal, you write:
  "Callers must ensure that their module has at least
  one reference held while the work is enqueued."
  Where does this guarantee come from, in this case?
Michael S. Tsirkin June 28, 2009, 11:48 a.m. UTC | #3
On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
> "release" callback.  This lets eventfd clients know if the eventfd is about
> to go away and is very useful particularly for in-kernel clients.  However,
> until recently it is not possible to use this feature of eventfd in a
> race-free way.  This patch utilizes a new eventfd interface to rectify
> the problem.
> 
> Note that one final race is known to exist: the slow-work thread may race
> with module removal.  We are currently working with slow-work upstream
> to fix this issue as well.  Since the code prior to this patch also
> races with module_put(), we are not making anything worse, but rather
> shifting the cause of the race.  Once the slow-work code is patched we
> will be fixing the last remaining issue.

By the way, why are we using slow-work here? Wouldn't a regular
workqueue do just as well, with less code, and avoid the race?
Gregory Haskins June 28, 2009, 12:50 p.m. UTC | #4
Michael S. Tsirkin wrote:
> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
>   
>> @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
>>  	unsigned long flags = (unsigned long)key;
>>  
>>  	/*
>> -	 * Assume we will be called with interrupts disabled
>> +	 * Called with interrupts disabled
>>  	 */
>>  	if (flags & POLLIN)
>> -		/*
>> -		 * Defer the IRQ injection until later since we need to
>> -		 * acquire the kvm->lock to do so.
>> -		 */
>> +		/* An event has been signaled, inject an interrupt */
>>  		schedule_work(&irqfd->inject);
>>  
>>  	if (flags & POLLHUP) {
>> -		/*
>> -		 * for now, just remove ourselves from the list and let
>> -		 * the rest dangle.  We will fix this up later once
>> -		 * the races in eventfd are fixed
>> -		 */
>> +		/* The eventfd is closing, detach from KVM */
>> +		struct kvm *kvm = irqfd->kvm;
>> +		unsigned long flags;
>> +
>>  		__remove_wait_queue(irqfd->wqh, &irqfd->wait);
>> -		irqfd->wqh = NULL;
>> +
>> +		spin_lock_irqsave(&kvm->irqfds.lock, flags);
>> +
>> +		if (irqfd->active) {
>> +			/*
>> +			 * If the item is still active we can be sure that
>> +			 * no-one else is trying to shutdown this object at
>> +			 * the same time.
>> +			 *
>> +			 * Defer the shutdown to a thread so we can flush
>> +			 * all remaining inject jobs.  We use a slow-work
>> +			 * item to prevent a deadlock against the work-queue
>> +			 */
>> +			irqfd_deactivate(irqfd);
>> +			slow_work_enqueue(&irqfd->shutdown);
>>     
>
> Greg, in your patch for slow-work module removal, you write:
>   "Callers must ensure that their module has at least
>   one reference held while the work is enqueued."
>   Where does this guarantee come from, in this case?
>   
The general guarantee comes from the fact that modules naturally have to
have a reference to be able to call the enqueue function to begin with,
or the calling function was already racy.  In this particular case, we
can guarantee that the kvm vm fd is held while our slow-work is active,
and all slow work is flushed before it is released.  (I guess I am
assuming that VFS takes a module reference when an fd is opened, but I
have not verified that it actually does.  If it doesn't, I suppose KVM
is already racy w.r.t. unloading, independent of my patches)

-Greg
Gregory Haskins June 28, 2009, 12:53 p.m. UTC | #5
Michael S. Tsirkin wrote:
> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
>   
>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
>> "release" callback.  This lets eventfd clients know if the eventfd is about
>> to go away and is very useful particularly for in-kernel clients.  However,
>> until recently it is not possible to use this feature of eventfd in a
>> race-free way.  This patch utilizes a new eventfd interface to rectify
>> the problem.
>>
>> Note that one final race is known to exist: the slow-work thread may race
>> with module removal.  We are currently working with slow-work upstream
>> to fix this issue as well.  Since the code prior to this patch also
>> races with module_put(), we are not making anything worse, but rather
>> shifting the cause of the race.  Once the slow-work code is patched we
>> will be fixing the last remaining issue.
>>     
>
> By the way, why are we using slow-work here? Wouldn't a regular
> workqueue do just as well, with less code, and avoid the race?
>
>   
I believe it will cause a problem if you do a "flush_work()" from inside
a work-item.  I could be wrong, of course, but it looks like a recipe to
deadlock.

-Greg
Michael S. Tsirkin June 28, 2009, 12:56 p.m. UTC | #6
On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
> >   
> >> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
> >> "release" callback.  This lets eventfd clients know if the eventfd is about
> >> to go away and is very useful particularly for in-kernel clients.  However,
> >> until recently it is not possible to use this feature of eventfd in a
> >> race-free way.  This patch utilizes a new eventfd interface to rectify
> >> the problem.
> >>
> >> Note that one final race is known to exist: the slow-work thread may race
> >> with module removal.  We are currently working with slow-work upstream
> >> to fix this issue as well.  Since the code prior to this patch also
> >> races with module_put(), we are not making anything worse, but rather
> >> shifting the cause of the race.  Once the slow-work code is patched we
> >> will be fixing the last remaining issue.
> >>     
> >
> > By the way, why are we using slow-work here? Wouldn't a regular
> > workqueue do just as well, with less code, and avoid the race?
> >
> >   
> I believe it will cause a problem if you do a "flush_work()" from inside
> a work-item.  I could be wrong, of course, but it looks like a recipe to
> deadlock.
> 
> -Greg
> 

Sure, but the idea is to only flush on kvm close, never from work item.
Michael S. Tsirkin June 28, 2009, 12:57 p.m. UTC | #7
On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
> > Michael S. Tsirkin wrote:
> > > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
> > >   
> > >> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
> > >> "release" callback.  This lets eventfd clients know if the eventfd is about
> > >> to go away and is very useful particularly for in-kernel clients.  However,
> > >> until recently it is not possible to use this feature of eventfd in a
> > >> race-free way.  This patch utilizes a new eventfd interface to rectify
> > >> the problem.
> > >>
> > >> Note that one final race is known to exist: the slow-work thread may race
> > >> with module removal.  We are currently working with slow-work upstream
> > >> to fix this issue as well.  Since the code prior to this patch also
> > >> races with module_put(), we are not making anything worse, but rather
> > >> shifting the cause of the race.  Once the slow-work code is patched we
> > >> will be fixing the last remaining issue.
> > >>     
> > >
> > > By the way, why are we using slow-work here? Wouldn't a regular
> > > workqueue do just as well, with less code, and avoid the race?
> > >
> > >   
> > I believe it will cause a problem if you do a "flush_work()" from inside
> > a work-item.  I could be wrong, of course, but it looks like a recipe to
> > deadlock.
> > 
> > -Greg
> > 
> 
> Sure, but the idea is to only flush on kvm close, never from work item.

To clarify, you don't flush slow works from a work-item,
so you shouldn't need to flush workqueue either.
Michael S. Tsirkin June 28, 2009, 1:18 p.m. UTC | #8
On Sun, Jun 28, 2009 at 08:50:28AM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
> >   
> >> @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> >>  	unsigned long flags = (unsigned long)key;
> >>  
> >>  	/*
> >> -	 * Assume we will be called with interrupts disabled
> >> +	 * Called with interrupts disabled
> >>  	 */
> >>  	if (flags & POLLIN)
> >> -		/*
> >> -		 * Defer the IRQ injection until later since we need to
> >> -		 * acquire the kvm->lock to do so.
> >> -		 */
> >> +		/* An event has been signaled, inject an interrupt */
> >>  		schedule_work(&irqfd->inject);
> >>  
> >>  	if (flags & POLLHUP) {
> >> -		/*
> >> -		 * for now, just remove ourselves from the list and let
> >> -		 * the rest dangle.  We will fix this up later once
> >> -		 * the races in eventfd are fixed
> >> -		 */
> >> +		/* The eventfd is closing, detach from KVM */
> >> +		struct kvm *kvm = irqfd->kvm;
> >> +		unsigned long flags;
> >> +
> >>  		__remove_wait_queue(irqfd->wqh, &irqfd->wait);
> >> -		irqfd->wqh = NULL;
> >> +
> >> +		spin_lock_irqsave(&kvm->irqfds.lock, flags);
> >> +
> >> +		if (irqfd->active) {
> >> +			/*
> >> +			 * If the item is still active we can be sure that
> >> +			 * no-one else is trying to shutdown this object at
> >> +			 * the same time.
> >> +			 *
> >> +			 * Defer the shutdown to a thread so we can flush
> >> +			 * all remaining inject jobs.  We use a slow-work
> >> +			 * item to prevent a deadlock against the work-queue
> >> +			 */
> >> +			irqfd_deactivate(irqfd);
> >> +			slow_work_enqueue(&irqfd->shutdown);
> >>     
> >
> > Greg, in your patch for slow-work module removal, you write:
> >   "Callers must ensure that their module has at least
> >   one reference held while the work is enqueued."
> >   Where does this guarantee come from, in this case?
> >   
> The general guarantee comes from the fact that modules naturally have to
> have a reference to be able to call the enqueue function to begin with,
> or the calling function was already racy.  In this particular case, we
> can guarantee that the kvm vm fd is held while our slow-work is active,
> and all slow work is flushed before it is released.  (I guess I am
> assuming that VFS takes a module reference when an fd is opened, but I
> have not verified that it actually does.  If it doesn't, I suppose KVM
> is already racy w.r.t. unloading, independent of my patches)
> 
> -Greg
> 

that could be the case, as we have, for example:

static struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
        .compat_ioctl   = kvm_vm_ioctl,
        .mmap           = kvm_vm_mmap,
};

with no owner field.

Avi, shouldn't we initialize the owner field to prevent
kvm module from going away while files are open?
Michael S. Tsirkin June 28, 2009, 1:20 p.m. UTC | #9
On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote:
> > On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
> > > Michael S. Tsirkin wrote:
> > > > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
> > > >   
> > > >> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
> > > >> "release" callback.  This lets eventfd clients know if the eventfd is about
> > > >> to go away and is very useful particularly for in-kernel clients.  However,
> > > >> until recently it is not possible to use this feature of eventfd in a
> > > >> race-free way.  This patch utilizes a new eventfd interface to rectify
> > > >> the problem.
> > > >>
> > > >> Note that one final race is known to exist: the slow-work thread may race
> > > >> with module removal.  We are currently working with slow-work upstream
> > > >> to fix this issue as well.  Since the code prior to this patch also
> > > >> races with module_put(), we are not making anything worse, but rather
> > > >> shifting the cause of the race.  Once the slow-work code is patched we
> > > >> will be fixing the last remaining issue.
> > > >>     
> > > >
> > > > By the way, why are we using slow-work here? Wouldn't a regular
> > > > workqueue do just as well, with less code, and avoid the race?
> > > >
> > > >   
> > > I believe it will cause a problem if you do a "flush_work()" from inside
> > > a work-item.  I could be wrong, of course, but it looks like a recipe to
> > > deadlock.
> > > 
> > > -Greg
> > > 
> > 
> > Sure, but the idea is to only flush on kvm close, never from work item.
> 
> To clarify, you don't flush slow works from a work-item,
> so you shouldn't need to flush workqueue either.

I guess my question is - why is slow work different? It's still
a thread pool underneath ...
Avi Kivity June 28, 2009, 1:25 p.m. UTC | #10
On 06/28/2009 04:18 PM, Michael S. Tsirkin wrote:
>
> that could be the case, as we have, for example:
>
> static struct file_operations kvm_vm_fops = {
>          .release        = kvm_vm_release,
>          .unlocked_ioctl = kvm_vm_ioctl,
>          .compat_ioctl   = kvm_vm_ioctl,
>          .mmap           = kvm_vm_mmap,
> };
>
> with no owner field.
>
> Avi, shouldn't we initialize the owner field to prevent
> kvm module from going away while files are open?
>    

We do initialize it:

kvm_chardev_ops.owner = module;
kvm_vm_fops.owner = module;
kvm_vcpu_fops.owner = module;

The reason it's not done through the initializer is that we set the 
owner to the vendor module (e.g. kvm-intel.ko) so that you can't remove 
the vendor module when a guest is running.
Gregory Haskins June 28, 2009, 4:25 p.m. UTC | #11
Michael S. Tsirkin wrote:
> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
>>>   
>>>       
>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
>>>> "release" callback.  This lets eventfd clients know if the eventfd is about
>>>> to go away and is very useful particularly for in-kernel clients.  However,
>>>> until recently it is not possible to use this feature of eventfd in a
>>>> race-free way.  This patch utilizes a new eventfd interface to rectify
>>>> the problem.
>>>>
>>>> Note that one final race is known to exist: the slow-work thread may race
>>>> with module removal.  We are currently working with slow-work upstream
>>>> to fix this issue as well.  Since the code prior to this patch also
>>>> races with module_put(), we are not making anything worse, but rather
>>>> shifting the cause of the race.  Once the slow-work code is patched we
>>>> will be fixing the last remaining issue.
>>>>     
>>>>         
>>> By the way, why are we using slow-work here? Wouldn't a regular
>>> workqueue do just as well, with less code, and avoid the race?
>>>
>>>   
>>>       
>> I believe it will cause a problem if you do a "flush_work()" from inside
>> a work-item.  I could be wrong, of course, but it looks like a recipe to
>> deadlock.
>>
>> -Greg
>>
>>     
>
> Sure, but the idea is to only flush on kvm close, never from work item.
>
>   
The point of the flush on the eventfd side is to make sure we
synchronize with outstanding injects before we free the irqfd.

-Greg
Gregory Haskins June 28, 2009, 4:28 p.m. UTC | #12
Michael S. Tsirkin wrote:
> On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote:
>   
>> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote:
>>     
>>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
>>>       
>>>> Michael S. Tsirkin wrote:
>>>>         
>>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
>>>>>   
>>>>>           
>>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
>>>>>> "release" callback.  This lets eventfd clients know if the eventfd is about
>>>>>> to go away and is very useful particularly for in-kernel clients.  However,
>>>>>> until recently it is not possible to use this feature of eventfd in a
>>>>>> race-free way.  This patch utilizes a new eventfd interface to rectify
>>>>>> the problem.
>>>>>>
>>>>>> Note that one final race is known to exist: the slow-work thread may race
>>>>>> with module removal.  We are currently working with slow-work upstream
>>>>>> to fix this issue as well.  Since the code prior to this patch also
>>>>>> races with module_put(), we are not making anything worse, but rather
>>>>>> shifting the cause of the race.  Once the slow-work code is patched we
>>>>>> will be fixing the last remaining issue.
>>>>>>     
>>>>>>             
>>>>> By the way, why are we using slow-work here? Wouldn't a regular
>>>>> workqueue do just as well, with less code, and avoid the race?
>>>>>
>>>>>   
>>>>>           
>>>> I believe it will cause a problem if you do a "flush_work()" from inside
>>>> a work-item.  I could be wrong, of course, but it looks like a recipe to
>>>> deadlock.
>>>>
>>>> -Greg
>>>>
>>>>         
>>> Sure, but the idea is to only flush on kvm close, never from work item.
>>>       
>> To clarify, you don't flush slow works from a work-item,
>> so you shouldn't need to flush workqueue either.
>>     
>
> I guess my question is - why is slow work different? It's still
> a thread pool underneath ...
>
>   
It's not interdependent.  Flush-work blocks the calling thread; if that
thread happens to be the work-queue thread you may deadlock, preventing it
from processing further jobs like the inject.  In reality it shouldn't be
possible, but it's just a bad idea to assume it's OK.  Slow-work, on the
other hand, will just make a new thread.
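
To illustrate the concern concretely (purely hypothetical -- pretend the
shutdown were an ordinary work-item on the same shared queue as the inject):

static void
irqfd_shutdown_deadlock(struct work_struct *work)
{
	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);

	/*
	 * We are running on a keventd thread.  If irqfd->inject happens to
	 * be queued behind us on that same thread, flush_work() waits for
	 * a job that cannot run until we return: a self-deadlock.  Putting
	 * the shutdown on slow-work (or a separate queue) keeps the flush
	 * and the inject off the same thread.
	 */
	flush_work(&irqfd->inject);
	irqfd_release(irqfd);
}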

-Greg
Michael S. Tsirkin June 28, 2009, 7:07 p.m. UTC | #13
On Sun, Jun 28, 2009 at 12:28:19PM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote:
> >   
> >> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote:
> >>     
> >>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
> >>>       
> >>>> Michael S. Tsirkin wrote:
> >>>>         
> >>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
> >>>>>   
> >>>>>           
> >>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
> >>>>>> "release" callback.  This lets eventfd clients know if the eventfd is about
> >>>>>> to go away and is very useful particularly for in-kernel clients.  However,
> >>>>>> until recently it is not possible to use this feature of eventfd in a
> >>>>>> race-free way.  This patch utilizes a new eventfd interface to rectify
> >>>>>> the problem.
> >>>>>>
> >>>>>> Note that one final race is known to exist: the slow-work thread may race
> >>>>>> with module removal.  We are currently working with slow-work upstream
> >>>>>> to fix this issue as well.  Since the code prior to this patch also
> >>>>>> races with module_put(), we are not making anything worse, but rather
> >>>>>> shifting the cause of the race.  Once the slow-work code is patched we
> >>>>>> will be fixing the last remaining issue.
> >>>>>>     
> >>>>>>             
> >>>>> By the way, why are we using slow-work here? Wouldn't a regular
> >>>>> workqueue do just as well, with less code, and avoid the race?
> >>>>>
> >>>>>   
> >>>>>           
> >>>> I believe it will cause a problem if you do a "flush_work()" from inside
> >>>> a work-item.  I could be wrong, of course, but it looks like a recipe to
> >>>> deadlock.
> >>>>
> >>>> -Greg
> >>>>
> >>>>         
> >>> Sure, but the idea is to only flush on kvm close, never from work item.
> >>>       
> >> To clarify, you don't flush slow works from a work-item,
> >> so you shouldn't need to flush workqueue either.
> >>     
> >
> > I guess my question is - why is slow work different? It's still
> > a thread pool underneath ...
> >
> >   
> Its not interdependent.  Flush-work blocks the thread..if the thread
> happens to be the work-queue thread you may deadlock preventing it from
> processing further jobs like the inject.  In reality it shouldnt be
> possible, but its just a bad idea to assume its ok.
> Slow work, on the
> other hand, will just make a new thread.
> 
> -Greg
> 

But if you create your own workqueue, and all you do there is destroy
irqfds, things are ok I think. Right?



Gregory Haskins June 28, 2009, 7:54 p.m. UTC | #14
Michael S. Tsirkin wrote:
> On Sun, Jun 28, 2009 at 12:28:19PM -0400, Gregory Haskins wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote:
>>>   
>>>       
>>>> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote:
>>>>     
>>>>         
>>>>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
>>>>>       
>>>>>           
>>>>>> Michael S. Tsirkin wrote:
>>>>>>         
>>>>>>             
>>>>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
>>>>>>>   
>>>>>>>           
>>>>>>>               
>>>>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
>>>>>>>> "release" callback.  This lets eventfd clients know if the eventfd is about
>>>>>>>> to go away and is very useful particularly for in-kernel clients.  However,
>>>>>>>> until recently it is not possible to use this feature of eventfd in a
>>>>>>>> race-free way.  This patch utilizes a new eventfd interface to rectify
>>>>>>>> the problem.
>>>>>>>>
>>>>>>>> Note that one final race is known to exist: the slow-work thread may race
>>>>>>>> with module removal.  We are currently working with slow-work upstream
>>>>>>>> to fix this issue as well.  Since the code prior to this patch also
>>>>>>>> races with module_put(), we are not making anything worse, but rather
>>>>>>>> shifting the cause of the race.  Once the slow-work code is patched we
>>>>>>>> will be fixing the last remaining issue.
>>>>>>>>     
>>>>>>>>             
>>>>>>>>                 
>>>>>>> By the way, why are we using slow-work here? Wouldn't a regular
>>>>>>> workqueue do just as well, with less code, and avoid the race?
>>>>>>>
>>>>>>>   
>>>>>>>           
>>>>>>>               
>>>>>> I believe it will cause a problem if you do a "flush_work()" from inside
>>>>>> a work-item.  I could be wrong, of course, but it looks like a recipe to
>>>>>> deadlock.
>>>>>>
>>>>>> -Greg
>>>>>>
>>>>>>         
>>>>>>             
>>>>> Sure, but the idea is to only flush on kvm close, never from work item.
>>>>>       
>>>>>           
>>>> To clarify, you don't flush slow works from a work-item,
>>>> so you shouldn't need to flush workqueue either.
>>>>     
>>>>         
>>> I guess my question is - why is slow work different? It's still
>>> a thread pool underneath ...
>>>
>>>   
>>>       
>> Its not interdependent.  Flush-work blocks the thread..if the thread
>> happens to be the work-queue thread you may deadlock preventing it from
>> processing further jobs like the inject.  In reality it shouldnt be
>> possible, but its just a bad idea to assume its ok.
>> Slow work, on the
>> other hand, will just make a new thread.
>>
>> -Greg
>>
>>     
>
> But if you create your own workqueue, and all you do there is destroy
> irqfds, things are ok I think. Right?
>   

Yep, creating your own queue works too.  I picked slow-work as an
alternative to generating a dedicated resource, but I agree either method
would work fine.  Do you have a preference?

Regards,
-Greg
>
>
>
Michael S. Tsirkin June 28, 2009, 8:07 p.m. UTC | #15
On Sun, Jun 28, 2009 at 03:54:42PM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > On Sun, Jun 28, 2009 at 12:28:19PM -0400, Gregory Haskins wrote:
> >   
> >> Michael S. Tsirkin wrote:
> >>     
> >>> On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote:
> >>>   
> >>>       
> >>>> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote:
> >>>>     
> >>>>         
> >>>>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
> >>>>>       
> >>>>>           
> >>>>>> Michael S. Tsirkin wrote:
> >>>>>>         
> >>>>>>             
> >>>>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
> >>>>>>>   
> >>>>>>>           
> >>>>>>>               
> >>>>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
> >>>>>>>> "release" callback.  This lets eventfd clients know if the eventfd is about
> >>>>>>>> to go away and is very useful particularly for in-kernel clients.  However,
> >>>>>>>> until recently it is not possible to use this feature of eventfd in a
> >>>>>>>> race-free way.  This patch utilizes a new eventfd interface to rectify
> >>>>>>>> the problem.
> >>>>>>>>
> >>>>>>>> Note that one final race is known to exist: the slow-work thread may race
> >>>>>>>> with module removal.  We are currently working with slow-work upstream
> >>>>>>>> to fix this issue as well.  Since the code prior to this patch also
> >>>>>>>> races with module_put(), we are not making anything worse, but rather
> >>>>>>>> shifting the cause of the race.  Once the slow-work code is patched we
> >>>>>>>> will be fixing the last remaining issue.
> >>>>>>>>     
> >>>>>>>>             
> >>>>>>>>                 
> >>>>>>> By the way, why are we using slow-work here? Wouldn't a regular
> >>>>>>> workqueue do just as well, with less code, and avoid the race?
> >>>>>>>
> >>>>>>>   
> >>>>>>>           
> >>>>>>>               
> >>>>>> I believe it will cause a problem if you do a "flush_work()" from inside
> >>>>>> a work-item.  I could be wrong, of course, but it looks like a recipe to
> >>>>>> deadlock.
> >>>>>>
> >>>>>> -Greg
> >>>>>>
> >>>>>>         
> >>>>>>             
> >>>>> Sure, but the idea is to only flush on kvm close, never from work item.
> >>>>>       
> >>>>>           
> >>>> To clarify, you don't flush slow works from a work-item,
> >>>> so you shouldn't need to flush workqueue either.
> >>>>     
> >>>>         
> >>> I guess my question is - why is slow work different? It's still
> >>> a thread pool underneath ...
> >>>
> >>>   
> >>>       
> >> Its not interdependent.  Flush-work blocks the thread..if the thread
> >> happens to be the work-queue thread you may deadlock preventing it from
> >> processing further jobs like the inject.  In reality it shouldnt be
> >> possible, but its just a bad idea to assume its ok.
> >> Slow work, on the
> >> other hand, will just make a new thread.
> >>
> >> -Greg
> >>
> >>     
> >
> > But if you create your own workqueue, and all you do there is destroy
> > irqfds, things are ok I think. Right?
> >   
> 
> Yep, creating your own queue works too.  I picked slow-work as an
> alternate to generating a dedicated resource, but I agree either method
> would work fine.  Do you have a preference? 
> 
> Regards,
> -Greg

It's not something I lose sleep about, but I think workqueue might be
less code: for example, you can just flush it instead of using your own
counter. And possibly things can be further simplified by making the
workqueue single-threaded and always doing deassign from there.
Gregory Haskins June 28, 2009, 8:17 p.m. UTC | #16
Michael S. Tsirkin wrote:
> On Sun, Jun 28, 2009 at 03:54:42PM -0400, Gregory Haskins wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Sun, Jun 28, 2009 at 12:28:19PM -0400, Gregory Haskins wrote:
>>>   
>>>       
>>>> Michael S. Tsirkin wrote:
>>>>     
>>>>         
>>>>> On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote:
>>>>>   
>>>>>       
>>>>>           
>>>>>> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote:
>>>>>>     
>>>>>>         
>>>>>>             
>>>>>>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote:
>>>>>>>       
>>>>>>>           
>>>>>>>               
>>>>>>>> Michael S. Tsirkin wrote:
>>>>>>>>         
>>>>>>>>             
>>>>>>>>                 
>>>>>>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote:
>>>>>>>>>   
>>>>>>>>>           
>>>>>>>>>               
>>>>>>>>>                   
>>>>>>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a
>>>>>>>>>> "release" callback.  This lets eventfd clients know if the eventfd is about
>>>>>>>>>> to go away and is very useful particularly for in-kernel clients.  However,
>>>>>>>>>> until recently it is not possible to use this feature of eventfd in a
>>>>>>>>>> race-free way.  This patch utilizes a new eventfd interface to rectify
>>>>>>>>>> the problem.
>>>>>>>>>>
>>>>>>>>>> Note that one final race is known to exist: the slow-work thread may race
>>>>>>>>>> with module removal.  We are currently working with slow-work upstream
>>>>>>>>>> to fix this issue as well.  Since the code prior to this patch also
>>>>>>>>>> races with module_put(), we are not making anything worse, but rather
>>>>>>>>>> shifting the cause of the race.  Once the slow-work code is patched we
>>>>>>>>>> will be fixing the last remaining issue.
>>>>>>>>>>     
>>>>>>>>>>             
>>>>>>>>>>                 
>>>>>>>>>>                     
>>>>>>>>> By the way, why are we using slow-work here? Wouldn't a regular
>>>>>>>>> workqueue do just as well, with less code, and avoid the race?
>>>>>>>>>
>>>>>>>>>   
>>>>>>>>>           
>>>>>>>>>               
>>>>>>>>>                   
>>>>>>>> I believe it will cause a problem if you do a "flush_work()" from inside
>>>>>>>> a work-item.  I could be wrong, of course, but it looks like a recipe to
>>>>>>>> deadlock.
>>>>>>>>
>>>>>>>> -Greg
>>>>>>>>
>>>>>>>>         
>>>>>>>>             
>>>>>>>>                 
>>>>>>> Sure, but the idea is to only flush on kvm close, never from work item.
>>>>>>>       
>>>>>>>           
>>>>>>>               
>>>>>> To clarify, you don't flush slow works from a work-item,
>>>>>> so you shouldn't need to flush workqueue either.
>>>>>>     
>>>>>>         
>>>>>>             
>>>>> I guess my question is - why is slow work different? It's still
>>>>> a thread pool underneath ...
>>>>>
>>>>>   
>>>>>       
>>>>>           
>>>> Its not interdependent.  Flush-work blocks the thread..if the thread
>>>> happens to be the work-queue thread you may deadlock preventing it from
>>>> processing further jobs like the inject.  In reality it shouldnt be
>>>> possible, but its just a bad idea to assume its ok.
>>>> Slow work, on the
>>>> other hand, will just make a new thread.
>>>>
>>>> -Greg
>>>>
>>>>     
>>>>         
>>> But if you create your own workqueue, and all you do there is destroy
>>> irqfds, things are ok I think. Right?
>>>   
>>>       
>> Yep, creating your own queue works too.  I picked slow-work as an
>> alternate to generating a dedicated resource, but I agree either method
>> would work fine.  Do you have a preference? 
>>
>> Regards,
>> -Greg
>>     
>
> It's not something I lose sleep about, but I think workqueue might be
> less code: for example, you can just flush it instead of using your own
> counter. And possibly things can be further simplified by making the
> workqueue single-threaded and always doing deassign from there.
>
>   
Yeah, it's not a huge deal.  The logic is probably slightly simpler with
a dedicated single-thread queue for shutdown, at the expense of having
the work-queue hang around mostly idle.  I'll code it up both ways so we
can compare.
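
For the workqueue flavor, I am picturing something along these lines
(rough, untested sketch; "shutdown" becomes a work_struct instead of a
slow_work, and irqfd_cleanup_wq would be created once at init time with
create_singlethread_workqueue()):

static struct workqueue_struct *irqfd_cleanup_wq;

static void
irqfd_shutdown(struct work_struct *work)
{
	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);

	/*
	 * The inject job runs on the shared kernel workqueue, not on
	 * irqfd_cleanup_wq, so flushing it from here cannot self-deadlock.
	 */
	flush_work(&irqfd->inject);
	irqfd_release(irqfd);
}

/*
 * The POLLHUP handler would then do
 *     queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
 * in place of slow_work_enqueue().
 */

void
kvm_irqfd_release(struct kvm *kvm)
{
	struct _irqfd *irqfd;

	while ((irqfd = irqfd_pop(kvm))) {
		remove_wait_queue(irqfd->wqh, &irqfd->wait);
		queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
	}

	/* replaces the irqfds.outstanding counter and wait-queue */
	flush_workqueue(irqfd_cleanup_wq);
}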

Thanks Michael,
-Greg

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2451f48..d94ee72 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -141,7 +141,12 @@  struct kvm {
 	struct kvm_io_bus mmio_bus;
 	struct kvm_io_bus pio_bus;
 #ifdef CONFIG_HAVE_KVM_EVENTFD
-	struct list_head irqfds;
+	struct {
+		spinlock_t        lock;
+		struct list_head  items;
+		atomic_t          outstanding;
+		wait_queue_head_t wqh;
+	} irqfds;
 #endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index daece36..ab7848a 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -9,6 +9,7 @@  config HAVE_KVM_IRQCHIP
 config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
+       select SLOW_WORK
 
 config KVM_APIC_ARCHITECTURE
        bool
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 9656027..ca21e8a 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -28,6 +28,7 @@ 
 #include <linux/file.h>
 #include <linux/list.h>
 #include <linux/eventfd.h>
+#include <linux/slow-work.h>
 
 /*
  * --------------------------------------------------------------------
@@ -36,17 +37,36 @@ 
  * Credit goes to Avi Kivity for the original idea.
  * --------------------------------------------------------------------
  */
+
 struct _irqfd {
 	struct kvm               *kvm;
+	struct eventfd_ctx       *eventfd;
 	int                       gsi;
 	struct list_head          list;
 	poll_table                pt;
 	wait_queue_head_t        *wqh;
 	wait_queue_t              wait;
 	struct work_struct        inject;
+	struct slow_work          shutdown;
+	int                       active:1;
 };
 
 static void
+irqfd_release(struct _irqfd *irqfd)
+{
+	eventfd_ctx_put(irqfd->eventfd);
+	kfree(irqfd);
+}
+
+/* assumes kvm->irqfds.lock is held */
+static void
+irqfd_deactivate(struct _irqfd *irqfd)
+{
+	irqfd->active = false;
+	list_del(&irqfd->list);
+}
+
+static void
 irqfd_inject(struct work_struct *work)
 {
 	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
@@ -58,6 +78,55 @@  irqfd_inject(struct work_struct *work)
 	mutex_unlock(&kvm->irq_lock);
 }
 
+static struct _irqfd *
+work_to_irqfd(struct slow_work *work)
+{
+	return container_of(work, struct _irqfd, shutdown);
+}
+
+static int
+irqfd_shutdown_get_ref(struct slow_work *work)
+{
+	struct _irqfd *irqfd = work_to_irqfd(work);
+	struct kvm *kvm = irqfd->kvm;
+
+	atomic_inc(&kvm->irqfds.outstanding);
+
+	return 0;
+}
+
+static void
+irqfd_shutdown_put_ref(struct slow_work *work)
+{
+	struct _irqfd *irqfd = work_to_irqfd(work);
+	struct kvm *kvm = irqfd->kvm;
+
+	irqfd_release(irqfd);
+
+	if (atomic_dec_and_test(&kvm->irqfds.outstanding))
+		wake_up(&kvm->irqfds.wqh);
+}
+
+static void
+irqfd_shutdown_execute(struct slow_work *work)
+{
+	struct _irqfd *irqfd = work_to_irqfd(work);
+
+	/*
+	 * Ensure there are no outstanding "inject" work-items before we blow
+	 * away our state.  Once this job completes, the slow_work
+	 * infrastructure will drop the irqfd object completely via put_ref
+	 */
+	flush_work(&irqfd->inject);
+}
+
+const static struct slow_work_ops irqfd_shutdown_work_ops = {
+	.get_ref = irqfd_shutdown_get_ref,
+	.put_ref = irqfd_shutdown_put_ref,
+	.execute = irqfd_shutdown_execute,
+};
+
+
 static int
 irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
@@ -65,25 +134,39 @@  irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	unsigned long flags = (unsigned long)key;
 
 	/*
-	 * Assume we will be called with interrupts disabled
+	 * Called with interrupts disabled
 	 */
 	if (flags & POLLIN)
-		/*
-		 * Defer the IRQ injection until later since we need to
-		 * acquire the kvm->lock to do so.
-		 */
+		/* An event has been signaled, inject an interrupt */
 		schedule_work(&irqfd->inject);
 
 	if (flags & POLLHUP) {
-		/*
-		 * for now, just remove ourselves from the list and let
-		 * the rest dangle.  We will fix this up later once
-		 * the races in eventfd are fixed
-		 */
+		/* The eventfd is closing, detach from KVM */
+		struct kvm *kvm = irqfd->kvm;
+		unsigned long flags;
+
 		__remove_wait_queue(irqfd->wqh, &irqfd->wait);
-		irqfd->wqh = NULL;
+
+		spin_lock_irqsave(&kvm->irqfds.lock, flags);
+
+		if (irqfd->active) {
+			/*
+			 * If the item is still active we can be sure that
+			 * no-one else is trying to shutdown this object at
+			 * the same time.
+			 *
+			 * Defer the shutdown to a thread so we can flush
+			 * all remaining inject jobs.  We use a slow-work
+			 * item to prevent a deadlock against the work-queue
+			 */
+			irqfd_deactivate(irqfd);
+			slow_work_enqueue(&irqfd->shutdown);
+		}
+
+		spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
 	}
 
+
 	return 0;
 }
 
@@ -102,6 +185,7 @@  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 {
 	struct _irqfd *irqfd;
 	struct file *file = NULL;
+	struct eventfd_ctx *eventfd = NULL;
 	int ret;
 	unsigned int events;
 
@@ -113,6 +197,8 @@  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 	irqfd->gsi = gsi;
 	INIT_LIST_HEAD(&irqfd->list);
 	INIT_WORK(&irqfd->inject, irqfd_inject);
+	slow_work_init(&irqfd->shutdown, &irqfd_shutdown_work_ops);
+	irqfd->active = true;
 
 	file = eventfd_fget(fd);
 	if (IS_ERR(file)) {
@@ -129,12 +215,21 @@  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 
 	events = file->f_op->poll(file, &irqfd->pt);
 
-	mutex_lock(&kvm->lock);
-	list_add_tail(&irqfd->list, &kvm->irqfds);
-	mutex_unlock(&kvm->lock);
+	spin_lock_irq(&kvm->irqfds.lock);
+	list_add_tail(&irqfd->list, &kvm->irqfds.items);
+	spin_unlock_irq(&kvm->irqfds.lock);
+
+	eventfd = eventfd_ctx_fileget(file);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto fail;
+	}
+
+	irqfd->eventfd = eventfd;
 
 	/*
-	 * Check if there was an event already queued
+	 * Check if there was an event already pending on the eventfd
+	 * before we registered, and trigger it as if we didn't miss it.
 	 */
 	if (events & POLLIN)
 		schedule_work(&irqfd->inject);
@@ -148,6 +243,9 @@  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 	return 0;
 
 fail:
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+
 	if (file && !IS_ERR(file))
 		fput(file);
 
@@ -158,24 +256,71 @@  fail:
 void
 kvm_irqfd_init(struct kvm *kvm)
 {
-	INIT_LIST_HEAD(&kvm->irqfds);
+	slow_work_register_user();
+
+	spin_lock_init(&kvm->irqfds.lock);
+	INIT_LIST_HEAD(&kvm->irqfds.items);
+	atomic_set(&kvm->irqfds.outstanding, 0);
+	init_waitqueue_head(&kvm->irqfds.wqh);
+}
+
+static struct _irqfd *
+irqfd_pop(struct kvm *kvm)
+{
+	struct _irqfd *irqfd = NULL;
+
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	if (!list_empty(&kvm->irqfds.items)) {
+		irqfd = list_first_entry(&kvm->irqfds.items,
+					 struct _irqfd, list);
+		irqfd_deactivate(irqfd);
+	}
+
+	spin_unlock_irq(&kvm->irqfds.lock);
+
+	return irqfd;
+}
+
+/*
+ * locally releases the irqfd
+ *
+ * This function is called when KVM won the race with eventfd (signalled by
+ * finding the item active on the kvm->irqfds.item list). We are now guaranteed
+ * that we will never schedule a deferred shutdown task against this object,
+ * so we take steps to perform the shutdown ourselves.
+ *
+ * 1) We must remove ourselves from the wait-queue to prevent further events,
+ *    which will simultaneously act to sync us with eventfd (via wqh->lock)
+ * 2) Flush any outstanding inject-tasks to ensure its safe to free memory
+ * 3) Delete the object
+ */
+static void
+irqfd_shutdown(struct _irqfd *irqfd)
+{
+	remove_wait_queue(irqfd->wqh, &irqfd->wait);
+	flush_work(&irqfd->inject);
+	irqfd_release(irqfd);
 }
 
 void
 kvm_irqfd_release(struct kvm *kvm)
 {
-	struct _irqfd *irqfd, *tmp;
-
-	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
-		if (irqfd->wqh)
-			remove_wait_queue(irqfd->wqh, &irqfd->wait);
+	struct _irqfd *irqfd;
 
-		flush_work(&irqfd->inject);
+	/*
+	 * Shutdown all irqfds that still remain
+	 */
+	while ((irqfd = irqfd_pop(kvm)))
+		irqfd_shutdown(irqfd);
 
-		mutex_lock(&kvm->lock);
-		list_del(&irqfd->list);
-		mutex_unlock(&kvm->lock);
+	/*
+	 * irqfds.outstanding tracks the number of outstanding "shutdown"
+	 * jobs pending at any given time.  Once we get here, we know that
+	 * no more jobs will get scheduled, so go ahead and block until all
+	 * of them complete
+	 */
+	wait_event(kvm->irqfds.wqh, (!atomic_read(&kvm->irqfds.outstanding)));
 
-		kfree(irqfd);
-	}
+	slow_work_unregister_user();
 }