Message ID | 20090625132826.26748.15607.stgit@dev.haskins.net (mailing list archive)
---|---
State | New, archived
Gregory Haskins wrote: > eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a > "release" callback. This lets eventfd clients know if the eventfd is about > to go away and is very useful particularly for in-kernel clients. However, > until recently it is not possible to use this feature of eventfd in a > race-free way. This patch utilizes a new eventfd interface to rectify > the problem. > > Note that one final race is known to exist: the slow-work thread may race > with module removal. We are currently working with slow-work upstream > to fix this issue as well. Since the code prior to this patch also > races with module_put(), we are not making anything worse, but rather > shifting the cause of the race. Once the slow-work code is patched we > will be fixing the last remaining issue. > > Signed-off-by: Gregory Haskins <ghaskins@novell.com> > --- > > include/linux/kvm_host.h | 7 +- > virt/kvm/Kconfig | 1 > virt/kvm/eventfd.c | 199 ++++++++++++++++++++++++++++++++++++++++------ > 3 files changed, 179 insertions(+), 28 deletions(-) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 2451f48..d94ee72 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -141,7 +141,12 @@ struct kvm { > struct kvm_io_bus mmio_bus; > struct kvm_io_bus pio_bus; > #ifdef CONFIG_HAVE_KVM_EVENTFD > - struct list_head irqfds; > + struct { > + spinlock_t lock; > + struct list_head items; > + atomic_t outstanding; > + wait_queue_head_t wqh; > + } irqfds; > #endif > struct kvm_vm_stat stat; > struct kvm_arch arch; > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig > index daece36..ab7848a 100644 > --- a/virt/kvm/Kconfig > +++ b/virt/kvm/Kconfig > @@ -9,6 +9,7 @@ config HAVE_KVM_IRQCHIP > config HAVE_KVM_EVENTFD > bool > select EVENTFD > + select SLOW_WORK > > config KVM_APIC_ARCHITECTURE > bool > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c > index 9656027..ca21e8a 100644 > --- a/virt/kvm/eventfd.c > +++ b/virt/kvm/eventfd.c > @@ -28,6 +28,7 @@ > #include <linux/file.h> > #include <linux/list.h> > #include <linux/eventfd.h> > +#include <linux/slow-work.h> > > /* > * -------------------------------------------------------------------- > @@ -36,17 +37,36 @@ > * Credit goes to Avi Kivity for the original idea. 
> * -------------------------------------------------------------------- > */ > + > struct _irqfd { > struct kvm *kvm; > + struct eventfd_ctx *eventfd; > int gsi; > struct list_head list; > poll_table pt; > wait_queue_head_t *wqh; > wait_queue_t wait; > struct work_struct inject; > + struct slow_work shutdown; > + int active:1; > }; > > static void > +irqfd_release(struct _irqfd *irqfd) > +{ > + eventfd_ctx_put(irqfd->eventfd); > + kfree(irqfd); > +} > + > +/* assumes kvm->irqfds.lock is held */ > +static void > +irqfd_deactivate(struct _irqfd *irqfd) > +{ > + irqfd->active = false; > + list_del(&irqfd->list); > +} > + > +static void > irqfd_inject(struct work_struct *work) > { > struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); > @@ -58,6 +78,55 @@ irqfd_inject(struct work_struct *work) > mutex_unlock(&kvm->irq_lock); > } > > +static struct _irqfd * > +work_to_irqfd(struct slow_work *work) > +{ > + return container_of(work, struct _irqfd, shutdown); > +} > + > +static int > +irqfd_shutdown_get_ref(struct slow_work *work) > +{ > + struct _irqfd *irqfd = work_to_irqfd(work); > + struct kvm *kvm = irqfd->kvm; > + > + atomic_inc(&kvm->irqfds.outstanding); > + > + return 0; > +} > + > +static void > +irqfd_shutdown_put_ref(struct slow_work *work) > +{ > + struct _irqfd *irqfd = work_to_irqfd(work); > + struct kvm *kvm = irqfd->kvm; > + > + irqfd_release(irqfd); > + > + if (atomic_dec_and_test(&kvm->irqfds.outstanding)) > + wake_up(&kvm->irqfds.wqh); > +} > + > +static void > +irqfd_shutdown_execute(struct slow_work *work) > +{ > + struct _irqfd *irqfd = work_to_irqfd(work); > + > + /* > + * Ensure there are no outstanding "inject" work-items before we blow > + * away our state. Once this job completes, the slow_work > + * infrastructure will drop the irqfd object completely via put_ref > + */ > + flush_work(&irqfd->inject); > +} > + > +const static struct slow_work_ops irqfd_shutdown_work_ops = { > + .get_ref = irqfd_shutdown_get_ref, > + .put_ref = irqfd_shutdown_put_ref, > + .execute = irqfd_shutdown_execute, > +}; > + > + > static int > irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) > { > @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) > unsigned long flags = (unsigned long)key; > > /* > - * Assume we will be called with interrupts disabled > + * Called with interrupts disabled > */ > if (flags & POLLIN) > - /* > - * Defer the IRQ injection until later since we need to > - * acquire the kvm->lock to do so. > - */ > + /* An event has been signaled, inject an interrupt */ > schedule_work(&irqfd->inject); > > if (flags & POLLHUP) { > - /* > - * for now, just remove ourselves from the list and let > - * the rest dangle. We will fix this up later once > - * the races in eventfd are fixed > - */ > + /* The eventfd is closing, detach from KVM */ > + struct kvm *kvm = irqfd->kvm; > + unsigned long flags; > + > __remove_wait_queue(irqfd->wqh, &irqfd->wait); > - irqfd->wqh = NULL; > + > + spin_lock_irqsave(&kvm->irqfds.lock, flags); > + > + if (irqfd->active) { > + /* > + * If the item is still active we can be sure that > + * no-one else is trying to shutdown this object at > + * the same time. > + * > + * Defer the shutdown to a thread so we can flush > + * all remaining inject jobs. 
We use a slow-work > + * item to prevent a deadlock against the work-queue > + */ > + irqfd_deactivate(irqfd); > + slow_work_enqueue(&irqfd->shutdown); > + } > + > + spin_unlock_irqrestore(&kvm->irqfds.lock, flags); > } > > + > return 0; > } > > @@ -102,6 +185,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) > { > struct _irqfd *irqfd; > struct file *file = NULL; > + struct eventfd_ctx *eventfd = NULL; > int ret; > unsigned int events; > > @@ -113,6 +197,8 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) > irqfd->gsi = gsi; > INIT_LIST_HEAD(&irqfd->list); > INIT_WORK(&irqfd->inject, irqfd_inject); > + slow_work_init(&irqfd->shutdown, &irqfd_shutdown_work_ops); > + irqfd->active = true; > > file = eventfd_fget(fd); > if (IS_ERR(file)) { > @@ -129,12 +215,21 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) > > events = file->f_op->poll(file, &irqfd->pt); > > - mutex_lock(&kvm->lock); > - list_add_tail(&irqfd->list, &kvm->irqfds); > - mutex_unlock(&kvm->lock); > + spin_lock_irq(&kvm->irqfds.lock); > + list_add_tail(&irqfd->list, &kvm->irqfds.items); > + spin_unlock_irq(&kvm->irqfds.lock); > + > + eventfd = eventfd_ctx_fileget(file); > + if (IS_ERR(file)) { > + ret = PTR_ERR(file); > + goto fail; > + } > + > + irqfd->eventfd = eventfd; > <sigh> I just noticed this while doing a self review: I need to assign the eventfd context *before* putting the item on the list. Not sure why I even did that. I suspect I re-arranged the code at the last minute and didn't notice what a dumb thing I was doing. So this will need at least a v6, but I will wait to hear if there are any other comments from Michael et. al. -Greg > > /* > - * Check if there was an event already queued > + * Check if there was an event already pending on the eventfd > + * before we registered, and trigger it as if we didn't miss it. > */ > if (events & POLLIN) > schedule_work(&irqfd->inject); > @@ -148,6 +243,9 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) > return 0; > > fail: > + if (eventfd && !IS_ERR(eventfd)) > + eventfd_ctx_put(eventfd); > + > if (file && !IS_ERR(file)) > fput(file); > > @@ -158,24 +256,71 @@ fail: > void > kvm_irqfd_init(struct kvm *kvm) > { > - INIT_LIST_HEAD(&kvm->irqfds); > + slow_work_register_user(); > + > + spin_lock_init(&kvm->irqfds.lock); > + INIT_LIST_HEAD(&kvm->irqfds.items); > + atomic_set(&kvm->irqfds.outstanding, 0); > + init_waitqueue_head(&kvm->irqfds.wqh); > +} > + > +static struct _irqfd * > +irqfd_pop(struct kvm *kvm) > +{ > + struct _irqfd *irqfd = NULL; > + > + spin_lock_irq(&kvm->irqfds.lock); > + > + if (!list_empty(&kvm->irqfds.items)) { > + irqfd = list_first_entry(&kvm->irqfds.items, > + struct _irqfd, list); > + irqfd_deactivate(irqfd); > + } > + > + spin_unlock_irq(&kvm->irqfds.lock); > + > + return irqfd; > +} > + > +/* > + * locally releases the irqfd > + * > + * This function is called when KVM won the race with eventfd (signalled by > + * finding the item active on the kvm->irqfds.item list). We are now guaranteed > + * that we will never schedule a deferred shutdown task against this object, > + * so we take steps to perform the shutdown ourselves. 
> + * > + * 1) We must remove ourselves from the wait-queue to prevent further events, > + * which will simultaneously act to sync us with eventfd (via wqh->lock) > + * 2) Flush any outstanding inject-tasks to ensure its safe to free memory > + * 3) Delete the object > + */ > +static void > +irqfd_shutdown(struct _irqfd *irqfd) > +{ > + remove_wait_queue(irqfd->wqh, &irqfd->wait); > + flush_work(&irqfd->inject); > + irqfd_release(irqfd); > } > > void > kvm_irqfd_release(struct kvm *kvm) > { > - struct _irqfd *irqfd, *tmp; > - > - list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) { > - if (irqfd->wqh) > - remove_wait_queue(irqfd->wqh, &irqfd->wait); > + struct _irqfd *irqfd; > > - flush_work(&irqfd->inject); > + /* > + * Shutdown all irqfds that still remain > + */ > + while ((irqfd = irqfd_pop(kvm))) > + irqfd_shutdown(irqfd); > > - mutex_lock(&kvm->lock); > - list_del(&irqfd->list); > - mutex_unlock(&kvm->lock); > + /* > + * irqfds.outstanding tracks the number of outstanding "shutdown" > + * jobs pending at any given time. Once we get here, we know that > + * no more jobs will get scheduled, so go ahead and block until all > + * of them complete > + */ > + wait_event(kvm->irqfds.wqh, (!atomic_read(&kvm->irqfds.outstanding))); > > - kfree(irqfd); > - } > + slow_work_unregister_user(); > } > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >
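(For readers following Greg's "<sigh>" self-review above: the v6 fix he describes is a simple reordering inside kvm_irqfd(). Below is a minimal sketch of what that reordering might look like, derived only from the hunks in this patch and not from any posted v6 code; note it also checks IS_ERR() on the eventfd context rather than re-checking the already-validated file, which looks like a second small slip in the hunk above.)

	eventfd = eventfd_ctx_fileget(file);
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->eventfd = eventfd;

	/*
	 * Only publish the irqfd once ->eventfd is assigned, so a racing
	 * POLLHUP/shutdown can never find a listed item with an unassigned
	 * eventfd context.
	 */
	spin_lock_irq(&kvm->irqfds.lock);
	list_add_tail(&irqfd->list, &kvm->irqfds.items);
	spin_unlock_irq(&kvm->irqfds.lock);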
On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) > unsigned long flags = (unsigned long)key; > > /* > - * Assume we will be called with interrupts disabled > + * Called with interrupts disabled > */ > if (flags & POLLIN) > - /* > - * Defer the IRQ injection until later since we need to > - * acquire the kvm->lock to do so. > - */ > + /* An event has been signaled, inject an interrupt */ > schedule_work(&irqfd->inject); > > if (flags & POLLHUP) { > - /* > - * for now, just remove ourselves from the list and let > - * the rest dangle. We will fix this up later once > - * the races in eventfd are fixed > - */ > + /* The eventfd is closing, detach from KVM */ > + struct kvm *kvm = irqfd->kvm; > + unsigned long flags; > + > __remove_wait_queue(irqfd->wqh, &irqfd->wait); > - irqfd->wqh = NULL; > + > + spin_lock_irqsave(&kvm->irqfds.lock, flags); > + > + if (irqfd->active) { > + /* > + * If the item is still active we can be sure that > + * no-one else is trying to shutdown this object at > + * the same time. > + * > + * Defer the shutdown to a thread so we can flush > + * all remaining inject jobs. We use a slow-work > + * item to prevent a deadlock against the work-queue > + */ > + irqfd_deactivate(irqfd); > + slow_work_enqueue(&irqfd->shutdown); Greg, in your patch for slow-work module removal, you write: "Callers must ensure that their module has at least one reference held while the work is enqueued." Where does this guarantee come from, in this case?
On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a > "release" callback. This lets eventfd clients know if the eventfd is about > to go away and is very useful particularly for in-kernel clients. However, > until recently it is not possible to use this feature of eventfd in a > race-free way. This patch utilizes a new eventfd interface to rectify > the problem. > > Note that one final race is known to exist: the slow-work thread may race > with module removal. We are currently working with slow-work upstream > to fix this issue as well. Since the code prior to this patch also > races with module_put(), we are not making anything worse, but rather > shifting the cause of the race. Once the slow-work code is patched we > will be fixing the last remaining issue. By the way, why are we using slow-work here? Wouldn't a regular workqueue do just as well, with less code, and avoid the race?
Michael S. Tsirkin wrote: > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > >> @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) >> unsigned long flags = (unsigned long)key; >> >> /* >> - * Assume we will be called with interrupts disabled >> + * Called with interrupts disabled >> */ >> if (flags & POLLIN) >> - /* >> - * Defer the IRQ injection until later since we need to >> - * acquire the kvm->lock to do so. >> - */ >> + /* An event has been signaled, inject an interrupt */ >> schedule_work(&irqfd->inject); >> >> if (flags & POLLHUP) { >> - /* >> - * for now, just remove ourselves from the list and let >> - * the rest dangle. We will fix this up later once >> - * the races in eventfd are fixed >> - */ >> + /* The eventfd is closing, detach from KVM */ >> + struct kvm *kvm = irqfd->kvm; >> + unsigned long flags; >> + >> __remove_wait_queue(irqfd->wqh, &irqfd->wait); >> - irqfd->wqh = NULL; >> + >> + spin_lock_irqsave(&kvm->irqfds.lock, flags); >> + >> + if (irqfd->active) { >> + /* >> + * If the item is still active we can be sure that >> + * no-one else is trying to shutdown this object at >> + * the same time. >> + * >> + * Defer the shutdown to a thread so we can flush >> + * all remaining inject jobs. We use a slow-work >> + * item to prevent a deadlock against the work-queue >> + */ >> + irqfd_deactivate(irqfd); >> + slow_work_enqueue(&irqfd->shutdown); >> > > Greg, in your patch for slow-work module removal, you write: > "Callers must ensure that their module has at least > one reference held while the work is enqueued." > Where does this guarantee come from, in this case? > The general guarantee comes from the fact that modules naturally have to have a reference to be able to call the enqueue function to begin with, or the calling function was already racy. In this particular case, we can guarantee that the kvm vm fd is held while our slow-work is active, and all slow work is flushed before it is released. (I guess I am assuming that VFS takes a module reference when an fd is opened, but I have not verified that it actually does. If it doesn't, I suppose KVM is already racy w.r.t. unloading, independent of my patches) -Greg
Michael S. Tsirkin wrote: > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > >> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a >> "release" callback. This lets eventfd clients know if the eventfd is about >> to go away and is very useful particularly for in-kernel clients. However, >> until recently it is not possible to use this feature of eventfd in a >> race-free way. This patch utilizes a new eventfd interface to rectify >> the problem. >> >> Note that one final race is known to exist: the slow-work thread may race >> with module removal. We are currently working with slow-work upstream >> to fix this issue as well. Since the code prior to this patch also >> races with module_put(), we are not making anything worse, but rather >> shifting the cause of the race. Once the slow-work code is patched we >> will be fixing the last remaining issue. >> > > By the way, why are we using slow-work here? Wouldn't a regular > workqueue do just as well, with less code, and avoid the race? > > I believe it will cause a problem if you do a "flush_work()" from inside a work-item. I could be wrong, of course, but it looks like a recipe to deadlock. -Greg
On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: > Michael S. Tsirkin wrote: > > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > > > >> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a > >> "release" callback. This lets eventfd clients know if the eventfd is about > >> to go away and is very useful particularly for in-kernel clients. However, > >> until recently it is not possible to use this feature of eventfd in a > >> race-free way. This patch utilizes a new eventfd interface to rectify > >> the problem. > >> > >> Note that one final race is known to exist: the slow-work thread may race > >> with module removal. We are currently working with slow-work upstream > >> to fix this issue as well. Since the code prior to this patch also > >> races with module_put(), we are not making anything worse, but rather > >> shifting the cause of the race. Once the slow-work code is patched we > >> will be fixing the last remaining issue. > >> > > > > By the way, why are we using slow-work here? Wouldn't a regular > > workqueue do just as well, with less code, and avoid the race? > > > > > I believe it will cause a problem if you do a "flush_work()" from inside > a work-item. I could be wrong, of course, but it looks like a recipe to > deadlock. > > -Greg > Sure, but the idea is to only flush on kvm close, never from work item.
On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote: > On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: > > Michael S. Tsirkin wrote: > > > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > > > > > >> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a > > >> "release" callback. This lets eventfd clients know if the eventfd is about > > >> to go away and is very useful particularly for in-kernel clients. However, > > >> until recently it is not possible to use this feature of eventfd in a > > >> race-free way. This patch utilizes a new eventfd interface to rectify > > >> the problem. > > >> > > >> Note that one final race is known to exist: the slow-work thread may race > > >> with module removal. We are currently working with slow-work upstream > > >> to fix this issue as well. Since the code prior to this patch also > > >> races with module_put(), we are not making anything worse, but rather > > >> shifting the cause of the race. Once the slow-work code is patched we > > >> will be fixing the last remaining issue. > > >> > > > > > > By the way, why are we using slow-work here? Wouldn't a regular > > > workqueue do just as well, with less code, and avoid the race? > > > > > > > > I believe it will cause a problem if you do a "flush_work()" from inside > > a work-item. I could be wrong, of course, but it looks like a recipe to > > deadlock. > > > > -Greg > > > > Sure, but the idea is to only flush on kvm close, never from work item. To clarify, you don't flush slow works from a work-item, so you shouldn't need to flush workqueue either.
On Sun, Jun 28, 2009 at 08:50:28AM -0400, Gregory Haskins wrote: > Michael S. Tsirkin wrote: > > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > > > >> @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) > >> unsigned long flags = (unsigned long)key; > >> > >> /* > >> - * Assume we will be called with interrupts disabled > >> + * Called with interrupts disabled > >> */ > >> if (flags & POLLIN) > >> - /* > >> - * Defer the IRQ injection until later since we need to > >> - * acquire the kvm->lock to do so. > >> - */ > >> + /* An event has been signaled, inject an interrupt */ > >> schedule_work(&irqfd->inject); > >> > >> if (flags & POLLHUP) { > >> - /* > >> - * for now, just remove ourselves from the list and let > >> - * the rest dangle. We will fix this up later once > >> - * the races in eventfd are fixed > >> - */ > >> + /* The eventfd is closing, detach from KVM */ > >> + struct kvm *kvm = irqfd->kvm; > >> + unsigned long flags; > >> + > >> __remove_wait_queue(irqfd->wqh, &irqfd->wait); > >> - irqfd->wqh = NULL; > >> + > >> + spin_lock_irqsave(&kvm->irqfds.lock, flags); > >> + > >> + if (irqfd->active) { > >> + /* > >> + * If the item is still active we can be sure that > >> + * no-one else is trying to shutdown this object at > >> + * the same time. > >> + * > >> + * Defer the shutdown to a thread so we can flush > >> + * all remaining inject jobs. We use a slow-work > >> + * item to prevent a deadlock against the work-queue > >> + */ > >> + irqfd_deactivate(irqfd); > >> + slow_work_enqueue(&irqfd->shutdown); > >> > > > > Greg, in your patch for slow-work module removal, you write: > > "Callers must ensure that their module has at least > > one reference held while the work is enqueued." > > Where does this guarantee come from, in this case? > > > The general guarantee comes from the fact that modules naturally have to > have a reference to be able to call the enqueue function to begin with, > or the calling function was already racy. In this particular case, we > can guarantee that the kvm vm fd is held while our slow-work is active, > and all slow work is flushed before it is released. (I guess I am > assuming that VFS takes a module reference when an fd is opened, but I > have not verified that it actually does. If it doesn't, I suppose KVM > is already racy w.r.t. unloading, independent of my patches) > > -Greg > that could be the case, as we have, for example: static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, .compat_ioctl = kvm_vm_ioctl, .mmap = kvm_vm_mmap, }; with no owner field. Avi, shouldn't we initialize the owner field to prevent kvm module from going away while files are open?
On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote: > On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote: > > On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: > > > Michael S. Tsirkin wrote: > > > > On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > > > > > > > >> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a > > > >> "release" callback. This lets eventfd clients know if the eventfd is about > > > >> to go away and is very useful particularly for in-kernel clients. However, > > > >> until recently it is not possible to use this feature of eventfd in a > > > >> race-free way. This patch utilizes a new eventfd interface to rectify > > > >> the problem. > > > >> > > > >> Note that one final race is known to exist: the slow-work thread may race > > > >> with module removal. We are currently working with slow-work upstream > > > >> to fix this issue as well. Since the code prior to this patch also > > > >> races with module_put(), we are not making anything worse, but rather > > > >> shifting the cause of the race. Once the slow-work code is patched we > > > >> will be fixing the last remaining issue. > > > >> > > > > > > > > By the way, why are we using slow-work here? Wouldn't a regular > > > > workqueue do just as well, with less code, and avoid the race? > > > > > > > > > > > I believe it will cause a problem if you do a "flush_work()" from inside > > > a work-item. I could be wrong, of course, but it looks like a recipe to > > > deadlock. > > > > > > -Greg > > > > > > > Sure, but the idea is to only flush on kvm close, never from work item. > > To clarify, you don't flush slow works from a work-item, > so you shouldn't need to flush workqueue either. I guess my question is - why is slow work different? It's still a thread pool underneath ...
On 06/28/2009 04:18 PM, Michael S. Tsirkin wrote: > > that could be the case, as we have, for example: > > static struct file_operations kvm_vm_fops = { > .release = kvm_vm_release, > .unlocked_ioctl = kvm_vm_ioctl, > .compat_ioctl = kvm_vm_ioctl, > .mmap = kvm_vm_mmap, > }; > > with no owner field. > > Avi, shouldn't we initialize the owner field to prevent > kvm module from going away while files are open? > We do initialize it: kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; The reason it's not done through the initializer is that we set the owner to the vendor module (e.g. kvm-intel.ko) so that you can't remove the vendor module when a guest is running.
Michael S. Tsirkin wrote: > On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: > >> Michael S. Tsirkin wrote: >> >>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: >>> >>> >>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a >>>> "release" callback. This lets eventfd clients know if the eventfd is about >>>> to go away and is very useful particularly for in-kernel clients. However, >>>> until recently it is not possible to use this feature of eventfd in a >>>> race-free way. This patch utilizes a new eventfd interface to rectify >>>> the problem. >>>> >>>> Note that one final race is known to exist: the slow-work thread may race >>>> with module removal. We are currently working with slow-work upstream >>>> to fix this issue as well. Since the code prior to this patch also >>>> races with module_put(), we are not making anything worse, but rather >>>> shifting the cause of the race. Once the slow-work code is patched we >>>> will be fixing the last remaining issue. >>>> >>>> >>> By the way, why are we using slow-work here? Wouldn't a regular >>> workqueue do just as well, with less code, and avoid the race? >>> >>> >>> >> I believe it will cause a problem if you do a "flush_work()" from inside >> a work-item. I could be wrong, of course, but it looks like a recipe to >> deadlock. >> >> -Greg >> >> > > Sure, but the idea is to only flush on kvm close, never from work item. > > The point of the flush on the eventfd side is to make sure we synchronize with outstanding injects before we free the irqfd. -Greg
Michael S. Tsirkin wrote: > On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote: > >> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote: >> >>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: >>> >>>> Michael S. Tsirkin wrote: >>>> >>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: >>>>> >>>>> >>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a >>>>>> "release" callback. This lets eventfd clients know if the eventfd is about >>>>>> to go away and is very useful particularly for in-kernel clients. However, >>>>>> until recently it is not possible to use this feature of eventfd in a >>>>>> race-free way. This patch utilizes a new eventfd interface to rectify >>>>>> the problem. >>>>>> >>>>>> Note that one final race is known to exist: the slow-work thread may race >>>>>> with module removal. We are currently working with slow-work upstream >>>>>> to fix this issue as well. Since the code prior to this patch also >>>>>> races with module_put(), we are not making anything worse, but rather >>>>>> shifting the cause of the race. Once the slow-work code is patched we >>>>>> will be fixing the last remaining issue. >>>>>> >>>>>> >>>>> By the way, why are we using slow-work here? Wouldn't a regular >>>>> workqueue do just as well, with less code, and avoid the race? >>>>> >>>>> >>>>> >>>> I believe it will cause a problem if you do a "flush_work()" from inside >>>> a work-item. I could be wrong, of course, but it looks like a recipe to >>>> deadlock. >>>> >>>> -Greg >>>> >>>> >>> Sure, but the idea is to only flush on kvm close, never from work item. >>> >> To clarify, you don't flush slow works from a work-item, >> so you shouldn't need to flush workqueue either. >> > > I guess my question is - why is slow work different? It's still > a thread pool underneath ... > > Its not interdependent. Flush-work blocks the thread..if the thread happens to be the work-queue thread you may deadlock preventing it from processing further jobs like the inject. In reality it shouldnt be possible, but its just a bad idea to assume its ok. Slow work, on the other hand, will just make a new thread. -Greg
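(To make the concern concrete, a hypothetical sketch, not code from this patch: irqfd_inject() is queued with schedule_work(), i.e. onto the shared kernel workqueue. If the shutdown were run as a plain work item on that same queue, the handler could end up waiting for an inject item that can only run after the handler itself returns.)

/*
 * HYPOTHETICAL anti-pattern, for illustration only; 'shutdown_work' is an
 * invented field name, not part of the posted patch.
 */
static void irqfd_shutdown_bad(struct work_struct *work)
{
	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown_work);

	/*
	 * If this handler runs on the same keventd thread that
	 * irqfd->inject is queued on, flush_work() waits for a work item
	 * the thread cannot execute until we return: a self-deadlock.
	 */
	flush_work(&irqfd->inject);
	irqfd_release(irqfd);
}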
On Sun, Jun 28, 2009 at 12:28:19PM -0400, Gregory Haskins wrote: > Michael S. Tsirkin wrote: > > On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote: > > > >> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote: > >> > >>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: > >>> > >>>> Michael S. Tsirkin wrote: > >>>> > >>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > >>>>> > >>>>> > >>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a > >>>>>> "release" callback. This lets eventfd clients know if the eventfd is about > >>>>>> to go away and is very useful particularly for in-kernel clients. However, > >>>>>> until recently it is not possible to use this feature of eventfd in a > >>>>>> race-free way. This patch utilizes a new eventfd interface to rectify > >>>>>> the problem. > >>>>>> > >>>>>> Note that one final race is known to exist: the slow-work thread may race > >>>>>> with module removal. We are currently working with slow-work upstream > >>>>>> to fix this issue as well. Since the code prior to this patch also > >>>>>> races with module_put(), we are not making anything worse, but rather > >>>>>> shifting the cause of the race. Once the slow-work code is patched we > >>>>>> will be fixing the last remaining issue. > >>>>>> > >>>>>> > >>>>> By the way, why are we using slow-work here? Wouldn't a regular > >>>>> workqueue do just as well, with less code, and avoid the race? > >>>>> > >>>>> > >>>>> > >>>> I believe it will cause a problem if you do a "flush_work()" from inside > >>>> a work-item. I could be wrong, of course, but it looks like a recipe to > >>>> deadlock. > >>>> > >>>> -Greg > >>>> > >>>> > >>> Sure, but the idea is to only flush on kvm close, never from work item. > >>> > >> To clarify, you don't flush slow works from a work-item, > >> so you shouldn't need to flush workqueue either. > >> > > > > I guess my question is - why is slow work different? It's still > > a thread pool underneath ... > > > > > Its not interdependent. Flush-work blocks the thread..if the thread > happens to be the work-queue thread you may deadlock preventing it from > processing further jobs like the inject. In reality it shouldnt be > possible, but its just a bad idea to assume its ok. > Slow work, on the > other hand, will just make a new thread. > > -Greg > But if you create your own workqueue, and all you do there is destroy irqfds, things are ok I think. Right? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin wrote: > On Sun, Jun 28, 2009 at 12:28:19PM -0400, Gregory Haskins wrote: > >> Michael S. Tsirkin wrote: >> >>> On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote: >>> >>> >>>> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote: >>>> >>>> >>>>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: >>>>> >>>>> >>>>>> Michael S. Tsirkin wrote: >>>>>> >>>>>> >>>>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: >>>>>>> >>>>>>> >>>>>>> >>>>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a >>>>>>>> "release" callback. This lets eventfd clients know if the eventfd is about >>>>>>>> to go away and is very useful particularly for in-kernel clients. However, >>>>>>>> until recently it is not possible to use this feature of eventfd in a >>>>>>>> race-free way. This patch utilizes a new eventfd interface to rectify >>>>>>>> the problem. >>>>>>>> >>>>>>>> Note that one final race is known to exist: the slow-work thread may race >>>>>>>> with module removal. We are currently working with slow-work upstream >>>>>>>> to fix this issue as well. Since the code prior to this patch also >>>>>>>> races with module_put(), we are not making anything worse, but rather >>>>>>>> shifting the cause of the race. Once the slow-work code is patched we >>>>>>>> will be fixing the last remaining issue. >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>> By the way, why are we using slow-work here? Wouldn't a regular >>>>>>> workqueue do just as well, with less code, and avoid the race? >>>>>>> >>>>>>> >>>>>>> >>>>>>> >>>>>> I believe it will cause a problem if you do a "flush_work()" from inside >>>>>> a work-item. I could be wrong, of course, but it looks like a recipe to >>>>>> deadlock. >>>>>> >>>>>> -Greg >>>>>> >>>>>> >>>>>> >>>>> Sure, but the idea is to only flush on kvm close, never from work item. >>>>> >>>>> >>>> To clarify, you don't flush slow works from a work-item, >>>> so you shouldn't need to flush workqueue either. >>>> >>>> >>> I guess my question is - why is slow work different? It's still >>> a thread pool underneath ... >>> >>> >>> >> Its not interdependent. Flush-work blocks the thread..if the thread >> happens to be the work-queue thread you may deadlock preventing it from >> processing further jobs like the inject. In reality it shouldnt be >> possible, but its just a bad idea to assume its ok. >> Slow work, on the >> other hand, will just make a new thread. >> >> -Greg >> >> > > But if you create your own workqueue, and all you do there is destroy > irqfds, things are ok I think. Right? > Yep, creating your own queue works too. I picked slow-work as an alternate to generating a dedicated resource, but I agree either method would work fine. Do you have a preference? Regards, -Greg > > >
On Sun, Jun 28, 2009 at 03:54:42PM -0400, Gregory Haskins wrote: > Michael S. Tsirkin wrote: > > On Sun, Jun 28, 2009 at 12:28:19PM -0400, Gregory Haskins wrote: > > > >> Michael S. Tsirkin wrote: > >> > >>> On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote: > >>> > >>> > >>>> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote: > >>>> > >>>> > >>>>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: > >>>>> > >>>>> > >>>>>> Michael S. Tsirkin wrote: > >>>>>> > >>>>>> > >>>>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: > >>>>>>> > >>>>>>> > >>>>>>> > >>>>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a > >>>>>>>> "release" callback. This lets eventfd clients know if the eventfd is about > >>>>>>>> to go away and is very useful particularly for in-kernel clients. However, > >>>>>>>> until recently it is not possible to use this feature of eventfd in a > >>>>>>>> race-free way. This patch utilizes a new eventfd interface to rectify > >>>>>>>> the problem. > >>>>>>>> > >>>>>>>> Note that one final race is known to exist: the slow-work thread may race > >>>>>>>> with module removal. We are currently working with slow-work upstream > >>>>>>>> to fix this issue as well. Since the code prior to this patch also > >>>>>>>> races with module_put(), we are not making anything worse, but rather > >>>>>>>> shifting the cause of the race. Once the slow-work code is patched we > >>>>>>>> will be fixing the last remaining issue. > >>>>>>>> > >>>>>>>> > >>>>>>>> > >>>>>>> By the way, why are we using slow-work here? Wouldn't a regular > >>>>>>> workqueue do just as well, with less code, and avoid the race? > >>>>>>> > >>>>>>> > >>>>>>> > >>>>>>> > >>>>>> I believe it will cause a problem if you do a "flush_work()" from inside > >>>>>> a work-item. I could be wrong, of course, but it looks like a recipe to > >>>>>> deadlock. > >>>>>> > >>>>>> -Greg > >>>>>> > >>>>>> > >>>>>> > >>>>> Sure, but the idea is to only flush on kvm close, never from work item. > >>>>> > >>>>> > >>>> To clarify, you don't flush slow works from a work-item, > >>>> so you shouldn't need to flush workqueue either. > >>>> > >>>> > >>> I guess my question is - why is slow work different? It's still > >>> a thread pool underneath ... > >>> > >>> > >>> > >> Its not interdependent. Flush-work blocks the thread..if the thread > >> happens to be the work-queue thread you may deadlock preventing it from > >> processing further jobs like the inject. In reality it shouldnt be > >> possible, but its just a bad idea to assume its ok. > >> Slow work, on the > >> other hand, will just make a new thread. > >> > >> -Greg > >> > >> > > > > But if you create your own workqueue, and all you do there is destroy > > irqfds, things are ok I think. Right? > > > > Yep, creating your own queue works too. I picked slow-work as an > alternate to generating a dedicated resource, but I agree either method > would work fine. Do you have a preference? > > Regards, > -Greg It's not something I lose sleep about, but I think workqueue might be less code: for example, you can just flush it instead of using your own counter. And possibly things can be further simplified by making the workqueue single-threaded and always doing deassign from there.
Michael S. Tsirkin wrote: > On Sun, Jun 28, 2009 at 03:54:42PM -0400, Gregory Haskins wrote: > >> Michael S. Tsirkin wrote: >> >>> On Sun, Jun 28, 2009 at 12:28:19PM -0400, Gregory Haskins wrote: >>> >>> >>>> Michael S. Tsirkin wrote: >>>> >>>> >>>>> On Sun, Jun 28, 2009 at 03:57:30PM +0300, Michael S. Tsirkin wrote: >>>>> >>>>> >>>>> >>>>>> On Sun, Jun 28, 2009 at 03:56:12PM +0300, Michael S. Tsirkin wrote: >>>>>> >>>>>> >>>>>> >>>>>>> On Sun, Jun 28, 2009 at 08:53:22AM -0400, Gregory Haskins wrote: >>>>>>> >>>>>>> >>>>>>> >>>>>>>> Michael S. Tsirkin wrote: >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>>> On Thu, Jun 25, 2009 at 09:28:27AM -0400, Gregory Haskins wrote: >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>>> eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a >>>>>>>>>> "release" callback. This lets eventfd clients know if the eventfd is about >>>>>>>>>> to go away and is very useful particularly for in-kernel clients. However, >>>>>>>>>> until recently it is not possible to use this feature of eventfd in a >>>>>>>>>> race-free way. This patch utilizes a new eventfd interface to rectify >>>>>>>>>> the problem. >>>>>>>>>> >>>>>>>>>> Note that one final race is known to exist: the slow-work thread may race >>>>>>>>>> with module removal. We are currently working with slow-work upstream >>>>>>>>>> to fix this issue as well. Since the code prior to this patch also >>>>>>>>>> races with module_put(), we are not making anything worse, but rather >>>>>>>>>> shifting the cause of the race. Once the slow-work code is patched we >>>>>>>>>> will be fixing the last remaining issue. >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>> By the way, why are we using slow-work here? Wouldn't a regular >>>>>>>>> workqueue do just as well, with less code, and avoid the race? >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>> I believe it will cause a problem if you do a "flush_work()" from inside >>>>>>>> a work-item. I could be wrong, of course, but it looks like a recipe to >>>>>>>> deadlock. >>>>>>>> >>>>>>>> -Greg >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>> Sure, but the idea is to only flush on kvm close, never from work item. >>>>>>> >>>>>>> >>>>>>> >>>>>> To clarify, you don't flush slow works from a work-item, >>>>>> so you shouldn't need to flush workqueue either. >>>>>> >>>>>> >>>>>> >>>>> I guess my question is - why is slow work different? It's still >>>>> a thread pool underneath ... >>>>> >>>>> >>>>> >>>>> >>>> Its not interdependent. Flush-work blocks the thread..if the thread >>>> happens to be the work-queue thread you may deadlock preventing it from >>>> processing further jobs like the inject. In reality it shouldnt be >>>> possible, but its just a bad idea to assume its ok. >>>> Slow work, on the >>>> other hand, will just make a new thread. >>>> >>>> -Greg >>>> >>>> >>>> >>> But if you create your own workqueue, and all you do there is destroy >>> irqfds, things are ok I think. Right? >>> >>> >> Yep, creating your own queue works too. I picked slow-work as an >> alternate to generating a dedicated resource, but I agree either method >> would work fine. Do you have a preference? >> >> Regards, >> -Greg >> > > It's not something I lose sleep about, but I think workqueue might be > less code: for example, you can just flush it instead of using your own > counter. And possibly things can be further simplified by making the > workqueue single-threaded and always doing deassign from there. > > Yeah, its not a huge deal. 
The logic is probably slightly simpler with a dedicated single-thread queue for shutdown, at the expense of having the work-queue hang around mostly idle. I'll code it up both ways so we can compare. Thanks Michael, -Greg
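(For comparison, a rough sketch of the dedicated single-thread-queue variant discussed above. Assumptions, not taken from any posted patch: the irqfd 'shutdown' member becomes a work_struct, 'irqfd_cleanup_wq' is an invented name for a module-level cleanup queue, and the placement of the init/teardown calls is left open.)

#include <linux/workqueue.h>

static struct workqueue_struct *irqfd_cleanup_wq;

static void
irqfd_shutdown(struct work_struct *work)
{
	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);

	/*
	 * Runs on irqfd_cleanup_wq, never on the queue that executes
	 * irqfd->inject, so flushing the inject work here cannot deadlock.
	 */
	flush_work(&irqfd->inject);
	irqfd_release(irqfd);
}

static int irqfd_cleanup_init(void)
{
	irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
	return irqfd_cleanup_wq ? 0 : -ENOMEM;
}

static void irqfd_cleanup_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}

The POLLHUP handler would then call queue_work(irqfd_cleanup_wq, &irqfd->shutdown) where this patch calls slow_work_enqueue(), and kvm_irqfd_release() could simply flush_workqueue(irqfd_cleanup_wq) after popping the list, replacing the outstanding counter and wait queue entirely.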
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2451f48..d94ee72 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -141,7 +141,12 @@ struct kvm { struct kvm_io_bus mmio_bus; struct kvm_io_bus pio_bus; #ifdef CONFIG_HAVE_KVM_EVENTFD - struct list_head irqfds; + struct { + spinlock_t lock; + struct list_head items; + atomic_t outstanding; + wait_queue_head_t wqh; + } irqfds; #endif struct kvm_vm_stat stat; struct kvm_arch arch; diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index daece36..ab7848a 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -9,6 +9,7 @@ config HAVE_KVM_IRQCHIP config HAVE_KVM_EVENTFD bool select EVENTFD + select SLOW_WORK config KVM_APIC_ARCHITECTURE bool diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 9656027..ca21e8a 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -28,6 +28,7 @@ #include <linux/file.h> #include <linux/list.h> #include <linux/eventfd.h> +#include <linux/slow-work.h> /* * -------------------------------------------------------------------- @@ -36,17 +37,36 @@ * Credit goes to Avi Kivity for the original idea. * -------------------------------------------------------------------- */ + struct _irqfd { struct kvm *kvm; + struct eventfd_ctx *eventfd; int gsi; struct list_head list; poll_table pt; wait_queue_head_t *wqh; wait_queue_t wait; struct work_struct inject; + struct slow_work shutdown; + int active:1; }; static void +irqfd_release(struct _irqfd *irqfd) +{ + eventfd_ctx_put(irqfd->eventfd); + kfree(irqfd); +} + +/* assumes kvm->irqfds.lock is held */ +static void +irqfd_deactivate(struct _irqfd *irqfd) +{ + irqfd->active = false; + list_del(&irqfd->list); +} + +static void irqfd_inject(struct work_struct *work) { struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); @@ -58,6 +78,55 @@ irqfd_inject(struct work_struct *work) mutex_unlock(&kvm->irq_lock); } +static struct _irqfd * +work_to_irqfd(struct slow_work *work) +{ + return container_of(work, struct _irqfd, shutdown); +} + +static int +irqfd_shutdown_get_ref(struct slow_work *work) +{ + struct _irqfd *irqfd = work_to_irqfd(work); + struct kvm *kvm = irqfd->kvm; + + atomic_inc(&kvm->irqfds.outstanding); + + return 0; +} + +static void +irqfd_shutdown_put_ref(struct slow_work *work) +{ + struct _irqfd *irqfd = work_to_irqfd(work); + struct kvm *kvm = irqfd->kvm; + + irqfd_release(irqfd); + + if (atomic_dec_and_test(&kvm->irqfds.outstanding)) + wake_up(&kvm->irqfds.wqh); +} + +static void +irqfd_shutdown_execute(struct slow_work *work) +{ + struct _irqfd *irqfd = work_to_irqfd(work); + + /* + * Ensure there are no outstanding "inject" work-items before we blow + * away our state. Once this job completes, the slow_work + * infrastructure will drop the irqfd object completely via put_ref + */ + flush_work(&irqfd->inject); +} + +const static struct slow_work_ops irqfd_shutdown_work_ops = { + .get_ref = irqfd_shutdown_get_ref, + .put_ref = irqfd_shutdown_put_ref, + .execute = irqfd_shutdown_execute, +}; + + static int irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { @@ -65,25 +134,39 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) unsigned long flags = (unsigned long)key; /* - * Assume we will be called with interrupts disabled + * Called with interrupts disabled */ if (flags & POLLIN) - /* - * Defer the IRQ injection until later since we need to - * acquire the kvm->lock to do so. 
- */ + /* An event has been signaled, inject an interrupt */ schedule_work(&irqfd->inject); if (flags & POLLHUP) { - /* - * for now, just remove ourselves from the list and let - * the rest dangle. We will fix this up later once - * the races in eventfd are fixed - */ + /* The eventfd is closing, detach from KVM */ + struct kvm *kvm = irqfd->kvm; + unsigned long flags; + __remove_wait_queue(irqfd->wqh, &irqfd->wait); - irqfd->wqh = NULL; + + spin_lock_irqsave(&kvm->irqfds.lock, flags); + + if (irqfd->active) { + /* + * If the item is still active we can be sure that + * no-one else is trying to shutdown this object at + * the same time. + * + * Defer the shutdown to a thread so we can flush + * all remaining inject jobs. We use a slow-work + * item to prevent a deadlock against the work-queue + */ + irqfd_deactivate(irqfd); + slow_work_enqueue(&irqfd->shutdown); + } + + spin_unlock_irqrestore(&kvm->irqfds.lock, flags); } + return 0; } @@ -102,6 +185,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) { struct _irqfd *irqfd; struct file *file = NULL; + struct eventfd_ctx *eventfd = NULL; int ret; unsigned int events; @@ -113,6 +197,8 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) irqfd->gsi = gsi; INIT_LIST_HEAD(&irqfd->list); INIT_WORK(&irqfd->inject, irqfd_inject); + slow_work_init(&irqfd->shutdown, &irqfd_shutdown_work_ops); + irqfd->active = true; file = eventfd_fget(fd); if (IS_ERR(file)) { @@ -129,12 +215,21 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) events = file->f_op->poll(file, &irqfd->pt); - mutex_lock(&kvm->lock); - list_add_tail(&irqfd->list, &kvm->irqfds); - mutex_unlock(&kvm->lock); + spin_lock_irq(&kvm->irqfds.lock); + list_add_tail(&irqfd->list, &kvm->irqfds.items); + spin_unlock_irq(&kvm->irqfds.lock); + + eventfd = eventfd_ctx_fileget(file); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + irqfd->eventfd = eventfd; /* - * Check if there was an event already queued + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. */ if (events & POLLIN) schedule_work(&irqfd->inject); @@ -148,6 +243,9 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) return 0; fail: + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + if (file && !IS_ERR(file)) fput(file); @@ -158,24 +256,71 @@ fail: void kvm_irqfd_init(struct kvm *kvm) { - INIT_LIST_HEAD(&kvm->irqfds); + slow_work_register_user(); + + spin_lock_init(&kvm->irqfds.lock); + INIT_LIST_HEAD(&kvm->irqfds.items); + atomic_set(&kvm->irqfds.outstanding, 0); + init_waitqueue_head(&kvm->irqfds.wqh); +} + +static struct _irqfd * +irqfd_pop(struct kvm *kvm) +{ + struct _irqfd *irqfd = NULL; + + spin_lock_irq(&kvm->irqfds.lock); + + if (!list_empty(&kvm->irqfds.items)) { + irqfd = list_first_entry(&kvm->irqfds.items, + struct _irqfd, list); + irqfd_deactivate(irqfd); + } + + spin_unlock_irq(&kvm->irqfds.lock); + + return irqfd; +} + +/* + * locally releases the irqfd + * + * This function is called when KVM won the race with eventfd (signalled by + * finding the item active on the kvm->irqfds.item list). We are now guaranteed + * that we will never schedule a deferred shutdown task against this object, + * so we take steps to perform the shutdown ourselves. 
+ * + * 1) We must remove ourselves from the wait-queue to prevent further events, + * which will simultaneously act to sync us with eventfd (via wqh->lock) + * 2) Flush any outstanding inject-tasks to ensure its safe to free memory + * 3) Delete the object + */ +static void +irqfd_shutdown(struct _irqfd *irqfd) +{ + remove_wait_queue(irqfd->wqh, &irqfd->wait); + flush_work(&irqfd->inject); + irqfd_release(irqfd); } void kvm_irqfd_release(struct kvm *kvm) { - struct _irqfd *irqfd, *tmp; - - list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) { - if (irqfd->wqh) - remove_wait_queue(irqfd->wqh, &irqfd->wait); + struct _irqfd *irqfd; - flush_work(&irqfd->inject); + /* + * Shutdown all irqfds that still remain + */ + while ((irqfd = irqfd_pop(kvm))) + irqfd_shutdown(irqfd); - mutex_lock(&kvm->lock); - list_del(&irqfd->list); - mutex_unlock(&kvm->lock); + /* + * irqfds.outstanding tracks the number of outstanding "shutdown" + * jobs pending at any given time. Once we get here, we know that + * no more jobs will get scheduled, so go ahead and block until all + * of them complete + */ + wait_event(kvm->irqfds.wqh, (!atomic_read(&kvm->irqfds.outstanding))); - kfree(irqfd); - } + slow_work_unregister_user(); }
eventfd currently emits a POLLHUP wakeup on f_ops->release() to generate a "release" callback. This lets eventfd clients know that the eventfd is about to go away, and is particularly useful for in-kernel clients. However, until recently it was not possible to use this feature of eventfd in a race-free way. This patch utilizes a new eventfd interface to rectify the problem.

Note that one final race is known to exist: the slow-work thread may race with module removal. We are currently working with slow-work upstream to fix this issue as well. Since the code prior to this patch also races with module_put(), we are not making anything worse, but rather shifting the cause of the race. Once the slow-work code is patched we will be fixing the last remaining issue.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 include/linux/kvm_host.h |    7 +-
 virt/kvm/Kconfig         |    1
 virt/kvm/eventfd.c       |  199 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 179 insertions(+), 28 deletions(-)