From patchwork Mon Jun 22 16:05:57 2009
X-Patchwork-Submitter: Gregory Haskins
X-Patchwork-Id: 31787
From: Gregory Haskins
Subject: [KVM PATCH v3 3/3] KVM: Fix races in irqfd using new eventfd_kref_get interface
To: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, mst@redhat.com, avi@redhat.com,
    paulmck@linux.vnet.ibm.com, davidel@xmailserver.org, mingo@elte.hu,
    rusty@rustcorp.com.au
Date: Mon, 22 Jun 2009 12:05:57 -0400
Message-ID: <20090622160556.22967.76273.stgit@dev.haskins.net>
In-Reply-To: <20090622155504.22967.50532.stgit@dev.haskins.net>
References: <20090622155504.22967.50532.stgit@dev.haskins.net>
User-Agent: StGIT/0.14.3

This patch fixes all known races in irqfd, and paves the way to restore
DEASSIGN support.  For details of the eventfd races, please see the patch
presumably committed immediately prior to this one.

In a nutshell, we use eventfd_kref_get/put() to properly manage the
lifetime of the underlying eventfd.  We also coordinate carefully with our
workqueue to ensure that all irqfd objects have terminated before we allow
kvm to shut down.  The shutdown logic walks all open irqfds and releases
them; it can be generalized in the future to release only a subset of the
irqfds, which is what restoring DEASSIGN support will require.

Signed-off-by: Gregory Haskins
---

[ Two illustrative sketches follow the diff: a compilable userspace
  model of the refcount/shutdown handshake, and a minimal userspace
  irqfd usage example. ]

 virt/kvm/eventfd.c |  144 ++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 110 insertions(+), 34 deletions(-)

diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 9656027..67985cd 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include

 /*
  * --------------------------------------------------------------------
@@ -36,26 +37,68 @@
  * Credit goes to Avi Kivity for the original idea.
  * --------------------------------------------------------------------
  */
+
+enum {
+        irqfd_flags_shutdown,
+};
+
 struct _irqfd {
         struct kvm               *kvm;
+        struct kref              *eventfd;
         int                       gsi;
         struct list_head          list;
         poll_table                pt;
         wait_queue_head_t        *wqh;
         wait_queue_t              wait;
-        struct work_struct        inject;
+        struct work_struct        work;
+        unsigned long             flags;
 };

 static void
-irqfd_inject(struct work_struct *work)
+irqfd_release(struct _irqfd *irqfd)
+{
+        eventfd_kref_put(irqfd->eventfd);
+        kfree(irqfd);
+}
+
+static void
+irqfd_work(struct work_struct *work)
 {
-        struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+        struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
         struct kvm *kvm = irqfd->kvm;

-        mutex_lock(&kvm->irq_lock);
-        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
-        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
-        mutex_unlock(&kvm->irq_lock);
+        if (!test_bit(irqfd_flags_shutdown, &irqfd->flags)) {
+                /* Inject an interrupt */
+                mutex_lock(&kvm->irq_lock);
+                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+                mutex_unlock(&kvm->irq_lock);
+        } else {
+                /* shutdown the irqfd */
+                struct _irqfd *_irqfd = NULL;
+
+                mutex_lock(&kvm->lock);
+
+                if (!list_empty(&irqfd->list))
+                        _irqfd = irqfd;
+
+                if (_irqfd)
+                        list_del(&_irqfd->list);
+
+                mutex_unlock(&kvm->lock);
+
+                /*
+                 * If the item is not currently on the irqfds list, we know
+                 * we are running concurrently with the KVM side trying to
+                 * remove this item as well.  Since the KVM side should be
+                 * holding the reference now, and will block behind a
+                 * flush_work(), let's just let them do the release() for us.
+                 */
+                if (!_irqfd)
+                        return;
+
+                irqfd_release(_irqfd);
+        }
 }

 static int
@@ -65,25 +108,20 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
         struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
         unsigned long flags = (unsigned long)key;

         /*
-         * Assume we will be called with interrupts disabled
+         * called with interrupts disabled
          */
-        if (flags & POLLIN)
-                /*
-                 * Defer the IRQ injection until later since we need to
-                 * acquire the kvm->lock to do so.
-                 */
-                schedule_work(&irqfd->inject);
-
         if (flags & POLLHUP) {
                 /*
-                 * for now, just remove ourselves from the list and let
-                 * the rest dangle.  We will fix this up later once
-                 * the races in eventfd are fixed
+                 * ordering is important: the shutdown flag must be
+                 * visible before we schedule the work
                  */
                 __remove_wait_queue(irqfd->wqh, &irqfd->wait);
-                irqfd->wqh = NULL;
+                set_bit(irqfd_flags_shutdown, &irqfd->flags);
         }

+        if (flags & (POLLHUP | POLLIN))
+                schedule_work(&irqfd->work);
+
         return 0;
 }

@@ -102,6 +140,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 {
         struct _irqfd *irqfd;
         struct file *file = NULL;
+        struct kref *kref = NULL;
         int ret;
         unsigned int events;

@@ -112,7 +151,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
         irqfd->kvm = kvm;
         irqfd->gsi = gsi;
         INIT_LIST_HEAD(&irqfd->list);
-        INIT_WORK(&irqfd->inject, irqfd_inject);
+        INIT_WORK(&irqfd->work, irqfd_work);

         file = eventfd_fget(fd);
         if (IS_ERR(file)) {
@@ -133,11 +172,13 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
         list_add_tail(&irqfd->list, &kvm->irqfds);
         mutex_unlock(&kvm->lock);

-        /*
-         * Check if there was an event already queued
-         */
-        if (events & POLLIN)
-                schedule_work(&irqfd->inject);
+        kref = eventfd_kref_get(file);
+        if (IS_ERR(kref)) {
+                ret = PTR_ERR(kref);
+                goto fail;
+        }
+
+        irqfd->eventfd = kref;

         /*
          * do not drop the file until the irqfd is fully initialized, otherwise
@@ -145,9 +186,18 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
          */
         fput(file);

+        /*
+         * Check if there was an event already queued
+         */
+        if (events & POLLIN)
+                schedule_work(&irqfd->work);
+
         return 0;

 fail:
+        if (kref && !IS_ERR(kref))
+                eventfd_kref_put(kref);
+
         if (file && !IS_ERR(file))
                 fput(file);

@@ -161,21 +211,47 @@ kvm_irqfd_init(struct kvm *kvm)
         INIT_LIST_HEAD(&kvm->irqfds);
 }

+static struct _irqfd *
+irqfd_pop(struct kvm *kvm)
+{
+        struct _irqfd *irqfd = NULL;
+
+        mutex_lock(&kvm->lock);
+
+        if (!list_empty(&kvm->irqfds)) {
+                irqfd = list_first_entry(&kvm->irqfds, struct _irqfd, list);
+                list_del(&irqfd->list);
+        }
+
+        mutex_unlock(&kvm->lock);
+
+        return irqfd;
+}
+
 void
 kvm_irqfd_release(struct kvm *kvm)
 {
-        struct _irqfd *irqfd, *tmp;
+        struct _irqfd *irqfd;

-        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
-                if (irqfd->wqh)
-                        remove_wait_queue(irqfd->wqh, &irqfd->wait);
+        while ((irqfd = irqfd_pop(kvm))) {

-                flush_work(&irqfd->inject);
+                remove_wait_queue(irqfd->wqh, &irqfd->wait);

-                mutex_lock(&kvm->lock);
-                list_del(&irqfd->list);
-                mutex_unlock(&kvm->lock);
+                /*
+                 * We guarantee there will be no more notifications after
+                 * remove_wait_queue() returns.  Now let's make sure we
+                 * synchronize behind any outstanding work items before
+                 * releasing the resources.
+                 */
+                flush_work(&irqfd->work);

-                kfree(irqfd);
+                irqfd_release(irqfd);
         }
+
+        /*
+         * We need to wait in case there are any outstanding work-items
+         * in flight that had already removed themselves from the list
+         * prior to entering this function.
+         */
+        flush_scheduled_work();
 }
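The handshake above is easier to see in isolation, so here is a
compilable userspace model of it (not part of the patch).  The atomics
and pthreads stand in for set_bit()/schedule_work()/flush_work(), and
every name in it (struct object, object_get(), worker(), ...) is
illustrative only, not a kernel API:

/* irqfd_model.c: userspace sketch of the shutdown handshake.
 * Build: gcc -pthread irqfd_model.c -o irqfd_model
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct object {
        atomic_int  refs;      /* stands in for the eventfd kref       */
        atomic_bool shutdown;  /* stands in for irqfd_flags_shutdown   */
};

static struct object *object_get(struct object *o)
{
        atomic_fetch_add(&o->refs, 1);
        return o;
}

static void object_put(struct object *o)
{
        /* the last put frees, like kref_put/eventfd_kref_put */
        if (atomic_fetch_sub(&o->refs, 1) == 1) {
                printf("final reference dropped: releasing\n");
                free(o);
        }
}

/* stands in for irqfd_work(): inject if live, tear down if not */
static void *worker(void *arg)
{
        struct object *o = arg;

        if (!atomic_load(&o->shutdown))
                printf("POLLIN path: inject interrupt\n");
        else
                printf("POLLHUP path: shut down\n");

        object_put(o);  /* drop the reference taken at schedule time */
        return NULL;
}

int main(void)
{
        struct object *o = calloc(1, sizeof(*o));
        pthread_t thr;

        atomic_store(&o->refs, 1);              /* our own reference */

        /* wakeup side: the flag must be visible before the work runs;
         * pthread_create() provides the needed ordering here */
        atomic_store(&o->shutdown, true);
        pthread_create(&thr, NULL, worker, object_get(o));

        pthread_join(thr, NULL);                /* ~ flush_work()     */
        object_put(o);                          /* ~ irqfd_release()  */
        return 0;
}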
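And for context on how this path gets exercised from userspace, a
minimal sketch follows.  It uses only the existing KVM_IRQFD ioctl and
struct kvm_irqfd from <linux/kvm.h>; error handling and the
KVM_CAP_IRQFD capability check are omitted for brevity, and the GSI
number is arbitrary:

/* irqfd_usage.c: minimal irqfd usage sketch (error handling omitted) */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        int vm  = ioctl(kvm, KVM_CREATE_VM, 0);
        int efd = eventfd(0, 0);
        uint64_t one = 1;
        struct kvm_irqfd assign;

        ioctl(vm, KVM_CREATE_IRQCHIP, 0);       /* in-kernel irqchip */

        memset(&assign, 0, sizeof(assign));
        assign.fd  = efd;
        assign.gsi = 10;                        /* arbitrary example GSI */
        ioctl(vm, KVM_IRQFD, &assign);          /* enters kvm_irqfd() above */

        /* each 8-byte write signals the eventfd, which schedules the
         * injection work for GSI 10 */
        write(efd, &one, sizeof(one));

        /* closing the eventfd raises POLLHUP; with this patch applied,
         * the irqfd tears itself down via the same workqueue */
        close(efd);

        close(vm);
        close(kvm);
        return 0;
}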