diff mbox

[v7,2/3] kvm: add support for irqfd via eventfd-notification interface

Message ID 20090512182655.26131.53824.stgit@dev.haskins.net (mailing list archive)
State New, archived
Headers show

Commit Message

Gregory Haskins May 12, 2009, 6:26 p.m. UTC
KVM provides a complete virtual system environment for guests, including
support for injecting interrupts modeled after the real exception/interrupt
facilities present on the native platform (such as the IDT on x86).
Virtual interrupts can come from a variety of sources (emulated devices,
pass-through devices, etc) but all must be injected to the guest via
the KVM infrastructure.  This patch adds a new mechanism to inject a specific
interrupt to a guest using a decoupled eventfd mechanism:  Any legal signal
on the irqfd (using eventfd semantics from either userspace or kernel) will
translate into an injected interrupt in the guest at the next available
interrupt window.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 arch/x86/kvm/Makefile    |    2 
 arch/x86/kvm/x86.c       |    1 
 include/linux/kvm.h      |   10 ++
 include/linux/kvm_host.h |    5 +
 virt/kvm/eventfd.c       |  187 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c      |   20 +++++
 6 files changed, 224 insertions(+), 1 deletions(-)
 create mode 100644 virt/kvm/eventfd.c


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Avi Kivity May 14, 2009, 9:47 a.m. UTC | #1
Gregory Haskins wrote:
> KVM provides a complete virtual system environment for guests, including
> support for injecting interrupts modeled after the real exception/interrupt
> facilities present on the native platform (such as the IDT on x86).
> Virtual interrupts can come from a variety of sources (emulated devices,
> pass-through devices, etc) but all must be injected to the guest via
> the KVM infrastructure.  This patch adds a new mechanism to inject a specific
> interrupt to a guest using a decoupled eventfd mechnanism:  Any legal signal
> on the irqfd (using eventfd semantics from either userspace or kernel) will
> translate into an injected interrupt in the guest at the next available
> interrupt window.
>   

>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 3db5d8d..dfc4bcc 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -415,6 +415,7 @@ struct kvm_trace_rec {
>  #define KVM_CAP_ASSIGN_DEV_IRQ 29
>  /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
>  #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
> +#define KVM_CAP_EVENTFD 31
>   

Let's keep a fine granularity and call it IRQFD.

> +
> +int
> +kvm_deassign_irqfd(struct kvm *kvm, int fd)
> +{
> +	struct _irqfd *irqfd, *tmp;
> +
> +	mutex_lock(&kvm->lock);
> +
> +	/*
> +	 * linear search isn't brilliant, but this should be a infrequent
> +	 * operation and the list should not grow very large
> +	 */
> +	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
> +		if (irqfd->fd != fd)
> +			continue;
>   

Please fget() the new fd and compare the filps; fds aren't meaningful in 
the kernel.  You can also drop _irqfd::fd.

It may also be useful to compare the gsi, this allows a 
"make-before-break" switchover:

- guest reroutes irq to a different gsi
- associate irqfd with new gsi
- disassociate irqfd from old gsi

> +
> +		irqfd_release(irqfd);
> +		mutex_unlock(&kvm->lock);
> +		return 0;
>   

Don't return, userspace may have multiple associations?
Avi Kivity May 14, 2009, 11:22 a.m. UTC | #2
Gregory Haskins wrote:
> KVM provides a complete virtual system environment for guests, including
> support for injecting interrupts modeled after the real exception/interrupt
> facilities present on the native platform (such as the IDT on x86).
> Virtual interrupts can come from a variety of sources (emulated devices,
> pass-through devices, etc) but all must be injected to the guest via
> the KVM infrastructure.  This patch adds a new mechanism to inject a specific
> interrupt to a guest using a decoupled eventfd mechnanism:  Any legal signal
> on the irqfd (using eventfd semantics from either userspace or kernel) will
> translate into an injected interrupt in the guest at the next available
> interrupt window.
>
> +
> +static void
> +irqfd_inject(struct work_struct *work)
> +{
> +	struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
> +	struct kvm *kvm = irqfd->kvm;
> +
>   


I think you need to ->read() from the irqfd, otherwise the count will 
never clear.

> +	mutex_lock(&kvm->lock);
> +	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> +	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> +	mutex_unlock(&kvm->lock);
> +}
>
Gregory Haskins May 14, 2009, 11:52 a.m. UTC | #3
Avi Kivity wrote:
> Gregory Haskins wrote:
>> KVM provides a complete virtual system environment for guests, including
>> support for injecting interrupts modeled after the real
>> exception/interrupt
>> facilities present on the native platform (such as the IDT on x86).
>> Virtual interrupts can come from a variety of sources (emulated devices,
>> pass-through devices, etc) but all must be injected to the guest via
>> the KVM infrastructure.  This patch adds a new mechanism to inject a
>> specific
>> interrupt to a guest using a decoupled eventfd mechnanism:  Any legal
>> signal
>> on the irqfd (using eventfd semantics from either userspace or
>> kernel) will
>> translate into an injected interrupt in the guest at the next available
>> interrupt window.
>>   
>
>>          r = 1;
>>          break;
>>      case KVM_CAP_COALESCED_MMIO:
>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>> index 3db5d8d..dfc4bcc 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -415,6 +415,7 @@ struct kvm_trace_rec {
>>  #define KVM_CAP_ASSIGN_DEV_IRQ 29
>>  /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
>>  #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
>> +#define KVM_CAP_EVENTFD 31
>>   
>
> Let's keep a fine granularity and call it IRQFD.

Yeah, the iofd stuff is still immature and is not likely to be ready at
the same time anyway.  The CAP bits are cheap enough as it is, so not
sure what I was thinking.  Will fix.

>
>> +
>> +int
>> +kvm_deassign_irqfd(struct kvm *kvm, int fd)
>> +{
>> +    struct _irqfd *irqfd, *tmp;
>> +
>> +    mutex_lock(&kvm->lock);
>> +
>> +    /*
>> +     * linear search isn't brilliant, but this should be a infrequent
>> +     * operation and the list should not grow very large
>> +     */
>> +    list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
>> +        if (irqfd->fd != fd)
>> +            continue;
>>   
>
> Please fget() the new fd and compare the filps; fds aren't meaningful
> in the kernel.  You can also drop _irqfd::fd.

I like this as a second option...

>
> It may also be useful to compare the gsi, this allows a
> "make-before-break" switchover:

...but I like this best.  Good idea.

>
> - guest reroutes irq to a different gsi
> - associate irqfd with new gsi
> - disassociate irqfd from old gsi
>
>> +
>> +        irqfd_release(irqfd);
>> +        mutex_unlock(&kvm->lock);
>> +        return 0;
>>   
>
> Don't return, userspace may have multiple associations?

Parse error.  Can you elaborate?

-Greg

>
>
Avi Kivity May 14, 2009, 12:20 p.m. UTC | #4
Gregory Haskins wrote:
>> Please fget() the new fd and compare the filps; fds aren't meaningful
>> in the kernel.  You can also drop _irqfd::fd.
>>     
>
> I like this as a second option...
>
>   
>> It may also be useful to compare the gsi, this allows a
>> "make-before-break" switchover:
>>     
>
> ...but I like this best.  Good idea.
>   

I thought of comparing both.

>> - guest reroutes irq to a different gsi
>> - associate irqfd with new gsi
>> - disassociate irqfd from old gsi
>>
>>     
>>> +
>>> +        irqfd_release(irqfd);
>>> +        mutex_unlock(&kvm->lock);
>>> +        return 0;
>>>   
>>>       
>> Don't return, userspace may have multiple associations?
>>     
>
> Parse error.  Can you elaborate?
>
>   

You break out of the loop when you match your irqfd.  But there may be 
multiple matches.

Granted, it doesn't make much sense to hook the same fd to the same gsi 
multiple times (it may make sense to hook multiple fds to a single gsi, 
or maybe a single fd to multiple gsis), but it pays to have a consistent 
do-what-I-said-even-if-it-doesn't-make-sense interface.
Gregory Haskins May 14, 2009, 1:12 p.m. UTC | #5
Avi Kivity wrote:
> Gregory Haskins wrote:
>>> Please fget() the new fd and compare the filps; fds aren't meaningful
>>> in the kernel.  You can also drop _irqfd::fd.
>>>     
>>
>> I like this as a second option...
>>
>>  
>>> It may also be useful to compare the gsi, this allows a
>>> "make-before-break" switchover:
>>>     
>>
>> ...but I like this best.  Good idea.
>>   
>
> I thought of comparing both.

Ah, ok.  I misunderstood.  We can do that.
>
>>> - guest reroutes irq to a different gsi
>>> - associate irqfd with new gsi
>>> - disassociate irqfd from old gsi
>>>
>>>    
>>>> +
>>>> +        irqfd_release(irqfd);
>>>> +        mutex_unlock(&kvm->lock);
>>>> +        return 0;
>>>>         
>>> Don't return, userspace may have multiple associations?
>>>     
>>
>> Parse error.  Can you elaborate?
>>
>>   
>
> You break out of the look when you match your irqfd.  But there may be
> multiple matches.
>
> Granted, it doesn't make much sense to hook the same fd to the same
> gsi multiple times (it may make sense to hook multiple fds to a single
> gsi, or maybe a single fd to multiple gsis), but it pays to have a
> consistent do-what-I-said-even-if-it-doesn't-make-sense interface.

Ack, will do.

-Greg
Gregory Haskins May 14, 2009, 3:52 p.m. UTC | #6
Avi Kivity wrote:
> Gregory Haskins wrote:
>> KVM provides a complete virtual system environment for guests, including
>> support for injecting interrupts modeled after the real
>> exception/interrupt
>> facilities present on the native platform (such as the IDT on x86).
>> Virtual interrupts can come from a variety of sources (emulated devices,
>> pass-through devices, etc) but all must be injected to the guest via
>> the KVM infrastructure.  This patch adds a new mechanism to inject a
>> specific
>> interrupt to a guest using a decoupled eventfd mechnanism:  Any legal
>> signal
>> on the irqfd (using eventfd semantics from either userspace or
>> kernel) will
>> translate into an injected interrupt in the guest at the next available
>> interrupt window.
>>
>> +
>> +static void
>> +irqfd_inject(struct work_struct *work)
>> +{
>> +    struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
>> +    struct kvm *kvm = irqfd->kvm;
>> +
>>   
>
>
> I think you need to ->read() from the irqfd, otherwise the count will
> never clear.

Yeah, and this is a disadvantage of using eventfd vs a custom anon-fd
implementation.

However, the count is really only there for deciding whether to sleep a
traditional eventfd recipient, which doesn't really apply in this
application.  I suppose we could try to invoke the read method (or add a
new method to eventfd to allow it to be cleared independently of
f_ops->read(), a la eventfd_signal() vs f_ops->write()).  I'm not
convinced we really need to worry about it, though.  IMO we can just let
the count accumulate.

But if you insist this loose end should be addressed, perhaps Davide has
some thoughts on how to best do this?

-Greg

>
>> +    mutex_lock(&kvm->lock);
>> +    kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
>> +    kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
>> +    mutex_unlock(&kvm->lock);
>> +}
>>   
>
Davide Libenzi May 15, 2009, 3:22 a.m. UTC | #7
On Thu, 14 May 2009, Gregory Haskins wrote:

> Avi Kivity wrote:
> > Gregory Haskins wrote:
> >> KVM provides a complete virtual system environment for guests, including
> >> support for injecting interrupts modeled after the real
> >> exception/interrupt
> >> facilities present on the native platform (such as the IDT on x86).
> >> Virtual interrupts can come from a variety of sources (emulated devices,
> >> pass-through devices, etc) but all must be injected to the guest via
> >> the KVM infrastructure.  This patch adds a new mechanism to inject a
> >> specific
> >> interrupt to a guest using a decoupled eventfd mechnanism:  Any legal
> >> signal
> >> on the irqfd (using eventfd semantics from either userspace or
> >> kernel) will
> >> translate into an injected interrupt in the guest at the next available
> >> interrupt window.
> >>
> >> +
> >> +static void
> >> +irqfd_inject(struct work_struct *work)
> >> +{
> >> +    struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
> >> +    struct kvm *kvm = irqfd->kvm;
> >> +
> >>   
> >
> >
> > I think you need to ->read() from the irqfd, otherwise the count will
> > never clear.
> 
> Yeah, and this is a disavantage to using eventfd vs a custom anon-fd
> implementation.
> 
> However, the count is really only there for deciding whether to sleep a
> traditional eventfd recipient which doesn't really apply in this
> application.  I suppose we could try to invoke the read method (or add a
> new method to eventfd to allow it to be cleared independent of the
> f_ops->read() (ala eventfd_signal() vs f_ops->write()).  I'm not
> convinced we really need to worry about it, though.  IMO we can just let
> the count accumulate.
> 
> But if you insist this loose end should be addressed, perhaps Davide has
> some thoughts on how to best do this?

The counter is 64-bit, so at 1M IRQ/s it will take about 585K years to 
saturate. But from a symmetry POV, it may be better to clear it. Maybe 
with a kernel-side eventfd_read()?


- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins May 15, 2009, 3:35 a.m. UTC | #8
Davide Libenzi wrote:
> On Thu, 14 May 2009, Gregory Haskins wrote:
>
>   
>> Avi Kivity wrote:
>>     
>>> Gregory Haskins wrote:
>>>       
>>>> KVM provides a complete virtual system environment for guests, including
>>>> support for injecting interrupts modeled after the real
>>>> exception/interrupt
>>>> facilities present on the native platform (such as the IDT on x86).
>>>> Virtual interrupts can come from a variety of sources (emulated devices,
>>>> pass-through devices, etc) but all must be injected to the guest via
>>>> the KVM infrastructure.  This patch adds a new mechanism to inject a
>>>> specific
>>>> interrupt to a guest using a decoupled eventfd mechnanism:  Any legal
>>>> signal
>>>> on the irqfd (using eventfd semantics from either userspace or
>>>> kernel) will
>>>> translate into an injected interrupt in the guest at the next available
>>>> interrupt window.
>>>>
>>>> +
>>>> +static void
>>>> +irqfd_inject(struct work_struct *work)
>>>> +{
>>>> +    struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
>>>> +    struct kvm *kvm = irqfd->kvm;
>>>> +
>>>>   
>>>>         
>>> I think you need to ->read() from the irqfd, otherwise the count will
>>> never clear.
>>>       
>> Yeah, and this is a disavantage to using eventfd vs a custom anon-fd
>> implementation.
>>
>> However, the count is really only there for deciding whether to sleep a
>> traditional eventfd recipient which doesn't really apply in this
>> application.  I suppose we could try to invoke the read method (or add a
>> new method to eventfd to allow it to be cleared independent of the
>> f_ops->read() (ala eventfd_signal() vs f_ops->write()).  I'm not
>> convinced we really need to worry about it, though.  IMO we can just let
>> the count accumulate.
>>
>> But if you insist this loose end should be addressed, perhaps Davide has
>> some thoughts on how to best do this?
>>     
>
> The counter is 64bit, so at 1M IRQ/s will take about 585K years to 
> saturate. But from a symmetry POV, it may be better to clear it. Maybe 
> with a kernel-side eventfd_read()?
>   
Hi Davide,

I think ultimately that would be the direction to go.  I will defer to
Avi, but I think we have reached consensus that while it's perhaps sloppy
to leave the counter untouched, we can back-burner this issue for now
and just let it accumulate indefinitely.  If it becomes an issue down
the road we can always fix it then.

Thanks,
-Greg
diff mbox

Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4ef..4d50904 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,7 +3,7 @@ 
 #
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-                coalesced_mmio.o irq_comm.o)
+                coalesced_mmio.o irq_comm.o eventfd.o)
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fd0a571..ba541f6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1026,6 +1026,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
 	case KVM_CAP_ASSIGN_DEV_IRQ:
+	case KVM_CAP_EVENTFD:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 3db5d8d..dfc4bcc 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -415,6 +415,7 @@  struct kvm_trace_rec {
 #define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#define KVM_CAP_EVENTFD 31
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -454,6 +455,13 @@  struct kvm_irq_routing {
 
 #endif
 
+struct kvm_irqfd {
+	__u32 fd;
+	__u32 gsi;
+	__u32 flags;
+	__u8  pad[20];
+};
+
 /*
  * ioctls for VM fds
  */
@@ -498,6 +506,8 @@  struct kvm_irq_routing {
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
 			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 #define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
+#define KVM_ASSIGN_IRQFD           _IOW(KVMIO, 0x76, struct kvm_irqfd)
+#define KVM_DEASSIGN_IRQFD         _IOW(KVMIO, 0x77, __u32)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2b8df0c..1acc528 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -134,6 +134,7 @@  struct kvm {
 	struct list_head vm_list;
 	struct kvm_io_bus mmio_bus;
 	struct kvm_io_bus pio_bus;
+	struct list_head irqfds;
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
 	atomic_t users_count;
@@ -525,4 +526,8 @@  static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 #endif
 
+int kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
+int kvm_deassign_irqfd(struct kvm *kvm, int fd);
+void kvm_irqfd_release(struct kvm *kvm);
+
 #endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
new file mode 100644
index 0000000..71afd62
--- /dev/null
+++ b/virt/kvm/eventfd.c
@@ -0,0 +1,187 @@ 
+/*
+ * kvm eventfd support - use eventfd objects to signal various KVM events
+ *
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ *	Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/workqueue.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/list.h>
+
+/*
+ * --------------------------------------------------------------------
+ * irqfd: Allows an fd to be used to inject an interrupt to the guest
+ *
+ * Credit goes to Avi Kivity for the original idea.
+ * --------------------------------------------------------------------
+ */
+struct _irqfd {
+	struct kvm               *kvm;
+	int                       gsi;
+	int                       fd;
+	struct file              *file;
+	struct list_head          list;
+	poll_table                pt;
+	wait_queue_head_t        *wqh;
+	wait_queue_t              wait;
+	struct work_struct        work;
+};
+
+static void
+irqfd_inject(struct work_struct *work)
+{
+	struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
+	struct kvm *kvm = irqfd->kvm;
+
+	mutex_lock(&kvm->lock);
+	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+	mutex_unlock(&kvm->lock);
+}
+
+static int
+irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
+
+	/*
+	 * The wake_up with interrupts disabled.  Therefore we need to defer
+	 * the IRQ injection until later since we need to acquire the
+	 * kvm->lock to do so.
+	 */
+	schedule_work(&irqfd->work);
+
+	return 0;
+}
+
+static void
+irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
+			poll_table *pt)
+{
+	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+
+	irqfd->wqh = wqh;
+	add_wait_queue(wqh, &irqfd->wait);
+}
+
+int
+kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+{
+	struct _irqfd *irqfd;
+	struct file *file = NULL;
+	int ret;
+
+	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+	if (!irqfd)
+		return -ENOMEM;
+
+	irqfd->kvm = kvm;
+	irqfd->gsi = gsi;
+	irqfd->fd  = fd;
+	INIT_LIST_HEAD(&irqfd->list);
+	INIT_WORK(&irqfd->work, irqfd_inject);
+
+	/*
+	 * Embed the file* lifetime in the irqfd.
+	 */
+	file = fget(fd);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto fail;
+	}
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via
+	 * a callback whenever someone signals the underlying eventfd
+	 */
+	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
+	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
+
+	ret = file->f_op->poll(file, &irqfd->pt);
+	if (ret < 0)
+		goto fail;
+
+	irqfd->file = file;
+
+	mutex_lock(&kvm->lock);
+	list_add_tail(&irqfd->list, &kvm->irqfds);
+	mutex_unlock(&kvm->lock);
+
+	return 0;
+
+fail:
+	if (irqfd->wqh)
+		remove_wait_queue(irqfd->wqh, &irqfd->wait);
+
+	if (file && !IS_ERR(file))
+		fput(file);
+
+	kfree(irqfd);
+	return ret;
+}
+
+static void
+irqfd_release(struct _irqfd *irqfd)
+{
+	remove_wait_queue(irqfd->wqh, &irqfd->wait);
+
+	flush_work(&irqfd->work);
+	fput(irqfd->file);
+
+	list_del(&irqfd->list);
+	kfree(irqfd);
+}
+
+int
+kvm_deassign_irqfd(struct kvm *kvm, int fd)
+{
+	struct _irqfd *irqfd, *tmp;
+
+	mutex_lock(&kvm->lock);
+
+	/*
+	 * linear search isn't brilliant, but this should be a infrequent
+	 * operation and the list should not grow very large
+	 */
+	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
+		if (irqfd->fd != fd)
+			continue;
+
+		irqfd_release(irqfd);
+		mutex_unlock(&kvm->lock);
+		return 0;
+	}
+	mutex_unlock(&kvm->lock);
+
+	return -ENOENT;
+}
+
+void
+kvm_irqfd_release(struct kvm *kvm)
+{
+	struct _irqfd *irqfd, *tmp;
+
+	/* don't bother with the lock..we are shutting down */
+	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list)
+		irqfd_release(irqfd);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4d00942..7aa9f0a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -983,6 +983,7 @@  static struct kvm *kvm_create_vm(void)
 	atomic_inc(&kvm->mm->mm_count);
 	spin_lock_init(&kvm->mmu_lock);
 	kvm_io_bus_init(&kvm->pio_bus);
+	INIT_LIST_HEAD(&kvm->irqfds);
 	mutex_init(&kvm->lock);
 	kvm_io_bus_init(&kvm->mmio_bus);
 	init_rwsem(&kvm->slots_lock);
@@ -1034,6 +1035,7 @@  static void kvm_destroy_vm(struct kvm *kvm)
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
+	kvm_irqfd_release(kvm);
 	kvm_free_irq_routing(kvm);
 	kvm_io_bus_destroy(&kvm->pio_bus);
 	kvm_io_bus_destroy(&kvm->mmio_bus);
@@ -2208,6 +2210,24 @@  static long kvm_vm_ioctl(struct file *filp,
 	}
 #endif
 #endif /* KVM_CAP_IRQ_ROUTING */
+	case KVM_ASSIGN_IRQFD: {
+		struct kvm_irqfd data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_assign_irqfd(kvm, data.fd, data.gsi, data.flags);
+		break;
+	}
+	case KVM_DEASSIGN_IRQFD: {
+		u32 data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_deassign_irqfd(kvm, data);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}