diff mbox

[v4,2/2] kvm: add support for irqfd via eventfd-notification interface

Message ID 20090504175750.26758.7023.stgit@dev.haskins.net (mailing list archive)
State New, archived
Headers show

Commit Message

Gregory Haskins May 4, 2009, 5:57 p.m. UTC
KVM provides a complete virtual system environment for guests, including
support for injecting interrupts modeled after the real exception/interrupt
facilities present on the native platform (such as the IDT on x86).
Virtual interrupts can come from a variety of sources (emulated devices,
pass-through devices, etc) but all must be injected to the guest via
the KVM infrastructure.  This patch adds a new mechanism to inject a specific
interrupt to a guest using a decoupled eventfd mechnanism:  Any legal signal
on the irqfd (using eventfd semantics from either userspace or kernel) will
translate into an injected interrupt in the guest at the next available
interrupt window.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 arch/x86/kvm/Makefile    |    2 -
 arch/x86/kvm/x86.c       |    1 
 include/linux/kvm.h      |    7 ++
 include/linux/kvm_host.h |    4 +
 virt/kvm/irqfd.c         |  159 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c      |   11 +++
 6 files changed, 183 insertions(+), 1 deletions(-)
 create mode 100644 virt/kvm/irqfd.c


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Avi Kivity May 5, 2009, 3:45 p.m. UTC | #1
Gregory Haskins wrote:
> KVM provides a complete virtual system environment for guests, including
> support for injecting interrupts modeled after the real exception/interrupt
> facilities present on the native platform (such as the IDT on x86).
> Virtual interrupts can come from a variety of sources (emulated devices,
> pass-through devices, etc) but all must be injected to the guest via
> the KVM infrastructure.  This patch adds a new mechanism to inject a specific
> interrupt to a guest using a decoupled eventfd mechnanism:  Any legal signal
> on the irqfd (using eventfd semantics from either userspace or kernel) will
> translate into an injected interrupt in the guest at the next available
> interrupt window.
>
>  
> +struct kvm_irqfd {
> +	__u32 gsi;
> +	__u32 flags;
> +};
> +
>   

Please add some reserved space here.

> +int
> +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
> +{
> +	struct _irqfd *irqfd;
> +	struct file *file = NULL;
> +	int fd = -1;
> +	int ret;
> +
> +	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
> +	if (!irqfd)
> +		return -ENOMEM;
> +
> +	irqfd->kvm = kvm;
>   

You need to increase the refcount on struct kvm here.  Otherwise evil 
userspace will create an irqfd, close the vm and vcpu fds, and inject an 
interrupt.

Otherwise, looks good.
Gregory Haskins May 5, 2009, 5:34 p.m. UTC | #2
Avi Kivity wrote:
> Gregory Haskins wrote:
>> KVM provides a complete virtual system environment for guests, including
>> support for injecting interrupts modeled after the real
>> exception/interrupt
>> facilities present on the native platform (such as the IDT on x86).
>> Virtual interrupts can come from a variety of sources (emulated devices,
>> pass-through devices, etc) but all must be injected to the guest via
>> the KVM infrastructure.  This patch adds a new mechanism to inject a
>> specific
>> interrupt to a guest using a decoupled eventfd mechnanism:  Any legal
>> signal
>> on the irqfd (using eventfd semantics from either userspace or
>> kernel) will
>> translate into an injected interrupt in the guest at the next available
>> interrupt window.
>>
>>  
>> +struct kvm_irqfd {
>> +    __u32 gsi;
>> +    __u32 flags;
>> +};
>> +
>>   
>
> Please add some reserved space here.

Ack.  Any rule of thumb here?  How about a "__u8 pad[16]" ?

>
>> +int
>> +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
>> +{
>> +    struct _irqfd *irqfd;
>> +    struct file *file = NULL;
>> +    int fd = -1;
>> +    int ret;
>> +
>> +    irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
>> +    if (!irqfd)
>> +        return -ENOMEM;
>> +
>> +    irqfd->kvm = kvm;
>>   
>
> You need to increase the refcount on struct kvm here.  Otherwise evil
> userspace will create an irqfd, close the vm and vcpu fds, and inject
> an interrupt.

Good catch.  Will fix.

Thanks Avi,
-Greg
Avi Kivity May 5, 2009, 5:43 p.m. UTC | #3
Gregory Haskins wrote:
>>>  
>>> +struct kvm_irqfd {
>>> +    __u32 gsi;
>>> +    __u32 flags;
>>> +};
>>> +
>>>   
>>>       
>> Please add some reserved space here.
>>     
>
> Ack.  Any rule of thumb here?  How about a "__u8 pad[16]" ?
>   

I'd round it up so the whole thing is 32 bytes (not that it matters).
Gregory Haskins May 5, 2009, 5:56 p.m. UTC | #4
Avi Kivity wrote:
> Gregory Haskins wrote:
>
>> +int
>> +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
>> +{
>> +    struct _irqfd *irqfd;
>> +    struct file *file = NULL;
>> +    int fd = -1;
>> +    int ret;
>> +
>> +    irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
>> +    if (!irqfd)
>> +        return -ENOMEM;
>> +
>> +    irqfd->kvm = kvm;
>>   
>
> You need to increase the refcount on struct kvm here.  Otherwise evil
> userspace will create an irqfd, close the vm and vcpu fds, and inject
> an interrupt.

I just reviewed the code in prep for v5, and now I remember why I didnt
take a reference:  I designed it the opposite direction:  the vm-fd owns
a reference to the irqfd, and will decouple the kvm context from the
eventfd on shutdown  (see kvm_irqfd_release()).   I still need to spin a
v5 regardless in order to add the padding as previously discussed.  But
let me know if you still see any holes in light of this alternate object
lifetime approach I am using.

-Greg
Avi Kivity May 5, 2009, 6:10 p.m. UTC | #5
Gregory Haskins wrote:
> Avi Kivity wrote:
>   
>> Gregory Haskins wrote:
>>
>>     
>>> +int
>>> +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
>>> +{
>>> +    struct _irqfd *irqfd;
>>> +    struct file *file = NULL;
>>> +    int fd = -1;
>>> +    int ret;
>>> +
>>> +    irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
>>> +    if (!irqfd)
>>> +        return -ENOMEM;
>>> +
>>> +    irqfd->kvm = kvm;
>>>   
>>>       
>> You need to increase the refcount on struct kvm here.  Otherwise evil
>> userspace will create an irqfd, close the vm and vcpu fds, and inject
>> an interrupt.
>>     
>
> I just reviewed the code in prep for v5, and now I remember why I didnt
> take a reference:  I designed it the opposite direction:  the vm-fd owns
> a reference to the irqfd, and will decouple the kvm context from the
> eventfd on shutdown  (see kvm_irqfd_release()).   I still need to spin a
> v5 regardless in order to add the padding as previously discussed.  But
> let me know if you still see any holes in light of this alternate object
> lifetime approach I am using.
>   

Right, irqfd_release works.  But I think refcounting is simpler, since 
we already kvm_get_kvm() and kvm_put_kvm(), and you wouldn't need the 
irqfd list.  On the other hand, I'm not sure you get a callback from 
eventfd on close(), so refcounting may not be implementable.

Drat, irqfd_release doesn't work.  You reference kvm->lock in 
irqfd_inject without taking any locks.

btw, there's still your original idea of creating the eventfd in 
userspace and passing it down.  That would be workable if we can see a 
way to both signal the eventfd and get called back in irq context.  
Maybe that's preferable to what we're doing here, but we need to see how 
it would work.
Gregory Haskins May 5, 2009, 6:21 p.m. UTC | #6
Avi Kivity wrote:
> Gregory Haskins wrote:
>> Avi Kivity wrote:
>>  
>>> Gregory Haskins wrote:
>>>
>>>    
>>>> +int
>>>> +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
>>>> +{
>>>> +    struct _irqfd *irqfd;
>>>> +    struct file *file = NULL;
>>>> +    int fd = -1;
>>>> +    int ret;
>>>> +
>>>> +    irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
>>>> +    if (!irqfd)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    irqfd->kvm = kvm;
>>>>         
>>> You need to increase the refcount on struct kvm here.  Otherwise evil
>>> userspace will create an irqfd, close the vm and vcpu fds, and inject
>>> an interrupt.
>>>     
>>
>> I just reviewed the code in prep for v5, and now I remember why I didnt
>> take a reference:  I designed it the opposite direction:  the vm-fd owns
>> a reference to the irqfd, and will decouple the kvm context from the
>> eventfd on shutdown  (see kvm_irqfd_release()).   I still need to spin a
>> v5 regardless in order to add the padding as previously discussed.  But
>> let me know if you still see any holes in light of this alternate object
>> lifetime approach I am using.
>>   
>
> Right, irqfd_release works.  But I think refcounting is simpler, since
> we already kvm_get_kvm() and kvm_put_kvm(), and you wouldn't need the
> irqfd list.  On the other hand, I'm not sure you get a callback from
> eventfd on close(), so refcounting may not be implementable.

;)

>
> Drat, irqfd_release doesn't work.  You reference kvm->lock in
> irqfd_inject without taking any locks.

I *think* this is ok, tho.  I remove myself from the waitq, and then
flush any potentially scheduled deferred work before returning.  This
all happens synchronously to the vm_release() code when the vm-fd is
bring dropped, but before we actually release the struct kvm*. 
Therefore, I think kvm->lock is guaranteed to remain valid for the
duration of the irqfd_release(), and we guarantee it wont be accessed
after the irqfd_release() completes.  Or do you have a different concern?

On this topic of proper ref counts, though....

I wonder if I need an extra fget() in there.  I presume that the
evenfd_file_create() returns with only a single reference, which
presumably I am handing one to userspace, and one to the irqfd.... which
is broken.  Or does fd_install() bump that for me (doesnt look like
it)?  Al, Davide, any comments?

>
> btw, there's still your original idea of creating the eventfd in
> userspace and passing it down.  That would be workable if we can see a
> way to both signal the eventfd and get called back in irq context. 
> Maybe that's preferable to what we're doing here, but we need to see
> how it would work.

We can do that, but I don't see it as changing the general problem
here.  However, I think if you find that the above comments about the
kvm->lock w.r.t. irqfd_release() are ok, we don't need to worry about
it.  If you prefer the userspace allocation of eventfd() for other
reasons, we can easily go back to that model as well...but its not
strictly necessary for this particular issue iiuc.

-Greg
Gregory Haskins May 6, 2009, 11:35 a.m. UTC | #7
Al, Davide,

Gregory Haskins wrote:
> +
> +int
> +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
> +{
> +	struct _irqfd *irqfd;
> +	struct file *file = NULL;
> +	int fd = -1;
> +	int ret;
> +
> +	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
> +	if (!irqfd)
> +		return -ENOMEM;
> +
> +	irqfd->kvm = kvm;
> +	irqfd->gsi = gsi;
> +	INIT_LIST_HEAD(&irqfd->list);
> +	INIT_WORK(&irqfd->work, irqfd_inject);
> +
> +	/*
> +	 * We re-use eventfd for irqfd, and therefore will embed the eventfd
> +	 * lifetime in the irqfd.
> +	 */
> +	file = eventfd_file_create(0, 0);
> +	if (IS_ERR(file)) {
> +		ret = PTR_ERR(file);
> +		goto fail;
> +	}
> +
> +	irqfd->file = file;
> +
> +	/*
> +	 * Install our own custom wake-up handling so we are notified via
> +	 * a callback whenever someone signals the underlying eventfd
> +	 */
> +	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
> +	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
> +
> +	ret = file->f_op->poll(file, &irqfd->pt);
> +	if (ret < 0)
> +		goto fail;
> +
> +	mutex_lock(&kvm->lock);
> +	list_add_tail(&irqfd->list, &kvm->irqfds);
> +	mutex_unlock(&kvm->lock);
> +
> +	ret = get_unused_fd();
> +	if (ret < 0)
> +		goto fail;
> +
> +	fd = ret;
> +
> +	fd_install(fd, file);
>   

Can you comment on whether this function needs to take an additional
reference on file here?  (one for the one held by userspace/fd, and the
other for irqfd->file)  My instinct is telling me this may be currently
broken, but I am not sure.

-Greg
Davide Libenzi May 6, 2009, 3:24 p.m. UTC | #8
On Wed, 6 May 2009, Gregory Haskins wrote:

> Al, Davide,
> 
> Gregory Haskins wrote:
> > +
> > +int
> > +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
> > +{
> > +	struct _irqfd *irqfd;
> > +	struct file *file = NULL;
> > +	int fd = -1;
> > +	int ret;
> > +
> > +	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
> > +	if (!irqfd)
> > +		return -ENOMEM;
> > +
> > +	irqfd->kvm = kvm;
> > +	irqfd->gsi = gsi;
> > +	INIT_LIST_HEAD(&irqfd->list);
> > +	INIT_WORK(&irqfd->work, irqfd_inject);
> > +
> > +	/*
> > +	 * We re-use eventfd for irqfd, and therefore will embed the eventfd
> > +	 * lifetime in the irqfd.
> > +	 */
> > +	file = eventfd_file_create(0, 0);
> > +	if (IS_ERR(file)) {
> > +		ret = PTR_ERR(file);
> > +		goto fail;
> > +	}
> > +
> > +	irqfd->file = file;
> > +
> > +	/*
> > +	 * Install our own custom wake-up handling so we are notified via
> > +	 * a callback whenever someone signals the underlying eventfd
> > +	 */
> > +	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
> > +	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
> > +
> > +	ret = file->f_op->poll(file, &irqfd->pt);
> > +	if (ret < 0)
> > +		goto fail;
> > +
> > +	mutex_lock(&kvm->lock);
> > +	list_add_tail(&irqfd->list, &kvm->irqfds);
> > +	mutex_unlock(&kvm->lock);
> > +
> > +	ret = get_unused_fd();
> > +	if (ret < 0)
> > +		goto fail;
> > +
> > +	fd = ret;
> > +
> > +	fd_install(fd, file);
> >   
> 
> Can you comment on whether this function needs to take an additional
> reference on file here?  (one for the one held by userspace/fd, and the
> other for irqfd->file)  My instinct is telling me this may be currently
> broken, but I am not sure.

I don't know exactly how it is used, but looks broken. If the fd is 
returned to userspace, and userspace closes the fd, you will leak the 
irqfd*. If you do an extra fget(), you will never know if the userspace 
closed the last-1 instance of the eventfd file*.
What is likely needed, is add a callback to eventfd_file_create(), so that 
the eventfd can call your callback on ->release, and give you a chance to 
perform proper cleanup of the irqfd*.



- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins May 6, 2009, 3:37 p.m. UTC | #9
Davide Libenzi wrote:
> On Wed, 6 May 2009, Gregory Haskins wrote:
>
>   
>> Al, Davide,
>>
>> Gregory Haskins wrote:
>>     
>>> +
>>> +int
>>> +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
>>> +{
>>> +	struct _irqfd *irqfd;
>>> +	struct file *file = NULL;
>>> +	int fd = -1;
>>> +	int ret;
>>> +
>>> +	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
>>> +	if (!irqfd)
>>> +		return -ENOMEM;
>>> +
>>> +	irqfd->kvm = kvm;
>>> +	irqfd->gsi = gsi;
>>> +	INIT_LIST_HEAD(&irqfd->list);
>>> +	INIT_WORK(&irqfd->work, irqfd_inject);
>>> +
>>> +	/*
>>> +	 * We re-use eventfd for irqfd, and therefore will embed the eventfd
>>> +	 * lifetime in the irqfd.
>>> +	 */
>>> +	file = eventfd_file_create(0, 0);
>>> +	if (IS_ERR(file)) {
>>> +		ret = PTR_ERR(file);
>>> +		goto fail;
>>> +	}
>>> +
>>> +	irqfd->file = file;
>>> +
>>> +	/*
>>> +	 * Install our own custom wake-up handling so we are notified via
>>> +	 * a callback whenever someone signals the underlying eventfd
>>> +	 */
>>> +	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
>>> +	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
>>> +
>>> +	ret = file->f_op->poll(file, &irqfd->pt);
>>> +	if (ret < 0)
>>> +		goto fail;
>>> +
>>> +	mutex_lock(&kvm->lock);
>>> +	list_add_tail(&irqfd->list, &kvm->irqfds);
>>> +	mutex_unlock(&kvm->lock);
>>> +
>>> +	ret = get_unused_fd();
>>> +	if (ret < 0)
>>> +		goto fail;
>>> +
>>> +	fd = ret;
>>> +
>>> +	fd_install(fd, file);
>>>   
>>>       
>> Can you comment on whether this function needs to take an additional
>> reference on file here?  (one for the one held by userspace/fd, and the
>> other for irqfd->file)  My instinct is telling me this may be currently
>> broken, but I am not sure.
>>     
>
> I don't know exactly how it is used, but looks broken.
Yeah, I think it is.  I addressed this in v5 so please review that
version if you have a moment.

>  If the fd is 
> returned to userspace, and userspace closes the fd, you will leak the 
> irqfd*. If you do an extra fget(), you will never know if the userspace 
> closed the last-1 instance of the eventfd file*.
> What is likely needed, is add a callback to eventfd_file_create(), so that 
> the eventfd can call your callback on ->release, and give you a chance to 
> perform proper cleanup of the irqfd*.
>   

I think we are ok in this regard (at least in v5) without the callback. 
kvm holds irqfd, which holds eventfd.  In a normal situation, we will
have eventfd with 2 references.  If userspace closes the eventfd, it
will drop 1 of the 2 eventfd file references, but the object should
remain intact as long as kvm still holds it as well.  When the kvm-fd is
released, we will then decouple from the eventfd->wqh and drop the last
fput(), officially freeing it.

Likewise, if kvm is closed before the eventfd, we will simply decouple
from the wqh and fput(eventfd), leaving the last reference held by
userspace until it closes as well.

Let me know if you see any holes in that.

Thanks Davide,
-Greg
Davide Libenzi May 7, 2009, 1:34 a.m. UTC | #10
On Wed, 6 May 2009, Gregory Haskins wrote:

> I think we are ok in this regard (at least in v5) without the callback. 
> kvm holds irqfd, which holds eventfd.  In a normal situation, we will
> have eventfd with 2 references.  If userspace closes the eventfd, it
> will drop 1 of the 2 eventfd file references, but the object should
> remain intact as long as kvm still holds it as well.  When the kvm-fd is
> released, we will then decouple from the eventfd->wqh and drop the last
> fput(), officially freeing it.
> 
> Likewise, if kvm is closed before the eventfd, we will simply decouple
> from the wqh and fput(eventfd), leaving the last reference held by
> userspace until it closes as well.
> 
> Let me know if you see any holes in that.

Looks OK, modulo my knowledge of KVM internals.


- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins May 7, 2009, 2:06 a.m. UTC | #11
Davide Libenzi wrote:
> On Wed, 6 May 2009, Gregory Haskins wrote:
>
>   
>> I think we are ok in this regard (at least in v5) without the callback. 
>> kvm holds irqfd, which holds eventfd.  In a normal situation, we will
>> have eventfd with 2 references.  If userspace closes the eventfd, it
>> will drop 1 of the 2 eventfd file references, but the object should
>> remain intact as long as kvm still holds it as well.  When the kvm-fd is
>> released, we will then decouple from the eventfd->wqh and drop the last
>> fput(), officially freeing it.
>>
>> Likewise, if kvm is closed before the eventfd, we will simply decouple
>> from the wqh and fput(eventfd), leaving the last reference held by
>> userspace until it closes as well.
>>
>> Let me know if you see any holes in that.
>>     
>
> Looks OK, modulo my knowledge of KVM internals.
>   

Thanks Davide,

If there isn't any more feedback on the series from Al, Avi,
etc...please formally submit your eventfd patch so this series is
available for Avi to pull in for inclusion when/if he deems it fit.

Thanks again,
-Greg
Avi Kivity May 7, 2009, 9:48 a.m. UTC | #12
Davide Libenzi wrote:
> On Wed, 6 May 2009, Gregory Haskins wrote:
>
>   
>> I think we are ok in this regard (at least in v5) without the callback. 
>> kvm holds irqfd, which holds eventfd.  In a normal situation, we will
>> have eventfd with 2 references.  If userspace closes the eventfd, it
>> will drop 1 of the 2 eventfd file references, but the object should
>> remain intact as long as kvm still holds it as well.  When the kvm-fd is
>> released, we will then decouple from the eventfd->wqh and drop the last
>> fput(), officially freeing it.
>>
>> Likewise, if kvm is closed before the eventfd, we will simply decouple
>> from the wqh and fput(eventfd), leaving the last reference held by
>> userspace until it closes as well.
>>
>> Let me know if you see any holes in that.
>>     
>
> Looks OK, modulo my knowledge of KVM internals.
>   

What's your take on adding irq context safe callbacks to irqfd?

To give some background here, we would like to use eventfd as a generic 
connector between components, so the components do not know about each 
other.  So far eventfd successfully abstracts among components in the 
same process, in different processes, and in the kernel.

eventfd_signal() can be safely called from irq context, and will wake up 
a waiting task.  But in some cases, if the consumer is in the kernel, it 
may be able to consume the event from irq context, saving a context switch.

So, will you consider patches adding this capability to eventfd?
Marcelo Tosatti May 7, 2009, 1:46 p.m. UTC | #13
On Thu, May 07, 2009 at 12:48:21PM +0300, Avi Kivity wrote:
> Davide Libenzi wrote:
>> On Wed, 6 May 2009, Gregory Haskins wrote:
>>
>>   
>>> I think we are ok in this regard (at least in v5) without the 
>>> callback. kvm holds irqfd, which holds eventfd.  In a normal 
>>> situation, we will
>>> have eventfd with 2 references.  If userspace closes the eventfd, it
>>> will drop 1 of the 2 eventfd file references, but the object should
>>> remain intact as long as kvm still holds it as well.  When the kvm-fd is
>>> released, we will then decouple from the eventfd->wqh and drop the last
>>> fput(), officially freeing it.
>>>
>>> Likewise, if kvm is closed before the eventfd, we will simply decouple
>>> from the wqh and fput(eventfd), leaving the last reference held by
>>> userspace until it closes as well.
>>>
>>> Let me know if you see any holes in that.
>>>     
>>
>> Looks OK, modulo my knowledge of KVM internals.
>>   
>
> What's your take on adding irq context safe callbacks to irqfd?
>
> To give some background here, we would like to use eventfd as a generic  
> connector between components, so the components do not know about each  
> other.  So far eventfd successfully abstracts among components in the  
> same process, in different processes, and in the kernel.
>
> eventfd_signal() can be safely called from irq context, and will wake up  
> a waiting task.  But in some cases, if the consumer is in the kernel, it  
> may be able to consume the event from irq context, saving a context 
> switch.
>
> So, will you consider patches adding this capability to eventfd?

(pasting from a separate thread)

> That's my thinking.  PCI interrupts don't work because we need to do  
> some hacky stuff in there, but MSI should.  Oh, and we could improve
> UIO  
> support for interrupts when using MSI, since there's no need to  
> acknowledge the interrupt.

Ok, so for INTx assigned devices all you need to do on the ACK handler
is to re-enable the host interrupt (and set the guest interrupt line to
low).

Right now the ack comes through a kvm internal irq ack callback.

AFAICS there is no mechanism in irqfd for ACK notification, and
interrupt injection is edge triggered.

So for PCI INTx assigned devices (or any INTx level), you'd want to keep
the guest interrupt high, with some way to notify the ACK.

Avi mentioned a separate irqfd to notify the ACK. For assigned devices,
you could register a fd wakeup function in that fd, which replaces the
current irq ACK callback?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins May 7, 2009, 2:01 p.m. UTC | #14
Marcelo Tosatti wrote:
> On Thu, May 07, 2009 at 12:48:21PM +0300, Avi Kivity wrote:
>   
>> Davide Libenzi wrote:
>>     
>>> On Wed, 6 May 2009, Gregory Haskins wrote:
>>>
>>>   
>>>       
>>>> I think we are ok in this regard (at least in v5) without the 
>>>> callback. kvm holds irqfd, which holds eventfd.  In a normal 
>>>> situation, we will
>>>> have eventfd with 2 references.  If userspace closes the eventfd, it
>>>> will drop 1 of the 2 eventfd file references, but the object should
>>>> remain intact as long as kvm still holds it as well.  When the kvm-fd is
>>>> released, we will then decouple from the eventfd->wqh and drop the last
>>>> fput(), officially freeing it.
>>>>
>>>> Likewise, if kvm is closed before the eventfd, we will simply decouple
>>>> from the wqh and fput(eventfd), leaving the last reference held by
>>>> userspace until it closes as well.
>>>>
>>>> Let me know if you see any holes in that.
>>>>     
>>>>         
>>> Looks OK, modulo my knowledge of KVM internals.
>>>   
>>>       
>> What's your take on adding irq context safe callbacks to irqfd?
>>
>> To give some background here, we would like to use eventfd as a generic  
>> connector between components, so the components do not know about each  
>> other.  So far eventfd successfully abstracts among components in the  
>> same process, in different processes, and in the kernel.
>>
>> eventfd_signal() can be safely called from irq context, and will wake up  
>> a waiting task.  But in some cases, if the consumer is in the kernel, it  
>> may be able to consume the event from irq context, saving a context 
>> switch.
>>
>> So, will you consider patches adding this capability to eventfd?
>>     
>
> (pasting from a separate thread)
>
>   
>> That's my thinking.  PCI interrupts don't work because we need to do  
>> some hacky stuff in there, but MSI should.  Oh, and we could improve
>> UIO  
>> support for interrupts when using MSI, since there's no need to  
>> acknowledge the interrupt.
>>     
>
> Ok, so for INTx assigned devices all you need to do on the ACK handler
> is to re-enable the host interrupt (and set the guest interrupt line to
> low).
>
> Right now the ack comes through a kvm internal irq ack callback.
>
> AFAICS there is no mechanism in irqfd for ACK notification, and
> interrupt injection is edge triggered.
>
> So for PCI INTx assigned devices (or any INTx level), you'd want to keep
> the guest interrupt high, with some way to notify the ACK.
>
> Avi mentioned a separate irqfd to notify the ACK. For assigned devices,
> you could register a fd wakeup function in that fd, which replaces the
> current irq ACK callback?
>   

One thing I was thinking here was that I could create a flag for the
kvm_irqfd() function for something like "KVM_IRQFD_MODE_CLEAR".  This
flag when specified at creation time will cause the event to execute a
clear operation instead of a set when triggered.  That way, the default
mode is an edge-triggered set.  The non-default mode is to trigger a
clear.  Level-triggered ints could therefore create two irqfds, one for
raising, the other for clearing.

An alternative is to abandon the use of eventfd, and allow the irqfd to
be a first-class anon-fd.  The parameters passed to the write/signal()
function could then indicate the desired level.  The disadvantage would
be that it would not be compatible with eventfd, so we would need to
decide if the tradeoff is worth it.

OTOH, I suspect level triggered interrupts will be primarily in the
legacy domain, so perhaps we do not need to worry about it too much. 
Therefore, another option is that we *could* simply set the stake in the
ground that legacy/level cannot use irqfd.

Thoughts?

-Greg
Avi Kivity May 7, 2009, 2:34 p.m. UTC | #15
Gregory Haskins wrote:
> One thing I was thinking here was that I could create a flag for the
> kvm_irqfd() function for something like "KVM_IRQFD_MODE_CLEAR".  This
> flag when specified at creation time will cause the event to execute a
> clear operation instead of a set when triggered.  That way, the default
> mode is an edge-triggered set.  The non-default mode is to trigger a
> clear.  Level-triggered ints could therefore create two irqfds, one for
> raising, the other for clearing.
>   

That's my second choice option.

> An alternative is to abandon the use of eventfd, and allow the irqfd to
> be a first-class anon-fd.  The parameters passed to the write/signal()
> function could then indicate the desired level.  The disadvantage would
> be that it would not be compatible with eventfd, so we would need to
> decide if the tradeoff is worth it.
>   

I would really like to keep using eventfd.  Which is why I asked Davide 
about the prospects of direct callbacks (vs wakeups).

> OTOH, I suspect level triggered interrupts will be primarily in the
> legacy domain, so perhaps we do not need to worry about it too much. 
> Therefore, another option is that we *could* simply set the stake in the
> ground that legacy/level cannot use irqfd.
>   

This is my preferred option.  For a virtio-net-server in the kernel, 
we'd service its eventfd in qemu, raising and lowering the pci interrupt 
in the traditional way.

But we'd still need to know when to lower the interrupt.  How?
Davide Libenzi May 7, 2009, 2:46 p.m. UTC | #16
On Thu, 7 May 2009, Avi Kivity wrote:

> What's your take on adding irq context safe callbacks to irqfd?
> 
> To give some background here, we would like to use eventfd as a generic
> connector between components, so the components do not know about each other.
> So far eventfd successfully abstracts among components in the same process, in
> different processes, and in the kernel.
> 
> eventfd_signal() can be safely called from irq context, and will wake up a
> waiting task.  But in some cases, if the consumer is in the kernel, it may be
> able to consume the event from irq context, saving a context switch.
> 
> So, will you consider patches adding this capability to eventfd?

Maybe I got lost in the thread, but inside the kernel we have 
callback-based wakeup since long time. This is what epoll uses, when 
hooking into the file* f_op->poll() subsystem.
Did you mean something else?


- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins May 7, 2009, 2:54 p.m. UTC | #17
Avi Kivity wrote:
> Gregory Haskins wrote:
>> One thing I was thinking here was that I could create a flag for the
>> kvm_irqfd() function for something like "KVM_IRQFD_MODE_CLEAR".  This
>> flag when specified at creation time will cause the event to execute a
>> clear operation instead of a set when triggered.  That way, the default
>> mode is an edge-triggered set.  The non-default mode is to trigger a
>> clear.  Level-triggered ints could therefore create two irqfds, one for
>> raising, the other for clearing.
>>   
>
> That's my second choice option.
>
>> An alternative is to abandon the use of eventfd, and allow the irqfd to
>> be a first-class anon-fd.  The parameters passed to the write/signal()
>> function could then indicate the desired level.  The disadvantage would
>> be that it would not be compatible with eventfd, so we would need to
>> decide if the tradeoff is worth it.
>>   
>
> I would really like to keep using eventfd.  Which is why I asked
> Davide about the prospects of direct callbacks (vs wakeups).

I saw that request.  That would be ideal.

>
>> OTOH, I suspect level triggered interrupts will be primarily in the
>> legacy domain, so perhaps we do not need to worry about it too much.
>> Therefore, another option is that we *could* simply set the stake in the
>> ground that legacy/level cannot use irqfd.
>>   
>
> This is my preferred option.  For a virtio-net-server in the kernel,
> we'd service its eventfd in qemu, raising and lowering the pci
> interrupt in the traditional way.
>
> But we'd still need to know when to lower the interrupt.  How?

IIUC, isn't that  usually device/subsystem specific, and out of scope of
the GSI delivery vehicle?  For instance, most devices I have seen with
level ints have a register in their device register namespace for acking
the int.  As an aside, this is what causes some of the grief in dealing
with shared interrupts like KVM pass-through and/or threaded-isrs:  
There isn't a standardized way to ACK them.

You may also see some generalization of masking/acking in things like
the MSI-X table.  But again, this would be out of scope of the general
GSI delivery path IIUC.

I understand that there is a feedback mechanism in the ioapic model for
calling back on acknowledgment of the interrupt.  But I am not sure what
is how the real hardware works normally, and therefore I am not
convinced that is something we need to feed all the way back (i.e. via
irqfd or whatever).  In the interest of full disclosure, its been a few
years since I studied the xAPIC docs, so I might be out to lunch on that
assertion. ;)

-Greg
Avi Kivity May 7, 2009, 3:26 p.m. UTC | #18
Gregory Haskins wrote:

  

>> This is my preferred option.  For a virtio-net-server in the kernel,
>> we'd service its eventfd in qemu, raising and lowering the pci
>> interrupt in the traditional way.
>>
>> But we'd still need to know when to lower the interrupt.  How?
>>     
>
> IIUC, isn't that  usually device/subsystem specific, and out of scope of
> the GSI delivery vehicle?  For instance, most devices I have seen with
> level ints have a register in their device register namespace for acking
> the int.  

Yes it is.

> As an aside, this is what causes some of the grief in dealing
> with shared interrupts like KVM pass-through and/or threaded-isrs:  
> There isn't a standardized way to ACK them.
>   

So we'd need a side channel to tell userspace to lower the irq.  Another 
eventfd likely.

Note we don't support device assignment for devices with shared interrupts.

> You may also see some generalization of masking/acking in things like
> the MSI-X table.  But again, this would be out of scope of the general
> GSI delivery path IIUC.
>
> I understand that there is a feedback mechanism in the ioapic model for
> calling back on acknowledgment of the interrupt.  But I am not sure what
> is how the real hardware works normally, and therefore I am not
> convinced that is something we need to feed all the way back (i.e. via
> irqfd or whatever).  In the interest of full disclosure, its been a few
> years since I studied the xAPIC docs, so I might be out to lunch on that
> assertion. ;)
>   

Right, that ack thing is completely internal, used for catching up on 
time drift, and for shutting down level triggered assigned interrupts.
Avi Kivity May 7, 2009, 3:47 p.m. UTC | #19
Davide Libenzi wrote:
>   
>> What's your take on adding irq context safe callbacks to irqfd?
>>
>> To give some background here, we would like to use eventfd as a generic
>> connector between components, so the components do not know about each other.
>> So far eventfd successfully abstracts among components in the same process, in
>> different processes, and in the kernel.
>>
>> eventfd_signal() can be safely called from irq context, and will wake up a
>> waiting task.  But in some cases, if the consumer is in the kernel, it may be
>> able to consume the event from irq context, saving a context switch.
>>
>> So, will you consider patches adding this capability to eventfd?
>>     
>
> Maybe I got lost in the thread, but inside the kernel we have 
> callback-based wakeup since long time. This is what epoll uses, when 
> hooking into the file* f_op->poll() subsystem.
> Did you mean something else?
>   

Do you mean wait_queue_t::func?
Davide Libenzi May 7, 2009, 4:44 p.m. UTC | #20
On Thu, 7 May 2009, Avi Kivity wrote:

> Davide Libenzi wrote:
> >   
> > > What's your take on adding irq context safe callbacks to irqfd?
> > > 
> > > To give some background here, we would like to use eventfd as a generic
> > > connector between components, so the components do not know about each
> > > other.
> > > So far eventfd successfully abstracts among components in the same
> > > process, in
> > > different processes, and in the kernel.
> > > 
> > > eventfd_signal() can be safely called from irq context, and will wake up a
> > > waiting task.  But in some cases, if the consumer is in the kernel, it may
> > > be
> > > able to consume the event from irq context, saving a context switch.
> > > 
> > > So, will you consider patches adding this capability to eventfd?
> > >     
> > 
> > Maybe I got lost in the thread, but inside the kernel we have callback-based
> > wakeup since long time. This is what epoll uses, when hooking into the file*
> > f_op->poll() subsystem.
> > Did you mean something else?
> >   
> 
> Do you mean wait_queue_t::func?

Yes, it is since long time ago that a wakeup in Linux can lead either to a 
real wakeup (in scheduler terms), or to a simple function call.
Isn't that enough?


- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity May 7, 2009, 6:12 p.m. UTC | #21
Davide Libenzi wrote:
> On Thu, 7 May 2009, Avi Kivity wrote:
>
>   
>> Davide Libenzi wrote:
>>     
>>>   
>>>       
>>>> What's your take on adding irq context safe callbacks to irqfd?
>>>>
>>>> To give some background here, we would like to use eventfd as a generic
>>>> connector between components, so the components do not know about each
>>>> other.
>>>> So far eventfd successfully abstracts among components in the same
>>>> process, in
>>>> different processes, and in the kernel.
>>>>
>>>> eventfd_signal() can be safely called from irq context, and will wake up a
>>>> waiting task.  But in some cases, if the consumer is in the kernel, it may
>>>> be
>>>> able to consume the event from irq context, saving a context switch.
>>>>
>>>> So, will you consider patches adding this capability to eventfd?
>>>>     
>>>>         
>>> Maybe I got lost in the thread, but inside the kernel we have callback-based
>>> wakeup since long time. This is what epoll uses, when hooking into the file*
>>> f_op->poll() subsystem.
>>> Did you mean something else?
>>>   
>>>       
>> Do you mean wait_queue_t::func?
>>     
>
> Yes, it is since long time ago that a wakeup in Linux can lead either to a 
> real wakeup (in scheduler terms), or to a simple function call.
> Isn't that enough?
>
>   

It's perfect.
Davide Libenzi May 8, 2009, 3:13 a.m. UTC | #22
On Thu, 7 May 2009, Avi Kivity wrote:

> What's your take on adding irq context safe callbacks to irqfd?
> 
> To give some background here, we would like to use eventfd as a generic
> connector between components, so the components do not know about each other.
> So far eventfd successfully abstracts among components in the same process, in
> different processes, and in the kernel.
> 
> eventfd_signal() can be safely called from irq context, and will wake up a
> waiting task.  But in some cases, if the consumer is in the kernel, it may be
> able to consume the event from irq context, saving a context switch.
> 
> So, will you consider patches adding this capability to eventfd?

Since I received this one after your ACK on the capability of eventfd of 
triggering callbacks, I assume we're clear on this point, aren't we?


- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity May 8, 2009, 8:19 a.m. UTC | #23
Davide Libenzi wrote:
> On Thu, 7 May 2009, Avi Kivity wrote:
>
>   
>> What's your take on adding irq context safe callbacks to irqfd?
>>
>> To give some background here, we would like to use eventfd as a generic
>> connector between components, so the components do not know about each other.
>> So far eventfd successfully abstracts among components in the same process, in
>> different processes, and in the kernel.
>>
>> eventfd_signal() can be safely called from irq context, and will wake up a
>> waiting task.  But in some cases, if the consumer is in the kernel, it may be
>> able to consume the event from irq context, saving a context switch.
>>
>> So, will you consider patches adding this capability to eventfd?
>>     
>
> Since I received this one after your ACK on the capability of eventfd of 
> triggering callbacks, I assume we're clear on this point, aren't we?
>   

We are.
Davide Libenzi May 8, 2009, 3:06 p.m. UTC | #24
On Wed, 6 May 2009, Gregory Haskins wrote:

> If there isn't any more feedback on the series from Al, Avi,
> etc...please formally submit your eventfd patch so this series is
> available for Avi to pull in for inclusion when/if he deems it fit.

Did you decide you will be using those bits?


- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins May 12, 2009, 3:55 a.m. UTC | #25
Davide Libenzi wrote:
> On Wed, 6 May 2009, Gregory Haskins wrote:
>
>   
>> If there isn't any more feedback on the series from Al, Avi,
>> etc...please formally submit your eventfd patch so this series is
>> available for Avi to pull in for inclusion when/if he deems it fit.
>>     
>
> Did you decide you will be using those bits?
>   

Hi Davide,
  It appears that we will not end up needing them since the new version
I am about to push goes back to creating the eventfd in userspace to
begin with, per Avi's request.  Regardless, thank you kindly for your
help here, and also for your poll suggestion which we will definitely be
using.

Regards,
-Greg
Davide Libenzi May 12, 2009, 6:55 a.m. UTC | #26
On Mon, 11 May 2009, Gregory Haskins wrote:

> Davide Libenzi wrote:
> > On Wed, 6 May 2009, Gregory Haskins wrote:
> >
> >   
> >> If there isn't any more feedback on the series from Al, Avi,
> >> etc...please formally submit your eventfd patch so this series is
> >> available for Avi to pull in for inclusion when/if he deems it fit.
> >>     
> >
> > Did you decide you will be using those bits?
> >   
> 
> Hi Davide,
>   It appears that we will not end up needing them since the new version
> I am about to push goes back to creating the eventfd in userspace to
> begin with, per Avi's request.

That looks like a good idea.


- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4ef..d5fff51 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,7 +3,7 @@ 
 #
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-                coalesced_mmio.o irq_comm.o)
+                coalesced_mmio.o irq_comm.o irqfd.o)
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2d7082c..699a407 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1027,6 +1027,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
 	case KVM_CAP_ASSIGN_DEV_IRQ:
+	case KVM_CAP_IRQFD:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 3db5d8d..5e9b861 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -415,6 +415,7 @@  struct kvm_trace_rec {
 #define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#define KVM_CAP_IRQFD 31
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -454,6 +455,11 @@  struct kvm_irq_routing {
 
 #endif
 
+struct kvm_irqfd {
+	__u32 gsi;
+	__u32 flags;
+};
+
 /*
  * ioctls for VM fds
  */
@@ -498,6 +504,7 @@  struct kvm_irq_routing {
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
 			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 #define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
+#define KVM_IRQFD                  _IOW(KVMIO, 0x76, struct kvm_irqfd)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 095ebb6..6a8d1c1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -134,6 +134,7 @@  struct kvm {
 	struct list_head vm_list;
 	struct kvm_io_bus mmio_bus;
 	struct kvm_io_bus pio_bus;
+	struct list_head irqfds;
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
 	atomic_t users_count;
@@ -524,4 +525,7 @@  static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 #endif
 
+int kvm_irqfd(struct kvm *kvm, int gsi, int flags);
+void kvm_irqfd_release(struct kvm *kvm);
+
 #endif
diff --git a/virt/kvm/irqfd.c b/virt/kvm/irqfd.c
new file mode 100644
index 0000000..6c82fcb
--- /dev/null
+++ b/virt/kvm/irqfd.c
@@ -0,0 +1,159 @@ 
+/*
+ * irqfd: Allows an eventfd to be used to inject an interrupt to the guest
+ *
+ * Credit goes to Avi Kivity for the original idea.
+ *
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ *	Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/eventfd.h>
+#include <linux/workqueue.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/list.h>
+
+struct _irqfd {
+	struct kvm               *kvm;
+	int                       gsi;
+	struct file              *file;
+	struct list_head          list;
+	poll_table                pt;
+	wait_queue_head_t        *wqh;
+	wait_queue_t              wait;
+	struct work_struct        work;
+};
+
+static void
+irqfd_inject(struct work_struct *work)
+{
+	struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
+	struct kvm *kvm = irqfd->kvm;
+
+	mutex_lock(&kvm->lock);
+	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+	mutex_unlock(&kvm->lock);
+}
+
+static int
+irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
+
+	/*
+	 * The eventfd calls its wake_up with interrupts disabled, and
+	 * eventfd_signal() may be called from interrupt context. Therefore
+	 * we need to defer the IRQ injection until later since we need to
+	 * acquire the kvm->lock to do so.
+	 */
+	schedule_work(&irqfd->work);
+
+	return 0;
+}
+
+static void
+irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
+			poll_table *pt)
+{
+	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+
+	irqfd->wqh = wqh;
+	add_wait_queue(wqh, &irqfd->wait);
+}
+
+int
+kvm_irqfd(struct kvm *kvm, int gsi, int flags)
+{
+	struct _irqfd *irqfd;
+	struct file *file = NULL;
+	int fd = -1;
+	int ret;
+
+	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+	if (!irqfd)
+		return -ENOMEM;
+
+	irqfd->kvm = kvm;
+	irqfd->gsi = gsi;
+	INIT_LIST_HEAD(&irqfd->list);
+	INIT_WORK(&irqfd->work, irqfd_inject);
+
+	/*
+	 * We re-use eventfd for irqfd, and therefore will embed the eventfd
+	 * lifetime in the irqfd.
+	 */
+	file = eventfd_file_create(0, 0);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto fail;
+	}
+
+	irqfd->file = file;
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via
+	 * a callback whenever someone signals the underlying eventfd
+	 */
+	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
+	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
+
+	ret = file->f_op->poll(file, &irqfd->pt);
+	if (ret < 0)
+		goto fail;
+
+	mutex_lock(&kvm->lock);
+	list_add_tail(&irqfd->list, &kvm->irqfds);
+	mutex_unlock(&kvm->lock);
+
+	ret = get_unused_fd();
+	if (ret < 0)
+		goto fail;
+
+	fd = ret;
+
+	fd_install(fd, file);
+
+	return fd;
+
+fail:
+	if (file && !IS_ERR(file))
+		fput(file);
+
+	kfree(irqfd);
+	return ret;
+}
+
+void
+kvm_irqfd_release(struct kvm *kvm)
+{
+	struct _irqfd *irqfd, *tmp;
+
+	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
+		remove_wait_queue(irqfd->wqh, &irqfd->wait);
+
+		flush_work(&irqfd->work);
+		fput(irqfd->file);
+
+		list_del(&irqfd->list);
+		kfree(irqfd);
+	}
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2b73e19..8b3b06a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -972,6 +972,7 @@  static struct kvm *kvm_create_vm(void)
 	atomic_inc(&kvm->mm->mm_count);
 	spin_lock_init(&kvm->mmu_lock);
 	kvm_io_bus_init(&kvm->pio_bus);
+	INIT_LIST_HEAD(&kvm->irqfds);
 	mutex_init(&kvm->lock);
 	kvm_io_bus_init(&kvm->mmio_bus);
 	init_rwsem(&kvm->slots_lock);
@@ -1023,6 +1024,7 @@  static void kvm_destroy_vm(struct kvm *kvm)
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
+	kvm_irqfd_release(kvm);
 	kvm_free_irq_routing(kvm);
 	kvm_io_bus_destroy(&kvm->pio_bus);
 	kvm_io_bus_destroy(&kvm->mmio_bus);
@@ -2197,6 +2199,15 @@  static long kvm_vm_ioctl(struct file *filp,
 	}
 #endif
 #endif /* KVM_CAP_IRQ_ROUTING */
+	case KVM_IRQFD: {
+		struct kvm_irqfd data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_irqfd(kvm, data.gsi, data.flags);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}