diff mbox

[KVM-RFC,1/2] eventfd: add an explicit srcu based notifier interface

Message ID 20090616022956.23890.63776.stgit@dev.haskins.net (mailing list archive)
State New, archived
Headers show

Commit Message

Gregory Haskins June 16, 2009, 2:29 a.m. UTC
irqfd and its underlying implementation, eventfd, currently utilize
the embedded wait-queue in eventfd for signal notification.  The nice thing
about this design decision is that it re-uses the existing
eventfd/wait-queue code and it generally works well....with several
limitations.

One of the limitations is that notification callbacks are always called
inside a spin_lock_irqsave critical section.  Another limitation is
that it is very difficult to build a system that can receive release
notification without being racy.

Therefore, we introduce a new registration interface that is SRCU based
instead of wait-queue based, and implement the internal wait-queue
infrastructure in terms of this new interface.  We then convert irqfd
to use this new interface instead of the existing wait-queue code.

The end result is that we now have the opportunity to run the interrupt
injection code serially to the callback (when the signal is raised from
process-context, at least) instead of always deferring the injection to a
work-queue.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Davide Libenzi <davidel@xmailserver.org>
---

 fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
 include/linux/eventfd.h |   30 ++++++++++++
 virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
 3 files changed, 188 insertions(+), 71 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Michael S. Tsirkin June 16, 2009, 2:02 p.m. UTC | #1
On Mon, Jun 15, 2009 at 10:29:56PM -0400, Gregory Haskins wrote:
> irqfd and its underlying implementation, eventfd, currently utilize
> the embedded wait-queue in eventfd for signal notification.  The nice thing
> about this design decision is that it re-uses the existing
> eventfd/wait-queue code and it generally works well....with several
> limitations.
> 
> One of the limitations is that notification callbacks are always called
> inside a spin_lock_irqsave critical section.  Another limitation is
> that it is very difficult to build a system that can recieve release
> notification without being racy.
> 
> Therefore, we introduce a new registration interface that is SRCU based
> instead of wait-queue based, and implement the internal wait-queue
> infrastructure in terms of this new interface.  We then convert irqfd
> to use this new interface instead of the existing wait-queue code.
> 
> The end result is that we now have the opportunity to run the interrupt
> injection code serially to the callback (when the signal is raised from
> process-context, at least) instead of always deferring the injection to a
> work-queue.
> 
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> CC: Davide Libenzi <davidel@xmailserver.org>
> ---
> 
>  fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
>  include/linux/eventfd.h |   30 ++++++++++++
>  virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
>  3 files changed, 188 insertions(+), 71 deletions(-)
> 
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index 72f5f8d..505d5de 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -30,8 +30,47 @@ struct eventfd_ctx {
>  	 */
>  	__u64 count;
>  	unsigned int flags;
> +	struct srcu_struct srcu;
> +	struct list_head nh;
> +	struct eventfd_notifier notifier;
>  };
>  
> +static void _eventfd_wqh_notify(struct eventfd_notifier *en)
> +{
> +	struct eventfd_ctx *ctx = container_of(en,
> +					       struct eventfd_ctx,
> +					       notifier);
> +
> +	if (waitqueue_active(&ctx->wqh))
> +		wake_up_poll(&ctx->wqh, POLLIN);
> +}
> +
> +static void _eventfd_notify(struct eventfd_ctx *ctx)
> +{
> +	struct eventfd_notifier *en;
> +	int idx;
> +
> +	idx = srcu_read_lock(&ctx->srcu);
> +
> +	/*
> +	 * The goal here is to allow the notification to be preemptible
> +	 * as often as possible.  We cannot achieve this with the basic
> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
> +	 * we have an internal srcu list mechanism of which the wqh is
> +	 * a client.
> +	 *
> +	 * Not all paths will invoke this function in process context.
> +	 * Callers should check for suitable state before assuming they
> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
> +	 * the code within the critical section is also compatible.
> +	 */
> +	list_for_each_entry_rcu(en, &ctx->nh, list)
> +		en->ops->signal(en);
> +
> +	srcu_read_unlock(&ctx->srcu, idx);
> +}
> +
>  /*
>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
>   * success, or a value lower then "n" in case of coutner overflow.

This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.

Further, to do useful things it might not be enough that you can sleep:
with iofd you also want to access current task with e.g. copy from user.

Here's an idea: let's pass a flag to ->signal, along the lines of
signal_is_task, that tells us that it is safe to use current, and add
eventfd_signal_task() which is the same as eventfd_signal but lets everyone
know that it's safe to both sleep and use current->mm.

Makes sense?
Gregory Haskins June 16, 2009, 2:11 p.m. UTC | #2
Michael S. Tsirkin wrote:
> On Mon, Jun 15, 2009 at 10:29:56PM -0400, Gregory Haskins wrote:
>   
>> irqfd and its underlying implementation, eventfd, currently utilize
>> the embedded wait-queue in eventfd for signal notification.  The nice thing
>> about this design decision is that it re-uses the existing
>> eventfd/wait-queue code and it generally works well....with several
>> limitations.
>>
>> One of the limitations is that notification callbacks are always called
>> inside a spin_lock_irqsave critical section.  Another limitation is
>> that it is very difficult to build a system that can recieve release
>> notification without being racy.
>>
>> Therefore, we introduce a new registration interface that is SRCU based
>> instead of wait-queue based, and implement the internal wait-queue
>> infrastructure in terms of this new interface.  We then convert irqfd
>> to use this new interface instead of the existing wait-queue code.
>>
>> The end result is that we now have the opportunity to run the interrupt
>> injection code serially to the callback (when the signal is raised from
>> process-context, at least) instead of always deferring the injection to a
>> work-queue.
>>
>> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
>> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>> CC: Davide Libenzi <davidel@xmailserver.org>
>> ---
>>
>>  fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
>>  include/linux/eventfd.h |   30 ++++++++++++
>>  virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
>>  3 files changed, 188 insertions(+), 71 deletions(-)
>>
>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>> index 72f5f8d..505d5de 100644
>> --- a/fs/eventfd.c
>> +++ b/fs/eventfd.c
>> @@ -30,8 +30,47 @@ struct eventfd_ctx {
>>  	 */
>>  	__u64 count;
>>  	unsigned int flags;
>> +	struct srcu_struct srcu;
>> +	struct list_head nh;
>> +	struct eventfd_notifier notifier;
>>  };
>>  
>> +static void _eventfd_wqh_notify(struct eventfd_notifier *en)
>> +{
>> +	struct eventfd_ctx *ctx = container_of(en,
>> +					       struct eventfd_ctx,
>> +					       notifier);
>> +
>> +	if (waitqueue_active(&ctx->wqh))
>> +		wake_up_poll(&ctx->wqh, POLLIN);
>> +}
>> +
>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
>> +{
>> +	struct eventfd_notifier *en;
>> +	int idx;
>> +
>> +	idx = srcu_read_lock(&ctx->srcu);
>> +
>> +	/*
>> +	 * The goal here is to allow the notification to be preemptible
>> +	 * as often as possible.  We cannot achieve this with the basic
>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
>> +	 * we have an internal srcu list mechanism of which the wqh is
>> +	 * a client.
>> +	 *
>> +	 * Not all paths will invoke this function in process context.
>> +	 * Callers should check for suitable state before assuming they
>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
>> +	 * the code within the critical section is also compatible.
>> +	 */
>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
>> +		en->ops->signal(en);
>> +
>> +	srcu_read_unlock(&ctx->srcu, idx);
>> +}
>> +
>>  /*
>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
>>   * success, or a value lower then "n" in case of coutner overflow.
>>     
>
> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
>
> Further, to do useful things it might not be enough that you can sleep:
> with iofd you also want to access current task with e.g. copy from user.
>
> Here's an idea: let's pass a flag to ->signal, along the lines of
> signal_is_task, that tells us that it is safe to use current, and add
> eventfd_signal_task() which is the same as eventfd_signal but lets everyone
> know that it's safe to both sleep and use current->mm.
>
> Makes sense?
>   

It does make sense, yes.  What I am not clear on is how would eventfd
detect this state such as to populate such flags, and why can't the
->signal() CB do the same?

Thanks Michael,
-Greg
Gregory Haskins June 16, 2009, 2:17 p.m. UTC | #3
Michael S. Tsirkin wrote:
> On Mon, Jun 15, 2009 at 10:29:56PM -0400, Gregory Haskins wrote:
>   
>> irqfd and its underlying implementation, eventfd, currently utilize
>> the embedded wait-queue in eventfd for signal notification.  The nice thing
>> about this design decision is that it re-uses the existing
>> eventfd/wait-queue code and it generally works well....with several
>> limitations.
>>
>> One of the limitations is that notification callbacks are always called
>> inside a spin_lock_irqsave critical section.  Another limitation is
>> that it is very difficult to build a system that can recieve release
>> notification without being racy.
>>
>> Therefore, we introduce a new registration interface that is SRCU based
>> instead of wait-queue based, and implement the internal wait-queue
>> infrastructure in terms of this new interface.  We then convert irqfd
>> to use this new interface instead of the existing wait-queue code.
>>
>> The end result is that we now have the opportunity to run the interrupt
>> injection code serially to the callback (when the signal is raised from
>> process-context, at least) instead of always deferring the injection to a
>> work-queue.
>>
>> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
>> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>> CC: Davide Libenzi <davidel@xmailserver.org>
>> ---
>>
>>  fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
>>  include/linux/eventfd.h |   30 ++++++++++++
>>  virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
>>  3 files changed, 188 insertions(+), 71 deletions(-)
>>
>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>> index 72f5f8d..505d5de 100644
>> --- a/fs/eventfd.c
>> +++ b/fs/eventfd.c
>> @@ -30,8 +30,47 @@ struct eventfd_ctx {
>>  	 */
>>  	__u64 count;
>>  	unsigned int flags;
>> +	struct srcu_struct srcu;
>> +	struct list_head nh;
>> +	struct eventfd_notifier notifier;
>>  };
>>  
>> +static void _eventfd_wqh_notify(struct eventfd_notifier *en)
>> +{
>> +	struct eventfd_ctx *ctx = container_of(en,
>> +					       struct eventfd_ctx,
>> +					       notifier);
>> +
>> +	if (waitqueue_active(&ctx->wqh))
>> +		wake_up_poll(&ctx->wqh, POLLIN);
>> +}
>> +
>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
>> +{
>> +	struct eventfd_notifier *en;
>> +	int idx;
>> +
>> +	idx = srcu_read_lock(&ctx->srcu);
>> +
>> +	/*
>> +	 * The goal here is to allow the notification to be preemptible
>> +	 * as often as possible.  We cannot achieve this with the basic
>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
>> +	 * we have an internal srcu list mechanism of which the wqh is
>> +	 * a client.
>> +	 *
>> +	 * Not all paths will invoke this function in process context.
>> +	 * Callers should check for suitable state before assuming they
>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
>> +	 * the code within the critical section is also compatible.
>> +	 */
>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
>> +		en->ops->signal(en);
>> +
>> +	srcu_read_unlock(&ctx->srcu, idx);
>> +}
>> +
>>  /*
>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
>>   * success, or a value lower then "n" in case of coutner overflow.
>>     
>
> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
>   

As an aside, this is something I would like to address.  I keep running
into this pattern where I could do something in-line if I had a
"can_sleep()" context.  Otherwise, I have to punt to something like a
workqueue which adds latency.  The closest thing I have to "can_sleep()"
is preemptible(), which, as you correctly pointed out is limited to only
working with CONFIG_PREEMPT=y.

Its been a while since I looked into it, but one of the barriers that
would need to be overcome is the fact that the preempt_count stuff gets
compiled away with CONFIG_PREEMPT=n.  It is conceivable that we could
make the preempt_count logic an independent config variable from
CONFIG_PREEMPT to provide a can_sleep() macro without requiring
full-blown preemption to be enabled.  So my questions would be as follows:

a) Is the community conducive to such an idea?
b) Are there other things to consider/fix besides the lack of
preempt_count in order to implement such a beast?

-Greg
Gregory Haskins June 16, 2009, 2:22 p.m. UTC | #4
[Adding Ingo]

Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
>   
>> On Mon, Jun 15, 2009 at 10:29:56PM -0400, Gregory Haskins wrote:
>>   
>>     
>>> irqfd and its underlying implementation, eventfd, currently utilize
>>> the embedded wait-queue in eventfd for signal notification.  The nice thing
>>> about this design decision is that it re-uses the existing
>>> eventfd/wait-queue code and it generally works well....with several
>>> limitations.
>>>
>>> One of the limitations is that notification callbacks are always called
>>> inside a spin_lock_irqsave critical section.  Another limitation is
>>> that it is very difficult to build a system that can recieve release
>>> notification without being racy.
>>>
>>> Therefore, we introduce a new registration interface that is SRCU based
>>> instead of wait-queue based, and implement the internal wait-queue
>>> infrastructure in terms of this new interface.  We then convert irqfd
>>> to use this new interface instead of the existing wait-queue code.
>>>
>>> The end result is that we now have the opportunity to run the interrupt
>>> injection code serially to the callback (when the signal is raised from
>>> process-context, at least) instead of always deferring the injection to a
>>> work-queue.
>>>
>>> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
>>> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>>> CC: Davide Libenzi <davidel@xmailserver.org>
>>> ---
>>>
>>>  fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
>>>  include/linux/eventfd.h |   30 ++++++++++++
>>>  virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
>>>  3 files changed, 188 insertions(+), 71 deletions(-)
>>>
>>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>>> index 72f5f8d..505d5de 100644
>>> --- a/fs/eventfd.c
>>> +++ b/fs/eventfd.c
>>> @@ -30,8 +30,47 @@ struct eventfd_ctx {
>>>  	 */
>>>  	__u64 count;
>>>  	unsigned int flags;
>>> +	struct srcu_struct srcu;
>>> +	struct list_head nh;
>>> +	struct eventfd_notifier notifier;
>>>  };
>>>  
>>> +static void _eventfd_wqh_notify(struct eventfd_notifier *en)
>>> +{
>>> +	struct eventfd_ctx *ctx = container_of(en,
>>> +					       struct eventfd_ctx,
>>> +					       notifier);
>>> +
>>> +	if (waitqueue_active(&ctx->wqh))
>>> +		wake_up_poll(&ctx->wqh, POLLIN);
>>> +}
>>> +
>>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
>>> +{
>>> +	struct eventfd_notifier *en;
>>> +	int idx;
>>> +
>>> +	idx = srcu_read_lock(&ctx->srcu);
>>> +
>>> +	/*
>>> +	 * The goal here is to allow the notification to be preemptible
>>> +	 * as often as possible.  We cannot achieve this with the basic
>>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
>>> +	 * we have an internal srcu list mechanism of which the wqh is
>>> +	 * a client.
>>> +	 *
>>> +	 * Not all paths will invoke this function in process context.
>>> +	 * Callers should check for suitable state before assuming they
>>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
>>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
>>> +	 * the code within the critical section is also compatible.
>>> +	 */
>>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
>>> +		en->ops->signal(en);
>>> +
>>> +	srcu_read_unlock(&ctx->srcu, idx);
>>> +}
>>> +
>>>  /*
>>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
>>>   * success, or a value lower then "n" in case of coutner overflow.
>>>     
>>>       
>> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
>>   
>>     
>
> As an aside, this is something I would like to address.  I keep running
> into this pattern where I could do something in-line if I had a
> "can_sleep()" context.  Otherwise, I have to punt to something like a
> workqueue which adds latency.  The closest thing I have to "can_sleep()"
> is preemptible(), which, as you correctly pointed out is limited to only
> working with CONFIG_PREEMPT=y.
>
> Its been a while since I looked into it, but one of the barriers that
> would need to be overcome is the fact that the preempt_count stuff gets
> compiled away with CONFIG_PREEMPT=n.  It is conceivable that we could
> make the preempt_count logic an independent config variable from
> CONFIG_PREEMPT to provide a can_sleep() macro without requiring
> full-blow preemption to be enabled.  So my questions would be as follows:
>
> a) Is the community conducive to such an idea?
> b) Are there other things to consider/fix besides the lack of
> preempt_count in order to implement such a beast?
>
> -Greg
>
>
>   
Hi Ingo,

Any thoughts here?

-Greg
Michael S. Tsirkin June 16, 2009, 2:38 p.m. UTC | #5
On Tue, Jun 16, 2009 at 10:11:08AM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > On Mon, Jun 15, 2009 at 10:29:56PM -0400, Gregory Haskins wrote:
> >   
> >> irqfd and its underlying implementation, eventfd, currently utilize
> >> the embedded wait-queue in eventfd for signal notification.  The nice thing
> >> about this design decision is that it re-uses the existing
> >> eventfd/wait-queue code and it generally works well....with several
> >> limitations.
> >>
> >> One of the limitations is that notification callbacks are always called
> >> inside a spin_lock_irqsave critical section.  Another limitation is
> >> that it is very difficult to build a system that can recieve release
> >> notification without being racy.
> >>
> >> Therefore, we introduce a new registration interface that is SRCU based
> >> instead of wait-queue based, and implement the internal wait-queue
> >> infrastructure in terms of this new interface.  We then convert irqfd
> >> to use this new interface instead of the existing wait-queue code.
> >>
> >> The end result is that we now have the opportunity to run the interrupt
> >> injection code serially to the callback (when the signal is raised from
> >> process-context, at least) instead of always deferring the injection to a
> >> work-queue.
> >>
> >> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> >> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> >> CC: Davide Libenzi <davidel@xmailserver.org>
> >> ---
> >>
> >>  fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
> >>  include/linux/eventfd.h |   30 ++++++++++++
> >>  virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
> >>  3 files changed, 188 insertions(+), 71 deletions(-)
> >>
> >> diff --git a/fs/eventfd.c b/fs/eventfd.c
> >> index 72f5f8d..505d5de 100644
> >> --- a/fs/eventfd.c
> >> +++ b/fs/eventfd.c
> >> @@ -30,8 +30,47 @@ struct eventfd_ctx {
> >>  	 */
> >>  	__u64 count;
> >>  	unsigned int flags;
> >> +	struct srcu_struct srcu;
> >> +	struct list_head nh;
> >> +	struct eventfd_notifier notifier;
> >>  };
> >>  
> >> +static void _eventfd_wqh_notify(struct eventfd_notifier *en)
> >> +{
> >> +	struct eventfd_ctx *ctx = container_of(en,
> >> +					       struct eventfd_ctx,
> >> +					       notifier);
> >> +
> >> +	if (waitqueue_active(&ctx->wqh))
> >> +		wake_up_poll(&ctx->wqh, POLLIN);
> >> +}
> >> +
> >> +static void _eventfd_notify(struct eventfd_ctx *ctx)
> >> +{
> >> +	struct eventfd_notifier *en;
> >> +	int idx;
> >> +
> >> +	idx = srcu_read_lock(&ctx->srcu);
> >> +
> >> +	/*
> >> +	 * The goal here is to allow the notification to be preemptible
> >> +	 * as often as possible.  We cannot achieve this with the basic
> >> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
> >> +	 * we have an internal srcu list mechanism of which the wqh is
> >> +	 * a client.
> >> +	 *
> >> +	 * Not all paths will invoke this function in process context.
> >> +	 * Callers should check for suitable state before assuming they
> >> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
> >> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
> >> +	 * the code within the critical section is also compatible.
> >> +	 */
> >> +	list_for_each_entry_rcu(en, &ctx->nh, list)
> >> +		en->ops->signal(en);
> >> +
> >> +	srcu_read_unlock(&ctx->srcu, idx);
> >> +}
> >> +
> >>  /*
> >>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
> >>   * success, or a value lower then "n" in case of coutner overflow.
> >>     
> >
> > This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
> >
> > Further, to do useful things it might not be enough that you can sleep:
> > with iofd you also want to access current task with e.g. copy from user.
> >
> > Here's an idea: let's pass a flag to ->signal, along the lines of
> > signal_is_task, that tells us that it is safe to use current, and add
> > eventfd_signal_task() which is the same as eventfd_signal but lets everyone
> > know that it's safe to both sleep and use current->mm.
> >
> > Makes sense?
> >   
> 
> It does make sense, yes.  What I am not clear on is how would eventfd
> detect this state such as to populate such flags, and why cant the
> ->signal() CB do the same?
> 
> Thanks Michael,
> -Greg
> 

eventfd can't detect this state. But the callers know in what context they are.
So the *caller* of eventfd_signal_task makes sure of this: if you are in a task,
you can call eventfd_signal_task() if not, you must call eventfd_signal.
Gregory Haskins June 16, 2009, 2:40 p.m. UTC | #6
Michael S. Tsirkin wrote:
> On Mon, Jun 15, 2009 at 10:29:56PM -0400, Gregory Haskins wrote:
>   
>> irqfd and its underlying implementation, eventfd, currently utilize
>> the embedded wait-queue in eventfd for signal notification.  The nice thing
>> about this design decision is that it re-uses the existing
>> eventfd/wait-queue code and it generally works well....with several
>> limitations.
>>
>> One of the limitations is that notification callbacks are always called
>> inside a spin_lock_irqsave critical section.  Another limitation is
>> that it is very difficult to build a system that can recieve release
>> notification without being racy.
>>
>> Therefore, we introduce a new registration interface that is SRCU based
>> instead of wait-queue based, and implement the internal wait-queue
>> infrastructure in terms of this new interface.  We then convert irqfd
>> to use this new interface instead of the existing wait-queue code.
>>
>> The end result is that we now have the opportunity to run the interrupt
>> injection code serially to the callback (when the signal is raised from
>> process-context, at least) instead of always deferring the injection to a
>> work-queue.
>>
>> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
>> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>> CC: Davide Libenzi <davidel@xmailserver.org>
>> ---
>>
>>  fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
>>  include/linux/eventfd.h |   30 ++++++++++++
>>  virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
>>  3 files changed, 188 insertions(+), 71 deletions(-)
>>
>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>> index 72f5f8d..505d5de 100644
>> --- a/fs/eventfd.c
>> +++ b/fs/eventfd.c
>> @@ -30,8 +30,47 @@ struct eventfd_ctx {
>>  	 */
>>  	__u64 count;
>>  	unsigned int flags;
>> +	struct srcu_struct srcu;
>> +	struct list_head nh;
>> +	struct eventfd_notifier notifier;
>>  };
>>  
>> +static void _eventfd_wqh_notify(struct eventfd_notifier *en)
>> +{
>> +	struct eventfd_ctx *ctx = container_of(en,
>> +					       struct eventfd_ctx,
>> +					       notifier);
>> +
>> +	if (waitqueue_active(&ctx->wqh))
>> +		wake_up_poll(&ctx->wqh, POLLIN);
>> +}
>> +
>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
>> +{
>> +	struct eventfd_notifier *en;
>> +	int idx;
>> +
>> +	idx = srcu_read_lock(&ctx->srcu);
>> +
>> +	/*
>> +	 * The goal here is to allow the notification to be preemptible
>> +	 * as often as possible.  We cannot achieve this with the basic
>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
>> +	 * we have an internal srcu list mechanism of which the wqh is
>> +	 * a client.
>> +	 *
>> +	 * Not all paths will invoke this function in process context.
>> +	 * Callers should check for suitable state before assuming they
>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
>> +	 * the code within the critical section is also compatible.
>> +	 */
>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
>> +		en->ops->signal(en);
>> +
>> +	srcu_read_unlock(&ctx->srcu, idx);
>> +}
>> +
>>  /*
>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
>>   * success, or a value lower then "n" in case of coutner overflow.
>>     
>
> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
>
> Further, to do useful things it might not be enough that you can sleep:
> with iofd you also want to access current task with e.g. copy from user.
>   

Something else to consider:  For iosignalfd, we can assume we will
always be called from vcpu process context so we might not really need
official affirmation from the system.  For irqfd, we cannot predict who
may be injecting the interrupt (for instance, it might be a
PCI-passthrough hard-irq context).  I am not sure if this knowledge
actually helps to simplify the problem space, but I thought I should
mention it.

-Greg
Michael S. Tsirkin June 16, 2009, 2:46 p.m. UTC | #7
On Tue, Jun 16, 2009 at 10:40:55AM -0400, Gregory Haskins wrote:
> > This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
> >
> > Further, to do useful things it might not be enough that you can sleep:
> > with iofd you also want to access current task with e.g. copy from user.
> >   
> 
> Something else to consider:  For iosignalfd, we can assume we will
> always be called from vcpu process context so we might not really need
> official affirmation from the system.  For irqfd, we cannot predict who
> may be injecting the interrupt (for instance, it might be a
> PCI-passthrough hard-irq context).  I am not sure if this knowledge
> actually helps to simplify the problem space, but I thought I should
> mention it.
> 
> -Greg
> 
> 

The way this is addressed with eventfd_signal_task proposal is:
- user calls eventfd_signal_task
  we look at current->mm and figure out whether this is the right
  context or we need a context switch
- everyone else calls eventfd_signal
  we know that we need a context switch


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins June 16, 2009, 2:48 p.m. UTC | #8
Michael S. Tsirkin wrote:
> On Tue, Jun 16, 2009 at 10:11:08AM -0400, Gregory Haskins wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Mon, Jun 15, 2009 at 10:29:56PM -0400, Gregory Haskins wrote:
>>>   
>>>       
>>>> irqfd and its underlying implementation, eventfd, currently utilize
>>>> the embedded wait-queue in eventfd for signal notification.  The nice thing
>>>> about this design decision is that it re-uses the existing
>>>> eventfd/wait-queue code and it generally works well....with several
>>>> limitations.
>>>>
>>>> One of the limitations is that notification callbacks are always called
>>>> inside a spin_lock_irqsave critical section.  Another limitation is
>>>> that it is very difficult to build a system that can recieve release
>>>> notification without being racy.
>>>>
>>>> Therefore, we introduce a new registration interface that is SRCU based
>>>> instead of wait-queue based, and implement the internal wait-queue
>>>> infrastructure in terms of this new interface.  We then convert irqfd
>>>> to use this new interface instead of the existing wait-queue code.
>>>>
>>>> The end result is that we now have the opportunity to run the interrupt
>>>> injection code serially to the callback (when the signal is raised from
>>>> process-context, at least) instead of always deferring the injection to a
>>>> work-queue.
>>>>
>>>> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
>>>> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>>>> CC: Davide Libenzi <davidel@xmailserver.org>
>>>> ---
>>>>
>>>>  fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
>>>>  include/linux/eventfd.h |   30 ++++++++++++
>>>>  virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
>>>>  3 files changed, 188 insertions(+), 71 deletions(-)
>>>>
>>>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>>>> index 72f5f8d..505d5de 100644
>>>> --- a/fs/eventfd.c
>>>> +++ b/fs/eventfd.c
>>>> @@ -30,8 +30,47 @@ struct eventfd_ctx {
>>>>  	 */
>>>>  	__u64 count;
>>>>  	unsigned int flags;
>>>> +	struct srcu_struct srcu;
>>>> +	struct list_head nh;
>>>> +	struct eventfd_notifier notifier;
>>>>  };
>>>>  
>>>> +static void _eventfd_wqh_notify(struct eventfd_notifier *en)
>>>> +{
>>>> +	struct eventfd_ctx *ctx = container_of(en,
>>>> +					       struct eventfd_ctx,
>>>> +					       notifier);
>>>> +
>>>> +	if (waitqueue_active(&ctx->wqh))
>>>> +		wake_up_poll(&ctx->wqh, POLLIN);
>>>> +}
>>>> +
>>>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
>>>> +{
>>>> +	struct eventfd_notifier *en;
>>>> +	int idx;
>>>> +
>>>> +	idx = srcu_read_lock(&ctx->srcu);
>>>> +
>>>> +	/*
>>>> +	 * The goal here is to allow the notification to be preemptible
>>>> +	 * as often as possible.  We cannot achieve this with the basic
>>>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
>>>> +	 * we have an internal srcu list mechanism of which the wqh is
>>>> +	 * a client.
>>>> +	 *
>>>> +	 * Not all paths will invoke this function in process context.
>>>> +	 * Callers should check for suitable state before assuming they
>>>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
>>>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
>>>> +	 * the code within the critical section is also compatible.
>>>> +	 */
>>>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
>>>> +		en->ops->signal(en);
>>>> +
>>>> +	srcu_read_unlock(&ctx->srcu, idx);
>>>> +}
>>>> +
>>>>  /*
>>>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
>>>>   * success, or a value lower then "n" in case of coutner overflow.
>>>>     
>>>>         
>>> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
>>>
>>> Further, to do useful things it might not be enough that you can sleep:
>>> with iofd you also want to access current task with e.g. copy from user.
>>>
>>> Here's an idea: let's pass a flag to ->signal, along the lines of
>>> signal_is_task, that tells us that it is safe to use current, and add
>>> eventfd_signal_task() which is the same as eventfd_signal but lets everyone
>>> know that it's safe to both sleep and use current->mm.
>>>
>>> Makes sense?
>>>   
>>>       
>> It does make sense, yes.  What I am not clear on is how would eventfd
>> detect this state such as to populate such flags, and why cant the
>> ->signal() CB do the same?
>>
>> Thanks Michael,
>> -Greg
>>
>>     
>
> eventfd can't detect this state. But the callers know in what context they are.
> So the *caller* of eventfd_signal_task makes sure of this: if you are in a task,
> you can call eventfd_signal_task() if not, you must call eventfd_signal.
>
>
>   
Hmm, this is an interesting idea, but I think it would be problematic in
real-world applications for the long-term.  For instance, the -rt tree
and irq-threads .config option in the process of merging into mainline
changes context types for established code.  Therefore, what might be
"hardirq/softirq" logic today may execute in a kthread tomorrow.  I
think its dangerous to try to solve the problem with caller provided
info:  the caller may be ignorant of its true state.  IMO, the ideal
solution needs to be something we can detect at run-time.

Thanks Michael,
-Greg
Gregory Haskins June 16, 2009, 2:54 p.m. UTC | #9
Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
>   
>> On Tue, Jun 16, 2009 at 10:11:08AM -0400, Gregory Haskins wrote:
>>   
>>     
>>> Michael S. Tsirkin wrote:
>>>     
>>>       
>>>> On Mon, Jun 15, 2009 at 10:29:56PM -0400, Gregory Haskins wrote:
>>>>   
>>>>       
>>>>         
>>>>> irqfd and its underlying implementation, eventfd, currently utilize
>>>>> the embedded wait-queue in eventfd for signal notification.  The nice thing
>>>>> about this design decision is that it re-uses the existing
>>>>> eventfd/wait-queue code and it generally works well....with several
>>>>> limitations.
>>>>>
>>>>> One of the limitations is that notification callbacks are always called
>>>>> inside a spin_lock_irqsave critical section.  Another limitation is
>>>>> that it is very difficult to build a system that can recieve release
>>>>> notification without being racy.
>>>>>
>>>>> Therefore, we introduce a new registration interface that is SRCU based
>>>>> instead of wait-queue based, and implement the internal wait-queue
>>>>> infrastructure in terms of this new interface.  We then convert irqfd
>>>>> to use this new interface instead of the existing wait-queue code.
>>>>>
>>>>> The end result is that we now have the opportunity to run the interrupt
>>>>> injection code serially to the callback (when the signal is raised from
>>>>> process-context, at least) instead of always deferring the injection to a
>>>>> work-queue.
>>>>>
>>>>> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
>>>>> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>>>>> CC: Davide Libenzi <davidel@xmailserver.org>
>>>>> ---
>>>>>
>>>>>  fs/eventfd.c            |  115 +++++++++++++++++++++++++++++++++++++++++++----
>>>>>  include/linux/eventfd.h |   30 ++++++++++++
>>>>>  virt/kvm/eventfd.c      |  114 +++++++++++++++++++++--------------------------
>>>>>  3 files changed, 188 insertions(+), 71 deletions(-)
>>>>>
>>>>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>>>>> index 72f5f8d..505d5de 100644
>>>>> --- a/fs/eventfd.c
>>>>> +++ b/fs/eventfd.c
>>>>> @@ -30,8 +30,47 @@ struct eventfd_ctx {
>>>>>  	 */
>>>>>  	__u64 count;
>>>>>  	unsigned int flags;
>>>>> +	struct srcu_struct srcu;
>>>>> +	struct list_head nh;
>>>>> +	struct eventfd_notifier notifier;
>>>>>  };
>>>>>  
>>>>> +static void _eventfd_wqh_notify(struct eventfd_notifier *en)
>>>>> +{
>>>>> +	struct eventfd_ctx *ctx = container_of(en,
>>>>> +					       struct eventfd_ctx,
>>>>> +					       notifier);
>>>>> +
>>>>> +	if (waitqueue_active(&ctx->wqh))
>>>>> +		wake_up_poll(&ctx->wqh, POLLIN);
>>>>> +}
>>>>> +
>>>>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
>>>>> +{
>>>>> +	struct eventfd_notifier *en;
>>>>> +	int idx;
>>>>> +
>>>>> +	idx = srcu_read_lock(&ctx->srcu);
>>>>> +
>>>>> +	/*
>>>>> +	 * The goal here is to allow the notification to be preemptible
>>>>> +	 * as often as possible.  We cannot achieve this with the basic
>>>>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
>>>>> +	 * we have an internal srcu list mechanism of which the wqh is
>>>>> +	 * a client.
>>>>> +	 *
>>>>> +	 * Not all paths will invoke this function in process context.
>>>>> +	 * Callers should check for suitable state before assuming they
>>>>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
>>>>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
>>>>> +	 * the code within the critical section is also compatible.
>>>>> +	 */
>>>>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
>>>>> +		en->ops->signal(en);
>>>>> +
>>>>> +	srcu_read_unlock(&ctx->srcu, idx);
>>>>> +}
>>>>> +
>>>>>  /*
>>>>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
>>>>>   * success, or a value lower then "n" in case of coutner overflow.
>>>>>     
>>>>>         
>>>>>           
>>>> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
>>>>
>>>> Further, to do useful things it might not be enough that you can sleep:
>>>> with iofd you also want to access current task with e.g. copy from user.
>>>>
>>>> Here's an idea: let's pass a flag to ->signal, along the lines of
>>>> signal_is_task, that tells us that it is safe to use current, and add
>>>> eventfd_signal_task() which is the same as eventfd_signal but lets everyone
>>>> know that it's safe to both sleep and use current->mm.
>>>>
>>>> Makes sense?
>>>>   
>>>>       
>>>>         
>>> It does make sense, yes.  What I am not clear on is how would eventfd
>>> detect this state such as to populate such flags, and why cant the
>>> ->signal() CB do the same?
>>>
>>> Thanks Michael,
>>> -Greg
>>>
>>>     
>>>       
>> eventfd can't detect this state. But the callers know in what context they are.
>> So the *caller* of eventfd_signal_task makes sure of this: if you are in a task,
>> you can call eventfd_signal_task() if not, you must call eventfd_signal.
>>
>>
>>   
>>     
> Hmm, this is an interesting idea, but I think it would be problematic in
> real-world applications for the long-term.  For instance, the -rt tree
> and irq-threads .config option in the process of merging into mainline
> changes context types for established code.  Therefore, what might be
> "hardirq/softirq" logic today may execute in a kthread tomorrow.  I
> think its dangerous to try to solve the problem with caller provided
> info:  the caller may be ignorant of its true state.

Also, we need to consider that a process context can still be in-atomic
if the user did something like disabled interrupts, preemption, used a
spinlock, etc, before calling the eventfd_signal_task() function. 
Perhaps we can put a stake in the ground that says you must not call
this from atomic context, but I still prefer just being able to detect
this from our state.

-Greg
Michael S. Tsirkin June 16, 2009, 2:55 p.m. UTC | #10
On Tue, Jun 16, 2009 at 10:48:27AM -0400, Gregory Haskins wrote:
> >>>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
> >>>> +{
> >>>> +	struct eventfd_notifier *en;
> >>>> +	int idx;
> >>>> +
> >>>> +	idx = srcu_read_lock(&ctx->srcu);
> >>>> +
> >>>> +	/*
> >>>> +	 * The goal here is to allow the notification to be preemptible
> >>>> +	 * as often as possible.  We cannot achieve this with the basic
> >>>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
> >>>> +	 * we have an internal srcu list mechanism of which the wqh is
> >>>> +	 * a client.
> >>>> +	 *
> >>>> +	 * Not all paths will invoke this function in process context.
> >>>> +	 * Callers should check for suitable state before assuming they
> >>>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
> >>>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
> >>>> +	 * the code within the critical section is also compatible.
> >>>> +	 */
> >>>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
> >>>> +		en->ops->signal(en);
> >>>> +
> >>>> +	srcu_read_unlock(&ctx->srcu, idx);
> >>>> +}
> >>>> +
> >>>>  /*
> >>>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
> >>>>   * success, or a value lower then "n" in case of coutner overflow.
> >>>>     
> >>>>         
> >>> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
> >>>
> >>> Further, to do useful things it might not be enough that you can sleep:
> >>> with iofd you also want to access current task with e.g. copy from user.
> >>>
> >>> Here's an idea: let's pass a flag to ->signal, along the lines of
> >>> signal_is_task, that tells us that it is safe to use current, and add
> >>> eventfd_signal_task() which is the same as eventfd_signal but lets everyone
> >>> know that it's safe to both sleep and use current->mm.
> >>>
> >>> Makes sense?
> >>>   
> >>>       
> >> It does make sense, yes.  What I am not clear on is how would eventfd
> >> detect this state such as to populate such flags, and why cant the
> >> ->signal() CB do the same?
> >>
> >> Thanks Michael,
> >> -Greg
> >>
> >>     
> >
> > eventfd can't detect this state. But the callers know in what context they are.
> > So the *caller* of eventfd_signal_task makes sure of this: if you are in a task,
> > you can call eventfd_signal_task() if not, you must call eventfd_signal.
> >
> >
> >   
> Hmm, this is an interesting idea, but I think it would be problematic in
> real-world applications for the long-term.  For instance, the -rt tree
> and irq-threads .config option in the process of merging into mainline
> changes context types for established code.  Therefore, what might be
> "hardirq/softirq" logic today may execute in a kthread tomorrow.

That's OK, it's always safe to call eventfd_signal: eventfd_signal_task is just
an optimization. I think everyone not in the context of a system call or vmexit
can just call eventfd_signal_task.

>  I
> think its dangerous to try to solve the problem with caller provided
> info:  the caller may be ignorant of its true state.

I assume this wasn't clear enough: the idea is that you only
calls eventfd_signal_task if you know you are on a systemcall path.
If you are ignorant of the state, call eventfd_signal.

>  IMO, the ideal
> solution needs to be something we can detect at run-time.
> 
> Thanks Michael,
> -Greg
>
Michael S. Tsirkin June 16, 2009, 3:16 p.m. UTC | #11
On Tue, Jun 16, 2009 at 10:54:28AM -0400, Gregory Haskins wrote:
> >>>>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
> >>>>> +{
> >>>>> +	struct eventfd_notifier *en;
> >>>>> +	int idx;
> >>>>> +
> >>>>> +	idx = srcu_read_lock(&ctx->srcu);
> >>>>> +
> >>>>> +	/*
> >>>>> +	 * The goal here is to allow the notification to be preemptible
> >>>>> +	 * as often as possible.  We cannot achieve this with the basic
> >>>>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
> >>>>> +	 * we have an internal srcu list mechanism of which the wqh is
> >>>>> +	 * a client.
> >>>>> +	 *
> >>>>> +	 * Not all paths will invoke this function in process context.
> >>>>> +	 * Callers should check for suitable state before assuming they
> >>>>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
> >>>>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
> >>>>> +	 * the code within the critical section is also compatible.
> >>>>> +	 */
> >>>>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
> >>>>> +		en->ops->signal(en);
> >>>>> +
> >>>>> +	srcu_read_unlock(&ctx->srcu, idx);
> >>>>> +}
> >>>>> +
> >>>>>  /*
> >>>>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
> >>>>>   * success, or a value lower then "n" in case of coutner overflow.
> >>>>>     
> >>>>>         
> >>>>>           
> >>>> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
> >>>>
> >>>> Further, to do useful things it might not be enough that you can sleep:
> >>>> with iofd you also want to access current task with e.g. copy from user.
> >>>>
> >>>> Here's an idea: let's pass a flag to ->signal, along the lines of
> >>>> signal_is_task, that tells us that it is safe to use current, and add
> >>>> eventfd_signal_task() which is the same as eventfd_signal but lets everyone
> >>>> know that it's safe to both sleep and use current->mm.
> >>>>
> >>>> Makes sense?
> >>>>   
> >>>>       
> >>>>         
> >>> It does make sense, yes.  What I am not clear on is how would eventfd
> >>> detect this state such as to populate such flags, and why cant the
> >>> ->signal() CB do the same?
> >>>
> >>> Thanks Michael,
> >>> -Greg
> >>>
> >>>     
> >>>       
> >> eventfd can't detect this state. But the callers know in what context they are.
> >> So the *caller* of eventfd_signal_task makes sure of this: if you are in a task,
> >> you can call eventfd_signal_task() if not, you must call eventfd_signal.
> >>
> >>
> >>   
> >>     
> > Hmm, this is an interesting idea, but I think it would be problematic in
> > real-world applications for the long-term.  For instance, the -rt tree
> > and irq-threads .config option in the process of merging into mainline
> > changes context types for established code.  Therefore, what might be
> > "hardirq/softirq" logic today may execute in a kthread tomorrow.  I
> > think its dangerous to try to solve the problem with caller provided
> > info:  the caller may be ignorant of its true state.
> 
> Also, we need to consider that a process context can still be in-atomic
> if the user did something like disabled interrupts, preemption, used a
> spinlock, etc, before calling the eventfd_signal_task() function. 
> Perhaps we can put a stake in the ground that says you must not call
> this from atomic context,

That's the ticket.

> but I still prefer just being able to detect
> this from our state.
> 
> -Greg
> 
>
Gregory Haskins June 16, 2009, 3:20 p.m. UTC | #12
Michael S. Tsirkin wrote:
> On Tue, Jun 16, 2009 at 10:48:27AM -0400, Gregory Haskins wrote:
>   
>>>>>> +static void _eventfd_notify(struct eventfd_ctx *ctx)
>>>>>> +{
>>>>>> +	struct eventfd_notifier *en;
>>>>>> +	int idx;
>>>>>> +
>>>>>> +	idx = srcu_read_lock(&ctx->srcu);
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * The goal here is to allow the notification to be preemptible
>>>>>> +	 * as often as possible.  We cannot achieve this with the basic
>>>>>> +	 * wqh mechanism because it requires the wqh->lock.  Therefore
>>>>>> +	 * we have an internal srcu list mechanism of which the wqh is
>>>>>> +	 * a client.
>>>>>> +	 *
>>>>>> +	 * Not all paths will invoke this function in process context.
>>>>>> +	 * Callers should check for suitable state before assuming they
>>>>>> +	 * can sleep (such as with preemptible()).  Paul McKenney assures
>>>>>> +	 * me that srcu_read_lock is compatible with in-atomic, as long as
>>>>>> +	 * the code within the critical section is also compatible.
>>>>>> +	 */
>>>>>> +	list_for_each_entry_rcu(en, &ctx->nh, list)
>>>>>> +		en->ops->signal(en);
>>>>>> +
>>>>>> +	srcu_read_unlock(&ctx->srcu, idx);
>>>>>> +}
>>>>>> +
>>>>>>  /*
>>>>>>   * Adds "n" to the eventfd counter "count". Returns "n" in case of
>>>>>>   * success, or a value lower then "n" in case of coutner overflow.
>>>>>>     
>>>>>>         
>>>>>>             
>>>>> This is ugly, isn't it? With CONFIG_PREEMPT=no preemptible() is always false.
>>>>>
>>>>> Further, to do useful things it might not be enough that you can sleep:
>>>>> with iofd you also want to access current task with e.g. copy from user.
>>>>>
>>>>> Here's an idea: let's pass a flag to ->signal, along the lines of
>>>>> signal_is_task, that tells us that it is safe to use current, and add
>>>>> eventfd_signal_task() which is the same as eventfd_signal but lets everyone
>>>>> know that it's safe to both sleep and use current->mm.
>>>>>
>>>>> Makes sense?
>>>>>   
>>>>>       
>>>>>           
>>>> It does make sense, yes.  What I am not clear on is how would eventfd
>>>> detect this state such as to populate such flags, and why cant the
>>>> ->signal() CB do the same?
>>>>
>>>> Thanks Michael,
>>>> -Greg
>>>>
>>>>     
>>>>         
>>> eventfd can't detect this state. But the callers know in what context they are.
>>> So the *caller* of eventfd_signal_task makes sure of this: if you are in a task,
>>> you can call eventfd_signal_task() if not, you must call eventfd_signal.
>>>
>>>
>>>   
>>>       
>> Hmm, this is an interesting idea, but I think it would be problematic in
>> real-world applications for the long-term.  For instance, the -rt tree
>> and irq-threads .config option in the process of merging into mainline
>> changes context types for established code.  Therefore, what might be
>> "hardirq/softirq" logic today may execute in a kthread tomorrow.
>>     
>
> That's OK, it's always safe to call eventfd_signal: eventfd_signal_task is just
> an optimization. I think everyone not in the context of a system call or vmexit
> can just call eventfd_signal_task.
>   
                                 ^^^^^^^^^^^^^^^^^^^^

I assume you meant s/eventfd_signal_task/eventfd_signal there?

>   
>>  I
>> think its dangerous to try to solve the problem with caller provided
>> info:  the caller may be ignorant of its true state.
>>     
>
> I assume this wasn't clear enough: the idea is that you only
> calls eventfd_signal_task if you know you are on a systemcall path.
> If you are ignorant of the state, call eventfd_signal.
>   

Well, it's not a matter of correctness.  It's more for optimal
performance.  If I have PCI pass-through injecting interrupts from
hardirq in mainline, clearly eventfd_signal() is proper.  In -rt, the
hardirq is transparently converted to a kthread, so technically
eventfd_signal_task() would work (at least for the can_sleep() part, not
for current->mm per se).  But in this case, the PCI logic would not know
it was converted to a kthread.  It all happens transparently in the
low-level code and the pci code is unmodified.

In this case, your proposal would have the passthrough path invoking
irqfd with eventfd_signal().  It  would therefore still shunt to a
workqueue to inject the interrupt, even though it would have been
perfectly fine to just inject it directly because taking
mutex_lock(&kvm->irq_lock) is legal.  Perhaps I am over-optimizing, but
this is the scenario I am concerned about and what I was trying to
address with preemptible()/can_sleep().

I think your idea is a good one to address the current->mm portion.  It
would only ever be safe to access the MM context from syscall/vmexit
context, as you point out.  Therefore, I see no problem with
implementing something like iosignalfd with eventfd_signal_task().

But accessing current->mm is only a subset of the use-cases.  The other
use-cases would include the ability to sleep, and possibly the ability
to address other->mm.  For these latter cases, I really only need the
"can_sleep()" behavior, not the full blown "can_access_current_mm()". 
Additionally, the eventfd_signal_task() data at least for iosignalfd is
superfluous:  I already know that I can access current->mm by virtue of
the design.

So since I cannot use it accurately for the hardirq/threaded-irq type
case, and I don't actually need it for the iosignalfd case, I am not
sure it's the right direction (at least for now).  I do think it might
have merit for syscal/vmexit uses outside of iosignalfd, but I do not
see a current use-case for it so perhaps it can wait until one arises.

-Greg

>   
>>  IMO, the ideal
>> solution needs to be something we can detect at run-time.
>>
>> Thanks Michael,
>> -Greg
>>
>>     
>
>
>
Michael S. Tsirkin June 16, 2009, 3:41 p.m. UTC | #13
On Tue, Jun 16, 2009 at 11:20:18AM -0400, Gregory Haskins wrote:
> >>> eventfd can't detect this state. But the callers know in what context they are.
> >>> So the *caller* of eventfd_signal_task makes sure of this: if you are in a task,
> >>> you can call eventfd_signal_task() if not, you must call eventfd_signal.
> >>>
> >>>
> >>>   
> >>>       
> >> Hmm, this is an interesting idea, but I think it would be problematic in
> >> real-world applications for the long-term.  For instance, the -rt tree
> >> and irq-threads .config option in the process of merging into mainline
> >> changes context types for established code.  Therefore, what might be
> >> "hardirq/softirq" logic today may execute in a kthread tomorrow.
> >>     
> >
> > That's OK, it's always safe to call eventfd_signal: eventfd_signal_task is just
> > an optimization. I think everyone not in the context of a system call or vmexit
> > can just call eventfd_signal_task.
> >   
>                                  ^^^^^^^^^^^^^^^^^^^^
> 
> I assume you meant s/eventfd_signal_task/eventfd_signal there?

Yea. Sorry.

> >   
> >>  I
> >> think its dangerous to try to solve the problem with caller provided
> >> info:  the caller may be ignorant of its true state.
> >>     
> >
> > I assume this wasn't clear enough: the idea is that you only
> > calls eventfd_signal_task if you know you are on a systemcall path.
> > If you are ignorant of the state, call eventfd_signal.
> >   
> 
> Well, its not a matter of correctness.  Its more for optimal
> performance.  If I have PCI pass-though injecting interrupts from
> hardirq in mainline, clearly eventfd_signal() is proper.  In -rt, the
> hardirq is transparently converted to a kthread, so technically
> eventfd_signal_task() would work

I think it's wrong to sleep for a long time while handling interrupts even if
you technically can with threaded interrupts. For that matter, I think you can
sleep while within a spinlock if preempt is on, but that does not mean you
should - and I think you will get warnings in the log if you do. No?

> (at least for the can_sleep() part, not
> for current->mm per se).  But in this case, the PCI logic would not know
> it was converted to a kthread.  It all happens transparently in the
> low-level code and the pci code is unmodified.
> 
> In this case, your proposal would have the passthrough path invoking
> irqfd with eventfd_signal().  It  would therefore still shunt to a
> workqueue to inject the interrupt, even though it would have been
> perfectly fine to just inject it directly because taking
> mutex_lock(&kvm->irq_lock) is legal.

This specific issue should just be addressed by making it possible
to inject the interrupt from an atomic context.

>  Perhaps I am over-optimizing, but
> this is the scenario I am concerned about and what I was trying to
> address with preemptible()/can_sleep().

What, a path that is never invoked without threaded interrupts?
Yes, I would say it currently looks like an over-optimization.

> I think your idea is a good one to address the current->mm portion.  It
> would only ever be safe to access the MM context from syscall/vmexit
> context, as you point out.  Therefore, I see no problem with
> implementing something like iosignalfd with eventfd_signal_task().
> 
> But accessing current->mm is only a subset of the use-cases.  The other
> use-cases would include the ability to sleep, and possibly the ability
> to address other->mm.  For these latter cases, I really only need the
> "can_sleep()" behavior, not the full blown "can_access_current_mm()". 
> Additionally, the eventfd_signal_task() data at least for iosignalfd is
> superfluous:  I already know that I can access current->mm by virtue of
> the design.

Maybe I misunderstand what iosignalfd is - isn't it true that you get eventfd
and kvm will signal that when guest performs io write to a specific
address, and then drivers can get eventfd and poll it to detect
these writes?

If yes, you have no way to know that the other end of eventfd
is connected to kvm, so you don't know you can access current->mm.

> So since I cannot use it accurately for the hardirq/threaded-irq type
> case, and I don't actually need it for the iosignalfd case, I am not
> sure its the right direction (at least for now).  I do think it might
> have merit for syscal/vmexit uses outside of iosignalfd, but I do not
> see a current use-case for it so perhaps it can wait until one arises.
> 
> -Greg

But, it addresses the CONFIG_PREEMPT off case, which your design doesn't
seem to.

> >   
> >>  IMO, the ideal
> >> solution needs to be something we can detect at run-time.
> >>
> >> Thanks Michael,
> >> -Greg
> >>
> >>     
> >
> >
> >   
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins June 16, 2009, 4:17 p.m. UTC | #14
Michael S. Tsirkin wrote:
> On Tue, Jun 16, 2009 at 11:20:18AM -0400, Gregory Haskins wrote:
>   
>>>>> eventfd can't detect this state. But the callers know in what context they are.
>>>>> So the *caller* of eventfd_signal_task makes sure of this: if you are in a task,
>>>>> you can call eventfd_signal_task() if not, you must call eventfd_signal.
>>>>>
>>>>>
>>>>>   
>>>>>       
>>>>>           
>>>> Hmm, this is an interesting idea, but I think it would be problematic in
>>>> real-world applications for the long-term.  For instance, the -rt tree
>>>> and irq-threads .config option in the process of merging into mainline
>>>> changes context types for established code.  Therefore, what might be
>>>> "hardirq/softirq" logic today may execute in a kthread tomorrow.
>>>>     
>>>>         
>>> That's OK, it's always safe to call eventfd_signal: eventfd_signal_task is just
>>> an optimization. I think everyone not in the context of a system call or vmexit
>>> can just call eventfd_signal_task.
>>>   
>>>       
>>                                  ^^^^^^^^^^^^^^^^^^^^
>>
>> I assume you meant s/eventfd_signal_task/eventfd_signal there?
>>     
>
> Yea. Sorry.
>   

np.  I knew what you meant ;)
>   
>>>   
>>>       
>>>>  I
>>>> think its dangerous to try to solve the problem with caller provided
>>>> info:  the caller may be ignorant of its true state.
>>>>     
>>>>         
>>> I assume this wasn't clear enough: the idea is that you only
>>> calls eventfd_signal_task if you know you are on a systemcall path.
>>> If you are ignorant of the state, call eventfd_signal.
>>>   
>>>       
>> Well, its not a matter of correctness.  Its more for optimal
>> performance.  If I have PCI pass-though injecting interrupts from
>> hardirq in mainline, clearly eventfd_signal() is proper.  In -rt, the
>> hardirq is transparently converted to a kthread, so technically
>> eventfd_signal_task() would work
>>     
>
> I think it's wrong to sleep for a long time while handling interrupts even if
> you technically can with threaded interrupts.

Well, this is somewhat of an orthogonal issue so I don't want to open
this can of worms per se.

But, one of the long-term goals of the threaded-irq construct is to
eliminate the need for the traditional "bh" contexts to do work.  The
idea, of course, is that the threaded-irq context can do all of the work
traditionally shunted to tasklets/softirqs/workqueues directly, so why
bother switching contexts.  So, the short answer is that its not
necessarily wrong to sleep or to do significant processing from a
threaded-irq.

In any case, threaded-irqs are just one example.  I will try to
highlight others below.

>  For that matter, I think you can
> sleep while within a spinlock if preempt is on
Yes, in fact the spinlocks are mutexes in this mode, so the locks
themselves can sleep.


> , but that does not mean you
> should - and I think you will get warnings in the log if you do. No?
>
>   
Nope, sleeping is fine (voluntary or involuntary).  The idea is it's all
governed by priority, and priority-inheritance, and a scheduler that is
free to make fine-grained decisions (due to the broadly preemptible
kernel).  But this is getting off-topic so I will stop.

>> (at least for the can_sleep() part, not
>> for current->mm per se).  But in this case, the PCI logic would not know
>> it was converted to a kthread.  It all happens transparently in the
>> low-level code and the pci code is unmodified.
>>
>> In this case, your proposal would have the passthrough path invoking
>> irqfd with eventfd_signal().  It  would therefore still shunt to a
>> workqueue to inject the interrupt, even though it would have been
>> perfectly fine to just inject it directly because taking
>> mutex_lock(&kvm->irq_lock) is legal.
>>     
>
> This specific issue should just be addressed by making it possible
> to inject the interrupt from an atomic context.
>   

I assume you mean
s/mutex_lock(&kvm->irq_lock)/spin_lock(&kvm->irq_lock)?  If so, I agree
this is a good idea.  TBH: I am more concerned about the iosignalfd path
w.r.t. the preemptibility of the callback.  We can optimize the virtio-net
interface, for instance, once we make this ->signal() callback
preemptible.  Having irqfd ->signal() preemptible is just a bonus, but
we could work around it by fixing irq_lock as well, I agree.

>   
>>  Perhaps I am over-optimizing, but
>> this is the scenario I am concerned about and what I was trying to
>> address with preemptible()/can_sleep().
>>     
>
> What, a path that is never invoked without threaded interrupts?
> Yes, I would say it currently looks like an over-optimization.
>   

You are only seeing part of the picture though.  This is a cascading
pattern.
>   
>> I think your idea is a good one to address the current->mm portion.  It
>> would only ever be safe to access the MM context from syscall/vmexit
>> context, as you point out.  Therefore, I see no problem with
>> implementing something like iosignalfd with eventfd_signal_task().
>>
>> But accessing current->mm is only a subset of the use-cases.  The other
>> use-cases would include the ability to sleep, and possibly the ability
>> to address other->mm.  For these latter cases, I really only need the
>> "can_sleep()" behavior, not the full blown "can_access_current_mm()". 
>> Additionally, the eventfd_signal_task() data at least for iosignalfd is
>> superfluous:  I already know that I can access current->mm by virtue of
>> the design.
>>     
>
> Maybe I misunderstand what iosignalfd is - isn't it true that you get eventfd
> and kvm will signal that when guest performs io write to a specific
> address, and then drivers can get eventfd and poll it to detect
> these writes?
>   

Correct.

> If yes, you have no way to know that the other end of eventfd
> is connected to kvm, so you don't know you can access current->mm.
>   

Well, my intended use for them *does* know it's connected to KVM. 
Perhaps there will be others that do not in the future, but like I said
it could be addressed as those actually arise.


>   
>> So since I cannot use it accurately for the hardirq/threaded-irq type
>> case, and I don't actually need it for the iosignalfd case, I am not
>> sure its the right direction (at least for now).  I do think it might
>> have merit for syscal/vmexit uses outside of iosignalfd, but I do not
>> see a current use-case for it so perhaps it can wait until one arises.
>>
>> -Greg
>>     
>
> But, it addresses the CONFIG_PREEMPT off case, which your design doesn't
> seem to.
>   

Well, the problem is that it only addresses it partially in a limited
set of circumstances, and doesn't address the broader problem.  I'll
give you an example:

(First off, lets assume that we are not going to have
eventfd_signal_task(), but rather eventfd_signal() with two option
flags: EVENTFD_SIGNAL_CANSLEEP, and EVENTFD_SIGNAL_CURRENTVALID

Today vbus-enet has a rx-thread and a tx-thread at least partially
because I need process-context to do the copy_to_user(other->mm) type
stuff (and we will need similar for our virtio-net backend as well).  I
also utilize irqfd to do interrupt injection.  Now, since I know that I
have kthread_context, I could modify vbus-enet to inject interrupts with
EVENTFD_SIGNAL_CANSLEEP set, and this is fine.  I know that is safe.

However, the problem is above that!  I would like to optimize out the
tx-thread to begin with, using a similar "can_sleep()" pattern whenever
possible.

For instance, if the netif stack calls me to do a transmit and it
happens to be in a sleepable context, it would be nice to just skip
waking up the tx-thread.  I would therefore just do the
copy_to_user(other->mm) + irqfd directly in the netif callback, thereby
skipping the context switch.

So the problem is a pattern that I would like to address generally
outside of just eventfd: "can I be sleepy"?  If yes, do it now, if not
defer it.

So if the stack calls me in a preemptible state, I would like to detect
that and optimize my deferment mechanisms away.  This isn't something
that happens readily today given the way the stack's locking and
softirq-net work, but it's moving in that direction (for instance,
threaded softirqs).

This is why I am pushing for a run-time detection mechanism (like
can_sleep()), as opposed to something in the eventfd interface level
(like EVENTFD_SIGNAL_CANSLEEP).  I think at least the CURRENTVALID idea
is a great one, I just don't have a current need for it.  That said, I
do not have a problem with modifying eventfd to provide such information
if Davide et. al. ack it as well.

Does this all make sense?

-Greg
Davide Libenzi June 16, 2009, 4:19 p.m. UTC | #15
On Tue, 16 Jun 2009, Gregory Haskins wrote:

> Does this all make sense?

This conversation has been *really* long, and I haven't had time to look 
at the patch yet. But looking at the amount of changes, and the amount of 
even more changes talked in this thread, there's a very slim chance that 
I'll ACK the eventfd code.
You may want to consider a solution that does not litter eventfd code that 
much.


- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins June 16, 2009, 5:01 p.m. UTC | #16
Davide Libenzi wrote:
> On Tue, 16 Jun 2009, Gregory Haskins wrote:
>
>   
>> Does this all make sense?
>>     
>
> This conversation has been *really* long, and I haven't had time to look 
> at the patch yet. But looking at the amount of changes, and the amount of 
> even more changes talked in this thread, there's a very slim chance that 
> I'll ACK the eventfd code.
> You may want to consider a solution that does not litter eventfd code that 
> much.
>
>
> - Davide
>
>
>   
Hi Davide,

I understand your position and value your time/insight into looking at
these things.

Despite the current ongoing discussion, I still stand that the current
patch is my proposed solution (though I have yet to convince Michael). 
But in any case,  if you have the time, please look it over because I
still think its the right direction to head in.

The general solution is that we use an srcu list instead of the
wait-queue, and thats really it.  If we can't eliminate that spinlock
held over the notification, it has usability implications at least for
irqfd/iosignalfd.  The only way I can think of to solve the problem
without modifying eventfd is to not use eventfd at all.   :(

Since using eventfd really captures the concept we are going for here
really well, reusing it has a ton of advantages including interface
compatibility and, of course, code-reuse of a tested/debugged code
base.  Heck, we may hopefully even improve eventfd for other users in
doing this work.  It would therefore be a shame to walk away from it if
it can be avoided.

So if what I proposed is not acceptable but you are willing to work with
me to find a solution that is, that would be ideal from my perspective. 
Otherwise, I apologize for all the noise.  You have been quite the
patient and helpful gentleman with me to date and I appreciate that.

Kind Regards,
-Greg
Michael S. Tsirkin June 16, 2009, 5:54 p.m. UTC | #17
On Tue, Jun 16, 2009 at 12:17:22PM -0400, Gregory Haskins wrote:
> > Maybe I misunderstand what iosignalfd is - isn't it true that you get eventfd
> > and kvm will signal that when guest performs io write to a specific
> > address, and then drivers can get eventfd and poll it to detect
> > these writes?
> >   
> 
> Correct.
> 
> > If yes, you have no way to know that the other end of eventfd
> > is connected to kvm, so you don't know you can access current->mm.
> >   
> 
> Well, my intended use for them *does* know its connected to KVM. 

How does it know? You get a file descriptor and you even figure out it's an
eventfd. Now what? Any process can write there.

> Perhaps there will be others that do not in the future, but like I said
> it could be addressed as those actually arise.
> 
> 
> >   
> >> So since I cannot use it accurately for the hardirq/threaded-irq type
> >> case, and I don't actually need it for the iosignalfd case, I am not
> >> sure its the right direction (at least for now).  I do think it might
> >> have merit for syscal/vmexit uses outside of iosignalfd, but I do not
> >> see a current use-case for it so perhaps it can wait until one arises.
> >>
> >> -Greg
> >>     
> >
> > But, it addresses the CONFIG_PREEMPT off case, which your design doesn't
> > seem to.
> >   
> 
> Well, the problem is that it only addresses it partially in a limited
> set of circumstances, and doesn't address the broader problem.  I'll
> give you an example:
> 
> (First off, lets assume that we are not going to have
> eventfd_signal_task(), but rather eventfd_signal() with two option
> flags: EVENTFD_SIGNAL_CANSLEEP, and EVENTFD_SIGNAL_CURRENTVALID
> 
> Today vbus-enet has a rx-thread and a tx-thread at least partially
> because I need process-context to do the copy_to_user(other->mm) type
> stuff (and we will need similar for our virtio-net backend as well).  I
> also utilize irqfd to do interrupt injection.  Now, since I know that I
> have kthread_context, I could modify vbus-enet to inject interrupts with
> EVENTFD_SIGNAL_CANSLEEP set, and this is fine.  I know that is safe.
> 
> However, the problem is above that!  I would like to optimize out the
> tx-thread to begin with with a similar "can_sleep()" pattern whenever
> possible.
> 
> For instance, if the netif stack calls me to do a transmit and it
> happens to be in a sleepable context, it would be nice to just skip
> waking up the tx-thread.  I would therefore just do the
> copy_to_user(other->mm) + irqfd directly in the netif callback, thereby
> skipping the context switch.

What do you mean by copy_to_user(other->mm) here?  If you are going to switch
to another mm, then I think current->mm must be valid (I think it's not enough
that you can sleep). So preemptible() might not be enough.
Gregory Haskins June 16, 2009, 6:09 p.m. UTC | #18
Michael S. Tsirkin wrote:
> On Tue, Jun 16, 2009 at 12:17:22PM -0400, Gregory Haskins wrote:
>   
>>> Maybe I misunderstand what iosignalfd is - isn't it true that you get eventfd
>>> and kvm will signal that when guest performs io write to a specific
>>> address, and then drivers can get eventfd and poll it to detect
>>> these writes?
>>>   
>>>       
>> Correct.
>>
>>     
>>> If yes, you have no way to know that the other end of eventfd
>>> is connected to kvm, so you don't know you can access current->mm.
>>>   
>>>       
>> Well, my intended use for them *does* know its connected to KVM. 
>>     
>
> How does it know? You get a file descriptor and you even figure out it's an
> eventfd. Now what? Any process can write there.
>   

Well, I suppose that is true.  Unlike my current hypercall coupling
deployed in vbus v3, anyone can signal the event for the iosignalfd. 
However, it wouldn't matter other than looking like an errant signal as
I do not use "current" for anything. (e.g. all the virtio pointers are
stored independently to the iosignalfd)
>   
>> Perhaps there will be others that do not in the future, but like I said
>> it could be addressed as those actually arise.
>>
>>
>>     
>>>   
>>>       
>>>> So since I cannot use it accurately for the hardirq/threaded-irq type
>>>> case, and I don't actually need it for the iosignalfd case, I am not
>>>> sure its the right direction (at least for now).  I do think it might
>>>> have merit for syscal/vmexit uses outside of iosignalfd, but I do not
>>>> see a current use-case for it so perhaps it can wait until one arises.
>>>>
>>>> -Greg
>>>>     
>>>>         
>>> But, it addresses the CONFIG_PREEMPT off case, which your design doesn't
>>> seem to.
>>>   
>>>       
>> Well, the problem is that it only addresses it partially in a limited
>> set of circumstances, and doesn't address the broader problem.  I'll
>> give you an example:
>>
>> (First off, lets assume that we are not going to have
>> eventfd_signal_task(), but rather eventfd_signal() with two option
>> flags: EVENTFD_SIGNAL_CANSLEEP, and EVENTFD_SIGNAL_CURRENTVALID
>>
>> Today vbus-enet has a rx-thread and a tx-thread at least partially
>> because I need process-context to do the copy_to_user(other->mm) type
>> stuff (and we will need similar for our virtio-net backend as well).  I
>> also utilize irqfd to do interrupt injection.  Now, since I know that I
>> have kthread_context, I could modify vbus-enet to inject interrupts with
>> EVENTFD_SIGNAL_CANSLEEP set, and this is fine.  I know that is safe.
>>
>> However, the problem is above that!  I would like to optimize out the
>> tx-thread to begin with with a similar "can_sleep()" pattern whenever
>> possible.
>>
>> For instance, if the netif stack calls me to do a transmit and it
>> happens to be in a sleepable context, it would be nice to just skip
>> waking up the tx-thread.  I would therefore just do the
>> copy_to_user(other->mm) + irqfd directly in the netif callback, thereby
>> skipping the context switch.
>>     
>
> What do you mean by copy_to_user(other->mm) here?  If you are going to switch
> to another mm, then I think current->mm must be valid (I think it's not enough
> that you can sleep). So preemptible() might not be enough.
>   

I dont currently use switch_mm, if that is what you mean.  I save the
task_struct into the appropriate context so current->mm doesn't matter
to me.  I never use it.  All I need (afaik) is to acquire the proper
mutex first.  I am not an MM expert, so perhaps I have this wrong but it
does appear to work properly even from kthread context.

-Greg
Michael S. Tsirkin June 17, 2009, 2:45 p.m. UTC | #19
On Tue, Jun 16, 2009 at 02:09:38PM -0400, Gregory Haskins wrote:
> > What do you mean by copy_to_user(other->mm) here?  If you are going to switch
> > to another mm, then I think current->mm must be valid (I think it's not enough
> > that you can sleep). So preemptible() might not be enough.
> >   
> 
> I dont currently use switch_mm, if that is what you mean.  I save the
> task_struct into the appropriate context so current->mm doesn't matter
> to me.  I never use it.  All I need (afaik) is to acquire the proper
> mutex first.  I am not an MM expert, so perhaps I have this wrong but it
> does appear to work properly even from kthread context.
> 
> -Greg
> 
> 

I think I saw get_user_pages + memcpy in your patch. Yes, that works
without switching contexts but it's slower than copy to user if you are
in the right context, and AFAIK it's slower than get_user_pages_fast.
Gregory Haskins June 17, 2009, 3:02 p.m. UTC | #20
Michael S. Tsirkin wrote:
> On Tue, Jun 16, 2009 at 02:09:38PM -0400, Gregory Haskins wrote:
>   
>>> What do you mean by copy_to_user(other->mm) here?  If you are going to switch
>>> to another mm, then I think current->mm must be valid (I think it's not enough
>>> that you can sleep). So preemptible() might not be enough.
>>>   
>>>       
>> I dont currently use switch_mm, if that is what you mean.  I save the
>> task_struct into the appropriate context so current->mm doesn't matter
>> to me.  I never use it.  All I need (afaik) is to acquire the proper
>> mutex first.  I am not an MM expert, so perhaps I have this wrong but it
>> does appear to work properly even from kthread context.
>>
>> -Greg
>>
>>
>>     
>
> I think I saw get_user_pages + memcpy in your patch. Yes, that works
> without switching contexts but it's slower than copy to user if you are
> in the right context, and AFAIK it's slower than get_user_pages_fast.
>
>   
Yeah, understood.  It will definitely be interesting to try that
optimization with switch_mm that you suggested.

On that front, would "if (p == current)" still work even if we don't
have the "signal_task()" hint?
Michael S. Tsirkin June 17, 2009, 4:25 p.m. UTC | #21
On Wed, Jun 17, 2009 at 11:02:06AM -0400, Gregory Haskins wrote:
> Michael S. Tsirkin wrote:
> > On Tue, Jun 16, 2009 at 02:09:38PM -0400, Gregory Haskins wrote:
> >   
> >>> What do you mean by copy_to_user(other->mm) here?  If you are going to switch
> >>> to another mm, then I think current->mm must be valid (I think it's not enough
> >>> that you can sleep). So preemptible() might not be enough.
> >>>   
> >>>       
> >> I dont currently use switch_mm, if that is what you mean.  I save the
> >> task_struct into the appropriate context so current->mm doesn't matter
> >> to me.  I never use it.  All I need (afaik) is to acquire the proper
> >> mutex first.  I am not an MM expert, so perhaps I have this wrong but it
> >> does appear to work properly even from kthread context.
> >>
> >> -Greg
> >>
> >>
> >>     
> >
> > I think I saw get_user_pages + memcpy in your patch. Yes, that works
> > without switching contexts but it's slower than copy to user if you are
> > in the right context, and AFAIK it's slower than get_user_pages_fast.
> >
> >   
> Yeah, understood.  It will definitely be interesting to try that
> optimization with switch_mm that you suggested.

BTW, I'm kind of confused - in your patch you do get_task_struct: does this
guarantee that mm is not going away?

> On that front, would "if (p == current)" still work even if we don't
> have the "signal_task()" hint?
> 

Donnu.
Davide Libenzi June 17, 2009, 4:38 p.m. UTC | #22
On Tue, 16 Jun 2009, Gregory Haskins wrote:

> Davide Libenzi wrote:
> > On Tue, 16 Jun 2009, Gregory Haskins wrote:
> >
> >   
> >> Does this all make sense?
> >>     
> >
> > This conversation has been *really* long, and I haven't had time to look 
> > at the patch yet. But looking at the amount of changes, and the amount of 
> > even more changes talked in this thread, there's a very slim chance that 
> > I'll ACK the eventfd code.
> > You may want to consider a solution that does not litter eventfd code that 
> > much.
> >
> >
> > - Davide
> >
> >
> >   
> Hi Davide,
> 
> I understand your position and value your time/insight into looking at
> this things.
> 
> Despite the current ongoing discussion, I still stand that the current
> patch is my proposed solution (though I have yet to convince Michael). 
> But in any case,  if you have the time, please look it over because I
> still think its the right direction to head in.

I don't think so. You basically upload a bunch of stuff that could have been 
inside your irqfd into eventfd. Now the eventfd_signal() can magically 
sleep, or not, depending on what the signal functions do. This makes up a 
pretty awful interface if you ask me.
A lot simpler and cleaner if eventfd_signal(), like all the wake up 
functions inside the kernel, can be called from atomic context. Always, 
not sometimes.



- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins June 17, 2009, 4:41 p.m. UTC | #23
Michael S. Tsirkin wrote:
> On Wed, Jun 17, 2009 at 11:02:06AM -0400, Gregory Haskins wrote:
>   
>> Michael S. Tsirkin wrote:
>>     
>>> On Tue, Jun 16, 2009 at 02:09:38PM -0400, Gregory Haskins wrote:
>>>   
>>>       
>>>>> What do you mean by copy_to_user(other->mm) here?  If you are going to switch
>>>>> to another mm, then I think current->mm must be valid (I think it's not enough
>>>>> that you can sleep). So preemptible() might not be enough.
>>>>>   
>>>>>       
>>>>>           
>>>> I dont currently use switch_mm, if that is what you mean.  I save the
>>>> task_struct into the appropriate context so current->mm doesn't matter
>>>> to me.  I never use it.  All I need (afaik) is to acquire the proper
>>>> mutex first.  I am not an MM expert, so perhaps I have this wrong but it
>>>> does appear to work properly even from kthread context.
>>>>
>>>> -Greg
>>>>
>>>>
>>>>     
>>>>         
>>> I think I saw get_user_pages + memcpy in your patch. Yes, that works
>>> without switching contexts but it's slower than copy to user if you are
>>> in the right context, and AFAIK it's slower than get_user_pages_fast.
>>>
>>>   
>>>       
>> Yeah, understood.  It will definitely be interesting to try that
>> optimization with switch_mm that you suggested.
>>     
>
> BTW, I'm kind of confused - in your patch you do get_task_struct: does this
> guarantee that mm is not going aways?
>   

Well, again, I am not an expert on MM, but I would think so.  If p holds
a reference to p->mm (which I would think it would have to), and we hold
a reference to p, p->mm should be indirectly stable coincident with the
lifetime of our p reference.  OTOH, I have not actually looked at
whether p actually takes a p->mm reference or not via inspection, so
this is just an educated guess at this point. ;)

I guess if this is broken, the solution is simple:  Modify the code to
take an mm reference as well.

-Greg
Gregory Haskins June 17, 2009, 5:28 p.m. UTC | #24
Hi Davide,

Davide Libenzi wrote:
> On Tue, 16 Jun 2009, Gregory Haskins wrote:
>
>   
>> Davide Libenzi wrote:
>>     
>>> On Tue, 16 Jun 2009, Gregory Haskins wrote:
>>>
>>>   
>>>       
>>>> Does this all make sense?
>>>>     
>>>>         
>>> This conversation has been *really* long, and I haven't had time to look 
>>> at the patch yet. But looking at the amount of changes, and the amount of 
>>> even more changes talked in this thread, there's a very slim chance that 
>>> I'll ACK the eventfd code.
>>> You may want to consider a solution that does not litter eventfd code that 
>>> much.
>>>
>>>
>>> - Davide
>>>
>>>
>>>   
>>>       
>> Hi Davide,
>>
>> I understand your position and value your time/insight into looking at
>> this things.
>>
>> Despite the current ongoing discussion, I still stand that the current
>> patch is my proposed solution (though I have yet to convince Michael). 
>> But in any case,  if you have the time, please look it over because I
>> still think its the right direction to head in.
>>     
>
> I don't think so. You basically upload a bunch of stuff it could have been 
> inside your irqfd into eventfd.

Can you elaborate?  I currently do not see how I could do the proposed
concept inside of irqfd while still using eventfd.  Of course, that
would be possible if we fork irqfd from eventfd,  and perhaps this is
what you are proposing.  As previously stated I don't want to give up on
the prospect of re-using it quite yet, so bear with me. :)

The issue with eventfd, as I see it, is that eventfd uses a
spin_lock_irqsave (by virtue of the wait-queue stuff) across the
"signal" callback (which today is implemented as a wake-up).  This
spin_lock implicitly creates a non-preemptible critical section that
occurs independently of whether eventfd_signal() itself is invoked from
a sleepable context or not.

What I strive to achieve is to remove the creation of this internal
critical section.  If eventfd_signal() is called from atomic context, so
be it.  We will detect this in the callback and be forced to take the
slow-path, and I am ok with that.  *But*, if eventfd_signal() (or
f_ops->write(), for that matter) are called from a sleepable context
*and* eventfd doesn't introduce its own critical section (such as with
my srcu patch), we can potentially optimize within the callback by
executing serially instead of deferring (e.g. via a workqueue).

(Note: The issue of changing eventfd_signal interface is a separate
tangent that Michael proposed, and is not something I am advocating.  In
the current proposal, eventfd_signal() retains its exact semantics as it
has in mainline).

>  Now the eventfd_signal() can magically 
> sleep, or not, depending on what the signal functions do. This makes up a 
> pretty aweful interface if you ask me.
> A lot simpler and cleaner if eventfd_signal(), like all the wake up 
> functions inside the kernel, can be called from atomic context. Always, 
> not sometimes.
>   

It can!  :) This is not changing from whats in mainline today (covered
above).

Thanks Davide,
-Greg
Davide Libenzi June 17, 2009, 5:44 p.m. UTC | #25
On Wed, 17 Jun 2009, Gregory Haskins wrote:

> Can you elaborate?  I currently do not see how I could do the proposed
> concept inside of irqfd while still using eventfd.  Of course, that
> would be possible if we fork irqfd from eventfd,  and perhaps this is
> what you are proposing.  As previously stated I don't want to give up on
> the prospect of re-using it quite yet, so bear with me. :)
> 
> The issue with eventfd, as I see it, is that eventfd uses a
> spin_lock_irqsave (by virtue of the wait-queue stuff) across the
> "signal" callback (which today is implemented as a wake-up).  This
> spin_lock implicitly creates a non-preemptible critical section that
> occurs independently of whether eventfd_signal() itself is invoked from
> a sleepable context or not.
> 
> What I strive to achieve is to remove the creation of this internal
> critical section.  If eventfd_signal() is called from atomic context, so
> be it.  We will detect this in the callback and be forced to take the
> slow-path, and I am ok with that.  *But*, if eventfd_signal() (or
> f_ops->write(), for that matter) are called from a sleepable context
> *and* eventfd doesn't introduce its own critical section (such as with
> my srcu patch), we can potentially optimize within the callback by
> executing serially instead of deferring (e.g. via a workqueue).

Since when the scheduling (assuming it's not permanently running on 
another core due to high frequency work post) of a kernel thread is such 
a big impact that interfaces need to be redesigned for that?
How much the (possible, but not certain) kernel thread context switch time 
weighs in the overall KVM IRQ service time?



> It can!  :) This is not changing from whats in mainline today (covered
> above).

It can/could, if the signal() function takes very accurate care of doing 
the magic atomic check.



- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins June 17, 2009, 7:17 p.m. UTC | #26
Davide Libenzi wrote:
> On Wed, 17 Jun 2009, Gregory Haskins wrote:
>
>   
>> Can you elaborate?  I currently do not see how I could do the proposed
>> concept inside of irqfd while still using eventfd.  Of course, that
>> would be possible if we fork irqfd from eventfd,  and perhaps this is
>> what you are proposing.  As previously stated I don't want to give up on
>> the prospect of re-using it quite yet, so bear with me. :)
>>
>> The issue with eventfd, as I see it, is that eventfd uses a
>> spin_lock_irqsave (by virtue of the wait-queue stuff) across the
>> "signal" callback (which today is implemented as a wake-up).  This
>> spin_lock implicitly creates a non-preemptible critical section that
>> occurs independently of whether eventfd_signal() itself is invoked from
>> a sleepable context or not.
>>
>> What I strive to achieve is to remove the creation of this internal
>> critical section.  If eventfd_signal() is called from atomic context, so
>> be it.  We will detect this in the callback and be forced to take the
>> slow-path, and I am ok with that.  *But*, if eventfd_signal() (or
>> f_ops->write(), for that matter) are called from a sleepable context
>> *and* eventfd doesn't introduce its own critical section (such as with
>> my srcu patch), we can potentially optimize within the callback by
>> executing serially instead of deferring (e.g. via a workqueue).
>>     
>
> Since when the scheduling (assuming it's not permanently running on 
> another core due to high frequency work post) of a kernel thread is such 
> a big impact that interfaces need to be redesigned for that?
>   

Low-latency applications, for instance, care about this.  Even one
context switch can add a substantial overhead.

> How much the (possible, but not certain) kernel thread context switch time 
> weighs in the overall KVM IRQ service time?
>   

Generally each one is costing me about 7us on average.  For something
like high-speed networking, we have a path that has about 30us of
base-line overhead.  So one additional ctx-switch puts me at base+7 ( =
~37us), two puts me in base+2*7 (= ~44us).  So in that context (no pun
intended ;), it hurts quite a bit.  I'll be the first to admit that not
everyone (most?) will care about latency, though.  But FWIW, I do.

>
>
>   
>> It can!  :) This is not changing from whats in mainline today (covered
>> above).
>>     
>
> It can/could, if the signal() function takes very accurate care of doing 
> the magic atomic check.
>   

True, but that's the notifiee's burden, not eventfd's.  And it's always
going to be opt-in.  Even today, someone is free to either try to sleep
(which will oops on the might_sleep()), or try to check if they can
sleep (it will always say they can't due to the eventfd wqh spinlock). 
The only thing that changes with my patch is that someone that opts in
to check if they can sleep may find that they sometimes (mostly?) can.

In any case, I suspect that there are no other clients of eventfd other
than standard wait-queue sleepers, and irqfd.  The wake_up() code is
never going to care anyway, so this really comes down to future users of
the notification interfaces (irqfd today, iosignalfd in the near-term). 
Therefore, there really aren't any legacy issues to deal with that I am
aware of, even if they mattered.

Thanks Davide,
-Greg
Davide Libenzi June 17, 2009, 7:50 p.m. UTC | #27
On Wed, 17 Jun 2009, Gregory Haskins wrote:

> Davide Libenzi wrote:
> 
> > How much the (possible, but not certain) kernel thread context switch time 
> > weighs in the overall KVM IRQ service time?
> >   
> 
> Generally each one is costing me about 7us on average.  For something
> like high-speed networking, we have a path that has about 30us of
> base-line overhead.  So one additional ctx-switch puts me at base+7 ( =
> ~37us), two puts me in base+2*7 (= ~44us).  So in that context (no pun
> intended ;), it hurts quite a bit.  I'll be the first to admit that not
> everyone (most?) will care about latency, though.  But FWIW, I do.

And how a frame reception is handled in Linux nowadays?



> True, but thats the notifiee's burden, not eventfd's.  And its always
> going to be opt-in.  Even today, someone is free to either try to sleep
> (which will oops on the might_sleep()), ...

No, today you just can't sleep. As you can't sleep in any 
callback-registered wakeups, like epoll, for example.



- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins June 17, 2009, 9:48 p.m. UTC | #28
Davide Libenzi wrote:
> On Wed, 17 Jun 2009, Gregory Haskins wrote:
>
>   
>> Davide Libenzi wrote:
>>
>>     
>>> How much the (possible, but not certain) kernel thread context switch time 
>>> weighs in the overall KVM IRQ service time?
>>>   
>>>       
>> Generally each one is costing me about 7us on average.  For something
>> like high-speed networking, we have a path that has about 30us of
>> base-line overhead.  So one additional ctx-switch puts me at base+7 ( =
>> ~37us), two puts me in base+2*7 (= ~44us).  So in that context (no pun
>> intended ;), it hurts quite a bit.  I'll be the first to admit that not
>> everyone (most?) will care about latency, though.  But FWIW, I do.
>>     
>
> And how a frame reception is handled in Linux nowadays?
>   

I am not clear on what you are asking here.  So in case this was a
sincere question on how things work, here is a highlight of the typical
flow of a packet that ingresses on a physical adapter and routes to KVM
via vbus.

a) interrupt from eth to host wakes the cpu out of idle, enters
interrupt-context.
b) ISR runs, disables interrupts on the eth, schedules softirq-net-rx
c) ISR completes, kernel checks softirq state before IRET, runs pending
softirq-net-rx in interrupt context to NAPI-poll the ethernet
d) packet is pulled out of eth into a layer-2 bridge, and switched to
the appropriate switch-port (which happens to be a venet-tap (soon to be
virtio-net based) device.  The packet is queued to the tap as an xmit
request, and the tap's tx-thread is awoken.
e) the xmit call returns, the napi-poll completes, and the
softirq-net-rx terminates.  The kernel does an IRET to exit interrupt
context.
f) the scheduler runs and sees the tap's tx-thread is ready to run,
schedules it in.
g) the tx-thread awakens, dequeues the posted skb, copies it to the
virtio-ring, and finally raises an interrupt on irqfd with eventfd_signal().

At this point, all of the data has been posted to the virtio-ring in
shared memory between the host and guest.  All that is left is to inject
the interrupt so the guest knows to process the ring.  We call the
eventfd_signal() from kthread context.  However, the callback to inject
the interrupt is invoked with the wqh spinlock held so we are forced to
defer the interrupt injection to a workqueue so the kvm->lock can be
safely acquired.  This adds additional latency (~7us) in a path that is
only a handful of microseconds to begin with.  I can post LTTV
screenshots, if it would be helpful to visualize this.

The reasons we need the workqueue is:

A) kvm is currently implemented with a mutex for IRQ injection, so its
sleepy
B) the wqh used in eventfd wake_up() creates an non-preemptible section
when the callback is invoked.

We can (and should) address (A), but this is but one example in a common
pattern.  We will run into similar issues in other places (such as with
iosignalfd), so I would like to address (B) as well.  My patch attempts
to do that by re-implementing the callback mechanism as something other
than wait-queue based. It then adds a wait-queue as a default client of
the new interface.

Therefore, the eventfd clients that want traditional vanilla wakeups can
go right ahead and use the non-preemptible wait-queue methods as they
always have.  However, the clients that want (potentially) preemptive
callbacks can use the new interface: eventfd_notifier_register(), armed
with the knowledge that it can only sleep if the eventfd_signal() caller
was not in-atomic at the time.  Thus the dynamic state check ala
preemptible().  Without the check, the client should assume it is not
preemptible, as always.
>
>
>   
>> True, but thats the notifiee's burden, not eventfd's.  And its always
>> going to be opt-in.  Even today, someone is free to either try to sleep
>> (which will oops on the might_sleep()), ...
>>     
>
> No, today you just can't sleep. As you can't sleep in any 
> callback-registered wakeups, like epoll, for example.
>   

Right, understood, and note that this is precisely why I said it would
oops.  What I was specifically trying to highlight is that its not like
this change imposes new requirements on the existing callbacks.  I also
wanted to highlight that its not really eventfd's concern what the
callback tries to do, per se (if it wants to sleep it can try, it just
wont work).  Any reasonable wakeup callback in existence would already
assume it cannot sleep, and they wouldn't even try to begin with.

On the other hand, what I am introducing here (specifically to eventfd
callbacks, not wait-queues [*]) is the possibility of removing this
restriction under the proper circumstances.  It would only be apparent
of this change iff the callback in question tried to test for this (e.g.
checking preemptible()) state.  It is thus opt-in, and existing code
does not break.

Thanks Davide,
-Greg

[*]: I contemplated solving this problem generally at the wait-queue
level, but quickly abandoned the idea.  The reason is that something
like *RCU has the properties of being particularly lightweight in the
fast/common/read-path, but being particularly heavyweight in the
slow/uncommon/writer-path.  Contrast that with something like a
traditional lock/mutex which is middleweight in all circumstances.

Something like a wait-queue has a heavy dose of both "writers" (tasks
being added to the queue) as well as "readers" (walking the queue to
invoke callbacks).  RCU would be particularly painful here.  Contrast
this to the way I wrote the patch for eventfd:  The "writer" path is
pretty rare ("add me to be notified" generally happens once at eventfd
init), whereas the "reader" path ("tell me when someone signals") is
fairly common.  RCU is a good candidate here to get rid of the spinlock
held during the walk, while still maintaining thread-safety w.r.t. the
notifier list.  Therefore I think updating eventfd makes more sense than
the more general case of wait-queues.  More importantly, I do not think
this compromises the integrity of eventfd.  Its applicable for a general
signaling idiom, IMHO, and its why I am comfortable pushing the patch
out to you.
Davide Libenzi June 17, 2009, 11:21 p.m. UTC | #29
On Wed, 17 Jun 2009, Gregory Haskins wrote:

> I am not clear on what you are asking here.  So in case this was a
> sincere question on how things work, here is a highlight of the typical
> flow of a packet that ingresses on a physical adapter and routes to KVM
> via vbus.
> 
> a) interrupt from eth to host wakes the cpu out of idle, enters
> interrupt-context.
> b) ISR runs, disables interrupts on the eth, schedules softirq-net-rx
> c) ISR completes, kernel checks softirq state before IRET, runs pending
> softirq-net-rx in interrupt context to NAPI-poll the ethernet
> d) packet is pulled out of eth into a layer-2 bridge, and switched to
> the appropriate switch-port (which happens to be a venet-tap (soon to be
> virtio-net based) device.  The packet is queued to the tap as an xmit
> request, and the tap's tx-thread is awoken.
> e) the xmit call returns, the napi-poll completes, and the
> softirq-net-rx terminates.  The kernel does an IRET to exit interrupt
> context.
> f) the scheduler runs and sees the tap's tx-thread is ready to run,
> schedules it in.
> g) the tx-thread awakens, dequeues the posted skb, copies it to the
> virtio-ring, and finally raises an interrupt on irqfd with eventfd_signal().

Heh, Gregory, this isn't a job interview. You didn't have to actually 
detail everything ;)  Glad you did though, so we've something to talk 
later.



> At this point, all of the data has been posted to the virtio-ring in
> shared memory between the host and guest.  All that is left is to inject
> the interrupt so the guest knows to process the ring.  We call the
> eventfd_signal() from kthread context.  However, the callback to inject
> the interrupt is invoked with the wqh spinlock held so we are forced to
> defer the interrupt injection to a workqueue so the kvm->lock can be
> safely acquired.  This adds additional latency (~7us) in a path that is
> only a handful of microseconds to begin with.  I can post LTTV
> screenshots, if it would be helpful to visualize this.

So, what you're trying to say, is that the extra wakeup required by your 
schedule_work() processing, makes actually a difference in the time it 
takes to go from a) to g), where there are at least two other kernel 
thread wakeups?
Don't think in terms of microbenchs, think in how much are those 7us (are 
they? really? this is a sync, on-cpu, kernel thread, wake up) are 
impacting the whole path.
Everything looks worthwhile in microbenches.




> Right, understood, and note that this is precisely why I said it would
> oops.  What I was specifically trying to highlight is that its not like
> this change imposes new requirements on the existing callbacks.  I also
> wanted to highlight that its not really eventfd's concern what the
> callback tries to do, per se (if it wants to sleep it can try, it just
> wont work).  Any reasonable wakeup callback in existence would already
> assume it cannot sleep, and they wouldn't even try to begin with.
> 
> On the other hand, what I am introducing here (specifically to eventfd
> callbacks, not wait-queues [*]) is the possibility of removing this
> restriction under the proper circumstances.  It would only be apparent
> of this change iff the callback in question tried to test for this (e.g.
> checking preemptible()) state.  It is thus opt-in, and existing code
> does not break.

The interface is just ugly IMO. You have eventfd_signal() that can sleep, 
or not, depending on the registered ->signal() function implementations.
This is pretty bad, a lot worse than the theoretical us spent in the 
schedule_work() processing.



- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin June 18, 2009, 6:23 a.m. UTC | #30
On Wed, Jun 17, 2009 at 04:21:19PM -0700, Davide Libenzi wrote:
> The interface is just ugly IMO. You have eventfd_signal() that can sleep, 
> or not, depending on the registered ->signal() function implementations.
> This is pretty bad, a lot worse than the theoretical us spent in the 
> schedule_work() processing.

I agree. How about the idea of introducing eventfd_signal_from_task
which can sleep? Does this sound sane?
Avi Kivity June 18, 2009, 9:03 a.m. UTC | #31
On 06/16/2009 05:40 PM, Gregory Haskins wrote:
>
> Something else to consider:  For iosignalfd, we can assume we will
> always be called from vcpu process context so we might not really need
> official affirmation from the system.  For irqfd, we cannot predict who
> may be injecting the interrupt (for instance, it might be a
> PCI-passthrough hard-irq context).  I am not sure if this knowledge
> actually helps to simplify the problem space, but I thought I should
> mention it.
>    

We can't assume anything.  Userspace might hand out that eventfd to anyone.
Gregory Haskins June 18, 2009, 11:43 a.m. UTC | #32
Avi Kivity wrote:
> On 06/16/2009 05:40 PM, Gregory Haskins wrote:
>>
>> Something else to consider:  For iosignalfd, we can assume we will
>> always be called from vcpu process context so we might not really need
>> official affirmation from the system.  For irqfd, we cannot predict who
>> may be injecting the interrupt (for instance, it might be a
>> PCI-passthrough hard-irq context).  I am not sure if this knowledge
>> actually helps to simplify the problem space, but I thought I should
>> mention it.
>>    
>
> We can't assume anything.  Userspace might hand out that eventfd to
> anyone.

Yeah agreed, and I misspoke (mistyped? ;).  It's not that I assume its
from a vcpu.  Its that it doesn't matter (at least for vbus).

The reason is that the memory context is established in vbus via a
different channel (i.e. I never look at "current" in the context of the
eventfd callback).  The eventfd is just telling me that the previously
established memory context needs to be looked at (e.g. the virtio-ring
changed).  Therefore, the origin of the signal doesn't actually matter
(at least w.r.t. safe handling of the request).

Obviously, the question of whether something has really changed in the
memory and whether an errant signal could cause problems is a separate
issue.  But we need to be robust against that anyway, for a multitude of
other reasons (e.g. malicious/broken guest scribbling over the
virtio-ring, etc).

-Greg
Gregory Haskins June 18, 2009, 2:01 p.m. UTC | #33
Davide Libenzi wrote:
> On Wed, 17 Jun 2009, Gregory Haskins wrote:
>
>   
>> I am not clear on what you are asking here.  So in case this was a
>> sincere question on how things work, here is a highlight of the typical
>> flow of a packet that ingresses on a physical adapter and routes to KVM
>> via vbus.
>>
>> a) interrupt from eth to host wakes the cpu out of idle, enters
>> interrupt-context.
>> b) ISR runs, disables interrupts on the eth, schedules softirq-net-rx
>> c) ISR completes, kernel checks softirq state before IRET, runs pending
>> softirq-net-rx in interrupt context to NAPI-poll the ethernet
>> d) packet is pulled out of eth into a layer-2 bridge, and switched to
>> the appropriate switch-port (which happens to be a venet-tap (soon to be
>> virtio-net based) device.  The packet is queued to the tap as an xmit
>> request, and the tap's tx-thread is awoken.
>> e) the xmit call returns, the napi-poll completes, and the
>> softirq-net-rx terminates.  The kernel does an IRET to exit interrupt
>> context.
>> f) the scheduler runs and sees the tap's tx-thread is ready to run,
>> schedules it in.
>> g) the tx-thread awakens, dequeues the posted skb, copies it to the
>> virtio-ring, and finally raises an interrupt on irqfd with eventfd_signal().
>>     
>
> Heh, Gregory, this isn't a job interview. You didn't have to actually 
> detail everything ;)  Glad you did though, so we've something to talk 
> later.
>   

:)

>
>
>   
>> At this point, all of the data has been posted to the virtio-ring in
>> shared memory between the host and guest.  All that is left is to inject
>> the interrupt so the guest knows to process the ring.  We call the
>> eventfd_signal() from kthread context.  However, the callback to inject
>> the interrupt is invoked with the wqh spinlock held so we are forced to
>> defer the interrupt injection to a workqueue so the kvm->lock can be
>> safely acquired.  This adds additional latency (~7us) in a path that is
>> only a handful of microseconds to begin with.  I can post LTTV
>> screenshots, if it would be helpful to visualize this.
>>     
>
> So, what you're trying to say, is that the extra wakeup required by your 
> schedule_work() processing, makes actually a difference in the time it 
> takes to go from a) to g),

Yes

>  where there are at least two other kernel 
> thread wakeups?
>   
Actually there is only one (the tx-thread) aside from the eventfd
imposed workqueue one.  Incidentally, I would love to get rid of the
other thread too, so I am not just picking on eventfd ;).  The other is
a lot harder since it has to update the virtio-ring and may need to page
in guest memory to do so.


> Don't think in terms of microbenchs,

I'm not.  This goes directly to end-application visible performance. 
These types of bottlenecks directly affect the IOPS rate in quantifiable
ways of the subsystem in question (and once I re-stablize the vbus tree
against the latest kvm/*fd code, I will post some numbers).  This is
particularly true for request-response type operations where stack
latency is the key gating factor.  Consider things like high-performance
shared-nothing clusters to a ramdisk (yes, these exist and have
relevance).  The overhead of the subsystem can be very low, and is
usually gated by the transaction throughput, which is gated by the IOPS
rate of the medium.  A 7us bump when we are only talking about dozens of
microseconds overall is a huge percentage.  Other examples might be
high-resolution clock sync (ala ptpd) or FSI trading applications.

The ultimate goal here is to get something like a KVM guest on par with
baremetal to allow the convergence of the HPC space with the
data-center/cloud trend of utilizing VMs as the compute fabric. 
Baremetal doesn't have a similar context switch in its stack, and these
little microsecond bumps here and there are the reason why we are not
quite at baremetal levels today with KVM.  Therefore, I am working
through trying to eliminate them.

To flip it around on you: try telling a group like the netdev guys that
they should put extra context switches into the stack because they don't
really matter.  Be sure to wear extra thick asbestos. ;)

>  think in how much are those 7us (are 
> they? really? this is a sync, on-cpu, kernel thread, wake up) are 
> impacting the whole path.
> Everything looks worthwhile in microbenches.
>
>
>
>
>   
>> Right, understood, and note that this is precisely why I said it would
>> oops.  What I was specifically trying to highlight is that its not like
>> this change imposes new requirements on the existing callbacks.  I also
>> wanted to highlight that its not really eventfd's concern what the
>> callback tries to do, per se (if it wants to sleep it can try, it just
>> wont work).  Any reasonable wakeup callback in existence would already
>> assume it cannot sleep, and they wouldn't even try to begin with.
>>
>> On the other hand, what I am introducing here (specifically to eventfd
>> callbacks, not wait-queues [*]) is the possibility of removing this
>> restriction under the proper circumstances.  It would only be apparent
>> of this change iff the callback in question tried to test for this (e.g.
>> checking preemptible()) state.  It is thus opt-in, and existing code
>> does not break.
>>     
>
> The interface is just ugly IMO.

Well, everyone is of course entitled to an opinion, but with all due
respect I think you are looking at this backwards. ;)

>  You have eventfd_signal() that can sleep, 
> or not, depending on the registered ->signal() function implementations.
> This is pretty bad, a lot worse than the theoretical us spent in the 
> schedule_work() processing.
>   

I wouldn't describe it that way.  Whether eventfd_signal() sleeps or not
even under your control as it is.  Really what we have is an interface
(eventfd_signal()) that must support being invoked from atomic-context. 
As an interface designer, you should never be saying "can this sleep",
but rather "what contexts is it legal to call this interface".   You
have already spec'd that eventfd_signal() is atomic-safe, and I am not
proposing to change that.  It is, and always will be (unless we screw
up) atomic safe.

Consider kmalloc(GFP_KERNEL), and kmalloc(GFP_ATOMIC).  The former says
you must call this from process context, and the latter says you may
call it from atomic context.  If you think about it, this is technically
orthogonal to whether it can sleep or not, even though people have
become accustomed to associating atomic == cant-sleep, because today
that is true (at least in mainline).  As a counter example, in -rt,
atomic context *is* process context, and kmalloc(GFP_ATOMIC) can in fact
sleep.  But all the code still works unmodified because -rt is still
ensuring that the interface contract is met: it works in atomic-context
(its just that atomic-context is redefined ;)

So, all that aside: I restate that eventfd should *not* care about what
its clients do, as long as they meet the requirements of the interface. 
In this case, the requirement is that they need to work from atomic
context (and with my proposal, they still will).  Its really a question
of should eventfd artificially create an atomic context in some kind of
attempt to enforce that?  The answer, IMO, is that it shouldn't if it
doesn't have to, and there are already community accepted patterns (e.g.
RCU) for accomplishing that. 

One could also make a secondary argument that holding a spinlock across
a critical section of arbitrary complexity is probably not ideal.  Its
fine for quick wake_ups as you originally designed eventfd for. 
However, now that we are exploring the generalization of the callback
interface beyond simple wake-ups, this should probably be re-evaluated
independent of my current requirements for low-latency.

The fact is that eventfd is a really neat general signaling idiom. 
However, its currently geared towards "signaling = wakeup".  As we have
proven with this KVM *fd effort, this is not necessarily accurate to
describe all use cases, nor is it optimal.  I'd like to address that. 
An alternative, of course, is that we make a private anon-fd solution
within KVM.  However, it will be so similar to eventfd so it just seems
wasteful if it can be avoided.

Kind Regards,
-Greg
Davide Libenzi June 18, 2009, 5:44 p.m. UTC | #34
On Thu, 18 Jun 2009, Gregory Haskins wrote:

> Actually there is only one (the tx-thread) aside from the eventfd
> imposed workqueue one.  Incidentally, I would love to get rid of the
> other thread too, so I am not just picking on eventfd ;).  The other is
> a lot harder since it has to update the virtio-ring and may need to page
> in guest memory to do so.

No, there is the interface rx softirq too, that makes two. Plus, the 
process of delivering (especially for KVM & Co.) does not involve only ctx 
switching, there's other stuff in the middle too.
You also talk about latency. Really? Probably RT people aren't looking 
into KVM if RT is even a mild requirement.



> To flip it around on you: try telling a group like the netdev guys that
> they should put extra context switches into the stack because they don't
> really matter.  Be sure to wear extra thick asbestos. ;)

They already do. So you've got to ask yourself why they can achieve Gbps 
throughput already, why can't KVM live with it and being compelled to 
litter an existing interface.



> The fact is that eventfd is a really neat general signaling idiom. 
> However, its currently geared towards "signaling = wakeup".  As we have
> proven with this KVM *fd effort, this is not necessarily accurate to
> describe all use cases, nor is it optimal.  I'd like to address that. 
> An alternative, of course, is that we make a private anon-fd solution
> within KVM.  However, it will be so similar to eventfd so it just seems
> wasteful if it can be avoided.

Even though you take that route, you'll have to prove with replicable real 
life benchmarks, that the bloating make sense.



- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Davide Libenzi June 18, 2009, 5:52 p.m. UTC | #35
On Thu, 18 Jun 2009, Michael S. Tsirkin wrote:

> On Wed, Jun 17, 2009 at 04:21:19PM -0700, Davide Libenzi wrote:
> > The interface is just ugly IMO. You have eventfd_signal() that can sleep, 
> > or not, depending on the registered ->signal() function implementations.
> > This is pretty bad, a lot worse than the theoretical us spent in the 
> > schedule_work() processing.
> 
> I agree. How about the idea of introducing eventfd_signal_from_task
> which can sleep? Does this sound same?

You're basically asking to double the size of eventfd, make the signal 
path heavier, make the eventfd size bigger, w/out having provided any *real
life* measurement whatsoever to build a case for it.
WAY too much stuff went in by just branding the latest coolest names as 
reasons for them.
And all this to remove the wakeup of a *kernel* thread going to run in the 
same CPU where the work has been scheduled.
Come back with *replicable* real life benchmarks, and then we'll see what 
the best approach to address it will be.



- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins June 18, 2009, 7:04 p.m. UTC | #36
Davide Libenzi wrote:
> On Thu, 18 Jun 2009, Gregory Haskins wrote:
>
>   
>> Actually there is only one (the tx-thread) aside from the eventfd
>> imposed workqueue one.  Incidentally, I would love to get rid of the
>> other thread too, so I am not just picking on eventfd ;).  The other is
>> a lot harder since it has to update the virtio-ring and may need to page
>> in guest memory to do so.
>>     
>
> No, there is the interface rx softirq too, that makes two.

Actually, I believe you are mistaken.  It normally executes the softirq
in interrupt context, not a thread.

But I digress.  Lets just shelve the SRCU conversation for another day. 
It was my bad for introducing it now prematurely to solve a mostly
unrelated problem: the module-reference thing.  I didn't realize the
SRCU change would be so controversial, and I didn't think to split
things apart as I have done today.

But the fact is: I do not see any way to actually use your referenceless
POLLHUP release code in a race free way without doing something like I
propose in 3/4, 4/4.   Lets keep the discussion focused on that for now,
if we could.

Later, after we get this thing all built, I will re-run the numbers and
post some results so that Davide may have better proof that the context
switch overhead isn't just lost in the noise.  I think that is all he is
asking for anyway, which is understandable.

Regards,
-Greg
Davide Libenzi June 18, 2009, 10:03 p.m. UTC | #37
On Thu, 18 Jun 2009, Gregory Haskins wrote:

> Davide Libenzi wrote:
> > On Thu, 18 Jun 2009, Gregory Haskins wrote:
> >
> >   
> >> Actually there is only one (the tx-thread) aside from the eventfd
> >> imposed workqueue one.  Incidentally, I would love to get rid of the
> >> other thread too, so I am not just picking on eventfd ;).  The other is
> >> a lot harder since it has to update the virtio-ring and may need to page
> >> in guest memory to do so.
> >>     
> >
> > No, there is the interface rx softirq too, that makes two.
> 
> Actually, I believe you are mistaken.  It normally executes the softirq
> in interrupt context, not a thread.
> 
> But I digress.  Lets just shelve the SRCU conversation for another day. 
> It was my bad for introducing it now prematurely to solve a mostly
> unrelated problem: the module-reference thing.  I didn't realize the
> SRCU change would be so controversial, and I didn't think to split
> things apart as I have done today.
> 
> But the fact is: I do not see any way to actually use your referenceless
> POLLHUP release code in a race free way without doing something like I
> propose in 3/4, 4/4.   Lets keep the discussion focused on that for now,
> if we could.

OK, since I got literally swamped by the amount of talks and patches over 
this theoretically simple topic, would you mind  1) posting the global 
patch over eventfd  2) describe exactly what races are you talking about 
3) explain why this should be any concern of eventfd at all?



- Davide


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Haskins June 18, 2009, 10:47 p.m. UTC | #38
Davide Libenzi wrote:
> On Thu, 18 Jun 2009, Gregory Haskins wrote:
>
>   
>> Davide Libenzi wrote:
>>     
>>> On Thu, 18 Jun 2009, Gregory Haskins wrote:
>>>
>>>   
>>>       
>>>> Actually there is only one (the tx-thread) aside from the eventfd
>>>> imposed workqueue one.  Incidentally, I would love to get rid of the
>>>> other thread too, so I am not just picking on eventfd ;).  The other is
>>>> a lot harder since it has to update the virtio-ring and may need to page
>>>> in guest memory to do so.
>>>>     
>>>>         
>>> No, there is the interface rx softirq too, that makes two.
>>>       
>> Actually, I believe you are mistaken.  It normally executes the softirq
>> in interrupt context, not a thread.
>>
>> But I digress.  Lets just shelve the SRCU conversation for another day. 
>> It was my bad for introducing it now prematurely to solve a mostly
>> unrelated problem: the module-reference thing.  I didn't realize the
>> SRCU change would be so controversial, and I didn't think to split
>> things apart as I have done today.
>>
>> But the fact is: I do not see any way to actually use your referenceless
>> POLLHUP release code in a race free way without doing something like I
>> propose in 3/4, 4/4.   Lets keep the discussion focused on that for now,
>> if we could.
>>     
>
> OK, since I got literally swamped by the amount of talks and patches over 
> this theoretically simple topic, would you mind  1) posting the global 
> patch over eventfd  2) describe exactly what races are you talking about 
> 3) explain why this should be any concern of eventfd at all?
>
>
>   
Yes, I can do that.  Thanks Davide,

-Greg
Gregory Haskins June 19, 2009, 6:51 p.m. UTC | #39
Davide Libenzi wrote:
> On Thu, 18 Jun 2009, Gregory Haskins wrote:
>
>>
>> But the fact is: I do not see any way to actually use your referenceless
>> POLLHUP release code in a race free way without doing something like I
>> propose in 3/4, 4/4.   Lets keep the discussion focused on that for now,
>> if we could.
>
> OK, since I got literally swamped by the amount of talks and patches over 
> this theoretically simple topic, would you mind  1) posting the global 
> patch over eventfd

Done.  Should show up as replies to this email.  These apply to v2.6.30 and
are devoid of any KVM-isms. :)

(Note-1: I've built and run these patches against additional kvm/irqfd
patches and verified they all work properly)

(Note-2: I included your POLLHUP patch as 1/3 since it currently only exists
in kvm.git, and is a pre-req for the other two)

> 2) describe exactly what races are you talking about


Covered in the patch headers (3/3 probably has the most detail)

 
> 3) explain why this should be any concern of eventfd at all?
>

To support any  in-kernel eventfd client that wants to get both "signal"
and "release" notifications (based on using your POLLHUP patch) in a
race-free way.  Without 2/3, 3/3, I cannot see how this can be done.  Yet
your 1/3 patch is an important enhancement (to at least one client).  I
suspect any other in-kernel users will find this a good feature as well.

Thanks Davide,
-Greg


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 72f5f8d..505d5de 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -30,8 +30,47 @@  struct eventfd_ctx {
 	 */
 	__u64 count;
 	unsigned int flags;
+	struct srcu_struct srcu;
+	struct list_head nh;
+	struct eventfd_notifier notifier;
 };
 
+static void _eventfd_wqh_notify(struct eventfd_notifier *en)
+{
+	struct eventfd_ctx *ctx = container_of(en,
+					       struct eventfd_ctx,
+					       notifier);
+
+	if (waitqueue_active(&ctx->wqh))
+		wake_up_poll(&ctx->wqh, POLLIN);
+}
+
+static void _eventfd_notify(struct eventfd_ctx *ctx)
+{
+	struct eventfd_notifier *en;
+	int idx;
+
+	idx = srcu_read_lock(&ctx->srcu);
+
+	/*
+	 * The goal here is to allow the notification to be preemptible
+	 * as often as possible.  We cannot achieve this with the basic
+	 * wqh mechanism because it requires the wqh->lock.  Therefore
+	 * we have an internal srcu list mechanism of which the wqh is
+	 * a client.
+	 *
+	 * Not all paths will invoke this function in process context.
+	 * Callers should check for suitable state before assuming they
+	 * can sleep (such as with preemptible()).  Paul McKenney assures
+	 * me that srcu_read_lock is compatible with in-atomic, as long as
+	 * the code within the critical section is also compatible.
+	 */
+	list_for_each_entry_rcu(en, &ctx->nh, list)
+		en->ops->signal(en);
+
+	srcu_read_unlock(&ctx->srcu, idx);
+}
+
 /*
  * Adds "n" to the eventfd counter "count". Returns "n" in case of
  * success, or a value lower then "n" in case of coutner overflow.
@@ -51,10 +90,10 @@  int eventfd_signal(struct file *file, int n)
 	if (ULLONG_MAX - ctx->count < n)
 		n = (int) (ULLONG_MAX - ctx->count);
 	ctx->count += n;
-	if (waitqueue_active(&ctx->wqh))
-		wake_up_locked_poll(&ctx->wqh, POLLIN);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
+	_eventfd_notify(ctx);
+
 	return n;
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
@@ -62,13 +101,21 @@  EXPORT_SYMBOL_GPL(eventfd_signal);
 static int eventfd_release(struct inode *inode, struct file *file)
 {
 	struct eventfd_ctx *ctx = file->private_data;
+	struct eventfd_notifier *en, *tmp;
 
 	/*
 	 * No need to hold the lock here, since we are on the file cleanup
-	 * path and the ones still attached to the wait queue will be
-	 * serialized by wake_up_locked_poll().
+	 * path
 	 */
-	wake_up_locked_poll(&ctx->wqh, POLLHUP);
+	list_for_each_entry_safe(en, tmp, &ctx->nh, list) {
+		list_del(&en->list);
+		if (en->ops->release)
+			en->ops->release(en);
+	}
+
+	synchronize_srcu(&ctx->srcu);
+	cleanup_srcu_struct(&ctx->srcu);
+
 	kfree(ctx);
 	return 0;
 }
@@ -176,13 +223,13 @@  static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (likely(res > 0)) {
+	if (likely(res > 0))
 		ctx->count += ucnt;
-		if (waitqueue_active(&ctx->wqh))
-			wake_up_locked_poll(&ctx->wqh, POLLIN);
-	}
 	spin_unlock_irq(&ctx->wqh.lock);
 
+	if (likely(res > 0))
+		_eventfd_notify(ctx);
+
 	return res;
 }
 
@@ -209,6 +256,50 @@  struct file *eventfd_fget(int fd)
 }
 EXPORT_SYMBOL_GPL(eventfd_fget);
 
+static int _eventfd_notifier_register(struct eventfd_ctx *ctx,
+				      struct eventfd_notifier *en)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	list_add_tail_rcu(&en->list, &ctx->nh);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return 0;
+}
+
+int eventfd_notifier_register(struct file *file, struct eventfd_notifier *en)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+
+	if (file->f_op != &eventfd_fops)
+		return -EINVAL;
+
+	return _eventfd_notifier_register(ctx, en);
+}
+EXPORT_SYMBOL_GPL(eventfd_notifier_register);
+
+int eventfd_notifier_unregister(struct file *file, struct eventfd_notifier *en)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+
+	if (file->f_op != &eventfd_fops)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->wqh.lock);
+	list_del_rcu(&en->list);
+	spin_unlock_irq(&ctx->wqh.lock);
+
+	synchronize_srcu(&ctx->srcu);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(eventfd_notifier_unregister);
+
+static const struct eventfd_notifier_ops eventfd_wqh_ops = {
+	.signal = _eventfd_wqh_notify,
+};
+
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
@@ -229,6 +320,12 @@  SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 	ctx->count = count;
 	ctx->flags = flags;
 
+	init_srcu_struct(&ctx->srcu);
+	INIT_LIST_HEAD(&ctx->nh);
+
+	eventfd_notifier_init(&ctx->notifier, &eventfd_wqh_ops);
+	_eventfd_notifier_register(ctx, &ctx->notifier);
+
 	/*
 	 * When we call this, the initialization must be complete, since
 	 * anon_inode_getfd() will install the fd.
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index f45a8ae..0218cf6 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -8,6 +8,28 @@ 
 #ifndef _LINUX_EVENTFD_H
 #define _LINUX_EVENTFD_H
 
+#include <linux/list.h>
+
+struct eventfd_notifier;
+
+struct eventfd_notifier_ops {
+	void (*signal)(struct eventfd_notifier *en);
+	void (*release)(struct eventfd_notifier *en);
+};
+
+struct eventfd_notifier {
+	struct list_head                   list;
+	const struct eventfd_notifier_ops *ops;
+};
+
+static inline void eventfd_notifier_init(struct eventfd_notifier *en,
+					 const struct eventfd_notifier_ops *ops)
+{
+	memset(en, 0, sizeof(*en));
+	INIT_LIST_HEAD(&en->list);
+	en->ops = ops;
+}
+
 #ifdef CONFIG_EVENTFD
 
 /* For O_CLOEXEC and O_NONBLOCK */
@@ -29,12 +51,20 @@ 
 
 struct file *eventfd_fget(int fd);
 int eventfd_signal(struct file *file, int n);
+int eventfd_notifier_register(struct file *file, struct eventfd_notifier *en);
+int eventfd_notifier_unregister(struct file *file, struct eventfd_notifier *en);
 
 #else /* CONFIG_EVENTFD */
 
 #define eventfd_fget(fd) ERR_PTR(-ENOSYS)
 static inline int eventfd_signal(struct file *file, int n)
 { return 0; }
+static inline int eventfd_notifier_register(struct file *file,
+					    struct eventfd_notifier *en)
+{ return -ENOENT; }
+static inline int eventfd_notifier_unregister(struct file *file,
+					      struct eventfd_notifier *en)
+{ return -ENOENT; }
 
 #endif /* CONFIG_EVENTFD */
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2c8028c..efc3d77 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -43,33 +43,11 @@  struct _irqfd {
 	struct kvm               *kvm;
 	int                       gsi;
 	struct list_head          list;
-	poll_table                pt;
-	wait_queue_head_t        *wqh;
-	wait_queue_t              wait;
 	struct work_struct        inject;
+	struct eventfd_notifier   notifier;
 };
 
 static void
-irqfd_inject(struct work_struct *work)
-{
-	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
-	struct kvm *kvm;
-	int idx;
-
-	idx = srcu_read_lock(&irqfd->srcu);
-
-	kvm = rcu_dereference(irqfd->kvm);
-	if (kvm) {
-		mutex_lock(&kvm->irq_lock);
-		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
-		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
-		mutex_unlock(&kvm->irq_lock);
-	}
-
-	srcu_read_unlock(&irqfd->srcu, idx);
-}
-
-static void
 irqfd_disconnect(struct _irqfd *irqfd)
 {
 	struct kvm *kvm;
@@ -97,47 +75,67 @@  irqfd_disconnect(struct _irqfd *irqfd)
 	kvm_put_kvm(kvm);
 }
 
-static int
-irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static void
+_irqfd_inject(struct _irqfd *irqfd)
 {
-	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
-	unsigned long flags = (unsigned long)key;
-
-	if (flags & POLLIN)
-		/*
-		 * The POLLIN wake_up is called with interrupts disabled.
-		 * Therefore we need to defer the IRQ injection until later
-		 * since we need to acquire the kvm->lock to do so.
-		 */
-		schedule_work(&irqfd->inject);
+	struct kvm *kvm;
+	int idx;
 
-	if (flags & POLLHUP) {
-		/*
-		 * The POLLHUP is called unlocked, so it theoretically should
-		 * be safe to remove ourselves from the wqh using the locked
-		 * variant of remove_wait_queue()
-		 */
-		remove_wait_queue(irqfd->wqh, &irqfd->wait);
-		flush_work(&irqfd->inject);
-		irqfd_disconnect(irqfd);
+	idx = srcu_read_lock(&irqfd->srcu);
 
-		cleanup_srcu_struct(&irqfd->srcu);
-		kfree(irqfd);
+	kvm = rcu_dereference(irqfd->kvm);
+	if (kvm) {
+		mutex_lock(&kvm->irq_lock);
+		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+		mutex_unlock(&kvm->irq_lock);
 	}
 
-	return 0;
+	srcu_read_unlock(&irqfd->srcu, idx);
+}
+
+static void
+irqfd_inject(struct work_struct *work)
+{
+	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+
+	_irqfd_inject(irqfd);
+}
+
+static void
+irqfd_eventfd_signal(struct eventfd_notifier *notifier)
+{
+	struct _irqfd *irqfd = container_of(notifier, struct _irqfd, notifier);
+
+	/*
+	 * Our signal may or may not be called from atomic context.  On
+	 * CONFIG_PREEMPT kernels, preemptible() tells us whether it is safe
+	 * to sleep here; on CONFIG_PREEMPT=n kernels, or when in_atomic(),
+	 * the check fails and we defer the injection to a work-queue instead.
+	 */
+	if (preemptible())
+		_irqfd_inject(irqfd);
+	else
+		schedule_work(&irqfd->inject);
 }
 
 static void
-irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
-			poll_table *pt)
+irqfd_eventfd_release(struct eventfd_notifier *notifier)
 {
-	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+	struct _irqfd *irqfd = container_of(notifier, struct _irqfd, notifier);
 
-	irqfd->wqh = wqh;
-	add_wait_queue(wqh, &irqfd->wait);
+	flush_work(&irqfd->inject);
+	irqfd_disconnect(irqfd);
+
+	cleanup_srcu_struct(&irqfd->srcu);
+	kfree(irqfd);
 }
 
+static const struct eventfd_notifier_ops irqfd_notifier_ops = {
+	.signal  = irqfd_eventfd_signal,
+	.release = irqfd_eventfd_release,
+};
+
 int
 kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 {
@@ -165,14 +163,9 @@  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 		goto fail;
 	}
 
-	/*
-	 * Install our own custom wake-up handling so we are notified via
-	 * a callback whenever someone signals the underlying eventfd
-	 */
-	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
-	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
+	eventfd_notifier_init(&irqfd->notifier, &irqfd_notifier_ops);
 
-	ret = file->f_op->poll(file, &irqfd->pt);
+	ret = eventfd_notifier_register(file, &irqfd->notifier);
 	if (ret < 0)
 		goto fail;
 
@@ -191,9 +184,6 @@  kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 	return 0;
 
 fail:
-	if (irqfd->wqh)
-		remove_wait_queue(irqfd->wqh, &irqfd->wait);
-
 	if (file && !IS_ERR(file))
 		fput(file);