diff mbox series

[RFC,v3,01/11] eventfd: track eventfd_signal() recursion depth separately in different cases

Message ID 20210119045920.447-2-xieyongji@bytedance.com (mailing list archive)
State New, archived
Headers show
Series Introduce VDUSE - vDPA Device in Userspace | expand

Commit Message

Yongji Xie Jan. 19, 2021, 4:59 a.m. UTC
Now we have a global percpu counter to limit the recursion depth
of eventfd_signal(). This can avoid deadlock or stack overflow.
But in stack overflow case, it should be OK to increase the
recursion depth if needed. So we add a percpu counter in eventfd_ctx
to limit the recursion depth for deadlock case. Then it could be
fine to increase the global percpu counter later.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
 fs/aio.c                |  3 ++-
 fs/eventfd.c            | 20 +++++++++++++++++++-
 include/linux/eventfd.h |  5 +----
 3 files changed, 22 insertions(+), 6 deletions(-)

Comments

Jason Wang Jan. 20, 2021, 4:24 a.m. UTC | #1
On 2021/1/19 下午12:59, Xie Yongji wrote:
> Now we have a global percpu counter to limit the recursion depth
> of eventfd_signal(). This can avoid deadlock or stack overflow.
> But in stack overflow case, it should be OK to increase the
> recursion depth if needed. So we add a percpu counter in eventfd_ctx
> to limit the recursion depth for deadlock case. Then it could be
> fine to increase the global percpu counter later.


I wonder whether or not it's worth to introduce percpu for each eventfd.

How about simply check if eventfd_signal_count() is greater than 2?

Thanks


>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
>   fs/aio.c                |  3 ++-
>   fs/eventfd.c            | 20 +++++++++++++++++++-
>   include/linux/eventfd.h |  5 +----
>   3 files changed, 22 insertions(+), 6 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 1f32da13d39e..5d82903161f5 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1698,7 +1698,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
>   		list_del(&iocb->ki_list);
>   		iocb->ki_res.res = mangle_poll(mask);
>   		req->done = true;
> -		if (iocb->ki_eventfd && eventfd_signal_count()) {
> +		if (iocb->ki_eventfd &&
> +			eventfd_signal_count(iocb->ki_eventfd)) {
>   			iocb = NULL;
>   			INIT_WORK(&req->work, aio_poll_put_work);
>   			schedule_work(&req->work);
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index e265b6dd4f34..2df24f9bada3 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -25,6 +25,8 @@
>   #include <linux/idr.h>
>   #include <linux/uio.h>
>   
> +#define EVENTFD_WAKE_DEPTH 0
> +
>   DEFINE_PER_CPU(int, eventfd_wake_count);
>   
>   static DEFINE_IDA(eventfd_ida);
> @@ -42,9 +44,17 @@ struct eventfd_ctx {
>   	 */
>   	__u64 count;
>   	unsigned int flags;
> +	int __percpu *wake_count;
>   	int id;
>   };
>   
> +bool eventfd_signal_count(struct eventfd_ctx *ctx)
> +{
> +	return (this_cpu_read(*ctx->wake_count) ||
> +		this_cpu_read(eventfd_wake_count) > EVENTFD_WAKE_DEPTH);
> +}
> +EXPORT_SYMBOL_GPL(eventfd_signal_count);
> +
>   /**
>    * eventfd_signal - Adds @n to the eventfd counter.
>    * @ctx: [in] Pointer to the eventfd context.
> @@ -71,17 +81,19 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>   	 * it returns true, the eventfd_signal() call should be deferred to a
>   	 * safe context.
>   	 */
> -	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> +	if (WARN_ON_ONCE(eventfd_signal_count(ctx)))
>   		return 0;
>   
>   	spin_lock_irqsave(&ctx->wqh.lock, flags);
>   	this_cpu_inc(eventfd_wake_count);
> +	this_cpu_inc(*ctx->wake_count);
>   	if (ULLONG_MAX - ctx->count < n)
>   		n = ULLONG_MAX - ctx->count;
>   	ctx->count += n;
>   	if (waitqueue_active(&ctx->wqh))
>   		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
>   	this_cpu_dec(eventfd_wake_count);
> +	this_cpu_dec(*ctx->wake_count);
>   	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>   
>   	return n;
> @@ -92,6 +104,7 @@ static void eventfd_free_ctx(struct eventfd_ctx *ctx)
>   {
>   	if (ctx->id >= 0)
>   		ida_simple_remove(&eventfd_ida, ctx->id);
> +	free_percpu(ctx->wake_count);
>   	kfree(ctx);
>   }
>   
> @@ -423,6 +436,11 @@ static int do_eventfd(unsigned int count, int flags)
>   
>   	kref_init(&ctx->kref);
>   	init_waitqueue_head(&ctx->wqh);
> +	ctx->wake_count = alloc_percpu(int);
> +	if (!ctx->wake_count) {
> +		kfree(ctx);
> +		return -ENOMEM;
> +	}
>   	ctx->count = count;
>   	ctx->flags = flags;
>   	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
> index fa0a524baed0..1a11ebbd74a9 100644
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -45,10 +45,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
>   
>   DECLARE_PER_CPU(int, eventfd_wake_count);
>   
> -static inline bool eventfd_signal_count(void)
> -{
> -	return this_cpu_read(eventfd_wake_count);
> -}
> +bool eventfd_signal_count(struct eventfd_ctx *ctx);
>   
>   #else /* CONFIG_EVENTFD */
>
Yongji Xie Jan. 20, 2021, 6:52 a.m. UTC | #2
On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2021/1/19 下午12:59, Xie Yongji wrote:
> > Now we have a global percpu counter to limit the recursion depth
> > of eventfd_signal(). This can avoid deadlock or stack overflow.
> > But in stack overflow case, it should be OK to increase the
> > recursion depth if needed. So we add a percpu counter in eventfd_ctx
> > to limit the recursion depth for deadlock case. Then it could be
> > fine to increase the global percpu counter later.
>
>
> I wonder whether or not it's worth to introduce percpu for each eventfd.
>
> How about simply check if eventfd_signal_count() is greater than 2?
>

It can't avoid deadlock in this way. So we need a percpu counter for
each eventfd to limit the recursion depth for deadlock cases. And
using a global percpu counter to avoid stack overflow.

Thanks,
Yongji
Jason Wang Jan. 27, 2021, 3:37 a.m. UTC | #3
On 2021/1/20 下午2:52, Yongji Xie wrote:
> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2021/1/19 下午12:59, Xie Yongji wrote:
>>> Now we have a global percpu counter to limit the recursion depth
>>> of eventfd_signal(). This can avoid deadlock or stack overflow.
>>> But in stack overflow case, it should be OK to increase the
>>> recursion depth if needed. So we add a percpu counter in eventfd_ctx
>>> to limit the recursion depth for deadlock case. Then it could be
>>> fine to increase the global percpu counter later.
>>
>> I wonder whether or not it's worth to introduce percpu for each eventfd.
>>
>> How about simply check if eventfd_signal_count() is greater than 2?
>>
> It can't avoid deadlock in this way.


I may miss something but the count is to avoid recursive eventfd call. 
So for VDUSE what we suffers is e.g the interrupt injection path:

userspace write IRQFD -> vq->cb() -> another IRQFD.

It looks like increasing EVENTFD_WAKEUP_DEPTH should be sufficient?

Thanks


> So we need a percpu counter for
> each eventfd to limit the recursion depth for deadlock cases. And
> using a global percpu counter to avoid stack overflow.
>
> Thanks,
> Yongji
>
Yongji Xie Jan. 27, 2021, 9:11 a.m. UTC | #4
On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2021/1/20 下午2:52, Yongji Xie wrote:
> > On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2021/1/19 下午12:59, Xie Yongji wrote:
> >>> Now we have a global percpu counter to limit the recursion depth
> >>> of eventfd_signal(). This can avoid deadlock or stack overflow.
> >>> But in stack overflow case, it should be OK to increase the
> >>> recursion depth if needed. So we add a percpu counter in eventfd_ctx
> >>> to limit the recursion depth for deadlock case. Then it could be
> >>> fine to increase the global percpu counter later.
> >>
> >> I wonder whether or not it's worth to introduce percpu for each eventfd.
> >>
> >> How about simply check if eventfd_signal_count() is greater than 2?
> >>
> > It can't avoid deadlock in this way.
>
>
> I may miss something but the count is to avoid recursive eventfd call.
> So for VDUSE what we suffers is e.g the interrupt injection path:
>
> userspace write IRQFD -> vq->cb() -> another IRQFD.
>
> It looks like increasing EVENTFD_WAKEUP_DEPTH should be sufficient?
>

Actually I mean the deadlock described in commit f0b493e ("io_uring:
prevent potential eventfd recursion on poll"). It can break this bug
fix if we just increase EVENTFD_WAKEUP_DEPTH.

Thanks,
Yongji
Jason Wang Jan. 28, 2021, 3:04 a.m. UTC | #5
On 2021/1/27 下午5:11, Yongji Xie wrote:
> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2021/1/20 下午2:52, Yongji Xie wrote:
>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2021/1/19 下午12:59, Xie Yongji wrote:
>>>>> Now we have a global percpu counter to limit the recursion depth
>>>>> of eventfd_signal(). This can avoid deadlock or stack overflow.
>>>>> But in stack overflow case, it should be OK to increase the
>>>>> recursion depth if needed. So we add a percpu counter in eventfd_ctx
>>>>> to limit the recursion depth for deadlock case. Then it could be
>>>>> fine to increase the global percpu counter later.
>>>> I wonder whether or not it's worth to introduce percpu for each eventfd.
>>>>
>>>> How about simply check if eventfd_signal_count() is greater than 2?
>>>>
>>> It can't avoid deadlock in this way.
>>
>> I may miss something but the count is to avoid recursive eventfd call.
>> So for VDUSE what we suffers is e.g the interrupt injection path:
>>
>> userspace write IRQFD -> vq->cb() -> another IRQFD.
>>
>> It looks like increasing EVENTFD_WAKEUP_DEPTH should be sufficient?
>>
> Actually I mean the deadlock described in commit f0b493e ("io_uring:
> prevent potential eventfd recursion on poll"). It can break this bug
> fix if we just increase EVENTFD_WAKEUP_DEPTH.


Ok, so can wait do something similar in that commit? (using async stuffs 
like wq).

Thanks


>
> Thanks,
> Yongji
>
Jens Axboe Jan. 28, 2021, 3:08 a.m. UTC | #6
On 1/27/21 8:04 PM, Jason Wang wrote:
> 
> On 2021/1/27 下午5:11, Yongji Xie wrote:
>> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
>>>
>>> On 2021/1/20 下午2:52, Yongji Xie wrote:
>>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>> On 2021/1/19 下午12:59, Xie Yongji wrote:
>>>>>> Now we have a global percpu counter to limit the recursion depth
>>>>>> of eventfd_signal(). This can avoid deadlock or stack overflow.
>>>>>> But in stack overflow case, it should be OK to increase the
>>>>>> recursion depth if needed. So we add a percpu counter in eventfd_ctx
>>>>>> to limit the recursion depth for deadlock case. Then it could be
>>>>>> fine to increase the global percpu counter later.
>>>>> I wonder whether or not it's worth to introduce percpu for each eventfd.
>>>>>
>>>>> How about simply check if eventfd_signal_count() is greater than 2?
>>>>>
>>>> It can't avoid deadlock in this way.
>>>
>>> I may miss something but the count is to avoid recursive eventfd call.
>>> So for VDUSE what we suffers is e.g the interrupt injection path:
>>>
>>> userspace write IRQFD -> vq->cb() -> another IRQFD.
>>>
>>> It looks like increasing EVENTFD_WAKEUP_DEPTH should be sufficient?
>>>
>> Actually I mean the deadlock described in commit f0b493e ("io_uring:
>> prevent potential eventfd recursion on poll"). It can break this bug
>> fix if we just increase EVENTFD_WAKEUP_DEPTH.
> 
> 
> Ok, so can wait do something similar in that commit? (using async stuffs 
> like wq).

io_uring should be fine in current kernels, but aio would still be
affected by this. But just in terms of recursion, bumping it one more
should probably still be fine.
Yongji Xie Jan. 28, 2021, 3:52 a.m. UTC | #7
On Thu, Jan 28, 2021 at 11:05 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2021/1/27 下午5:11, Yongji Xie wrote:
> > On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2021/1/20 下午2:52, Yongji Xie wrote:
> >>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2021/1/19 下午12:59, Xie Yongji wrote:
> >>>>> Now we have a global percpu counter to limit the recursion depth
> >>>>> of eventfd_signal(). This can avoid deadlock or stack overflow.
> >>>>> But in stack overflow case, it should be OK to increase the
> >>>>> recursion depth if needed. So we add a percpu counter in eventfd_ctx
> >>>>> to limit the recursion depth for deadlock case. Then it could be
> >>>>> fine to increase the global percpu counter later.
> >>>> I wonder whether or not it's worth to introduce percpu for each eventfd.
> >>>>
> >>>> How about simply check if eventfd_signal_count() is greater than 2?
> >>>>
> >>> It can't avoid deadlock in this way.
> >>
> >> I may miss something but the count is to avoid recursive eventfd call.
> >> So for VDUSE what we suffers is e.g the interrupt injection path:
> >>
> >> userspace write IRQFD -> vq->cb() -> another IRQFD.
> >>
> >> It looks like increasing EVENTFD_WAKEUP_DEPTH should be sufficient?
> >>
> > Actually I mean the deadlock described in commit f0b493e ("io_uring:
> > prevent potential eventfd recursion on poll"). It can break this bug
> > fix if we just increase EVENTFD_WAKEUP_DEPTH.
>
>
> Ok, so can wait do something similar in that commit? (using async stuffs
> like wq).
>

We can do that. But it will reduce the performance. Because the
eventfd recursion will be triggered every time kvm kick eventfd in
vhost-vdpa cases:

KVM write KICKFD -> ops->kick_vq -> VDUSE write KICKFD

Thanks,
Yongji
Jason Wang Jan. 28, 2021, 4:31 a.m. UTC | #8
On 2021/1/28 上午11:52, Yongji Xie wrote:
> On Thu, Jan 28, 2021 at 11:05 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2021/1/27 下午5:11, Yongji Xie wrote:
>>> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2021/1/20 下午2:52, Yongji Xie wrote:
>>>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> On 2021/1/19 下午12:59, Xie Yongji wrote:
>>>>>>> Now we have a global percpu counter to limit the recursion depth
>>>>>>> of eventfd_signal(). This can avoid deadlock or stack overflow.
>>>>>>> But in stack overflow case, it should be OK to increase the
>>>>>>> recursion depth if needed. So we add a percpu counter in eventfd_ctx
>>>>>>> to limit the recursion depth for deadlock case. Then it could be
>>>>>>> fine to increase the global percpu counter later.
>>>>>> I wonder whether or not it's worth to introduce percpu for each eventfd.
>>>>>>
>>>>>> How about simply check if eventfd_signal_count() is greater than 2?
>>>>>>
>>>>> It can't avoid deadlock in this way.
>>>> I may miss something but the count is to avoid recursive eventfd call.
>>>> So for VDUSE what we suffers is e.g the interrupt injection path:
>>>>
>>>> userspace write IRQFD -> vq->cb() -> another IRQFD.
>>>>
>>>> It looks like increasing EVENTFD_WAKEUP_DEPTH should be sufficient?
>>>>
>>> Actually I mean the deadlock described in commit f0b493e ("io_uring:
>>> prevent potential eventfd recursion on poll"). It can break this bug
>>> fix if we just increase EVENTFD_WAKEUP_DEPTH.
>>
>> Ok, so can wait do something similar in that commit? (using async stuffs
>> like wq).
>>
> We can do that. But it will reduce the performance. Because the
> eventfd recursion will be triggered every time kvm kick eventfd in
> vhost-vdpa cases:
>
> KVM write KICKFD -> ops->kick_vq -> VDUSE write KICKFD
>
> Thanks,
> Yongji


Right, I think in the future we need to find a way to let KVM to wakeup 
VDUSE directly.

Havn't had a deep thought but it might work like irq bypass manager.

Thanks
Yongji Xie Jan. 28, 2021, 5:12 a.m. UTC | #9
On Thu, Jan 28, 2021 at 11:08 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 1/27/21 8:04 PM, Jason Wang wrote:
> >
> > On 2021/1/27 下午5:11, Yongji Xie wrote:
> >> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>
> >>> On 2021/1/20 下午2:52, Yongji Xie wrote:
> >>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>> On 2021/1/19 下午12:59, Xie Yongji wrote:
> >>>>>> Now we have a global percpu counter to limit the recursion depth
> >>>>>> of eventfd_signal(). This can avoid deadlock or stack overflow.
> >>>>>> But in stack overflow case, it should be OK to increase the
> >>>>>> recursion depth if needed. So we add a percpu counter in eventfd_ctx
> >>>>>> to limit the recursion depth for deadlock case. Then it could be
> >>>>>> fine to increase the global percpu counter later.
> >>>>> I wonder whether or not it's worth to introduce percpu for each eventfd.
> >>>>>
> >>>>> How about simply check if eventfd_signal_count() is greater than 2?
> >>>>>
> >>>> It can't avoid deadlock in this way.
> >>>
> >>> I may miss something but the count is to avoid recursive eventfd call.
> >>> So for VDUSE what we suffers is e.g the interrupt injection path:
> >>>
> >>> userspace write IRQFD -> vq->cb() -> another IRQFD.
> >>>
> >>> It looks like increasing EVENTFD_WAKEUP_DEPTH should be sufficient?
> >>>
> >> Actually I mean the deadlock described in commit f0b493e ("io_uring:
> >> prevent potential eventfd recursion on poll"). It can break this bug
> >> fix if we just increase EVENTFD_WAKEUP_DEPTH.
> >
> >
> > Ok, so can wait do something similar in that commit? (using async stuffs
> > like wq).
>
> io_uring should be fine in current kernels, but aio would still be
> affected by this. But just in terms of recursion, bumping it one more
> should probably still be fine.
>

OK, I see. It should be easy to avoid the A-A deadlock during coding.

Thanks,
Yongji
Yongji Xie Jan. 28, 2021, 6:08 a.m. UTC | #10
On Thu, Jan 28, 2021 at 12:31 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2021/1/28 上午11:52, Yongji Xie wrote:
> > On Thu, Jan 28, 2021 at 11:05 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2021/1/27 下午5:11, Yongji Xie wrote:
> >>> On Wed, Jan 27, 2021 at 11:38 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2021/1/20 下午2:52, Yongji Xie wrote:
> >>>>> On Wed, Jan 20, 2021 at 12:24 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> On 2021/1/19 下午12:59, Xie Yongji wrote:
> >>>>>>> Now we have a global percpu counter to limit the recursion depth
> >>>>>>> of eventfd_signal(). This can avoid deadlock or stack overflow.
> >>>>>>> But in stack overflow case, it should be OK to increase the
> >>>>>>> recursion depth if needed. So we add a percpu counter in eventfd_ctx
> >>>>>>> to limit the recursion depth for deadlock case. Then it could be
> >>>>>>> fine to increase the global percpu counter later.
> >>>>>> I wonder whether or not it's worth to introduce percpu for each eventfd.
> >>>>>>
> >>>>>> How about simply check if eventfd_signal_count() is greater than 2?
> >>>>>>
> >>>>> It can't avoid deadlock in this way.
> >>>> I may miss something but the count is to avoid recursive eventfd call.
> >>>> So for VDUSE what we suffers is e.g the interrupt injection path:
> >>>>
> >>>> userspace write IRQFD -> vq->cb() -> another IRQFD.
> >>>>
> >>>> It looks like increasing EVENTFD_WAKEUP_DEPTH should be sufficient?
> >>>>
> >>> Actually I mean the deadlock described in commit f0b493e ("io_uring:
> >>> prevent potential eventfd recursion on poll"). It can break this bug
> >>> fix if we just increase EVENTFD_WAKEUP_DEPTH.
> >>
> >> Ok, so can wait do something similar in that commit? (using async stuffs
> >> like wq).
> >>
> > We can do that. But it will reduce the performance. Because the
> > eventfd recursion will be triggered every time kvm kick eventfd in
> > vhost-vdpa cases:
> >
> > KVM write KICKFD -> ops->kick_vq -> VDUSE write KICKFD
> >
> > Thanks,
> > Yongji
>
>
> Right, I think in the future we need to find a way to let KVM to wakeup
> VDUSE directly.
>

Yes, this would be better.

Thanks,
Yongji
diff mbox series

Patch

diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39e..5d82903161f5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1698,7 +1698,8 @@  static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		list_del(&iocb->ki_list);
 		iocb->ki_res.res = mangle_poll(mask);
 		req->done = true;
-		if (iocb->ki_eventfd && eventfd_signal_count()) {
+		if (iocb->ki_eventfd &&
+			eventfd_signal_count(iocb->ki_eventfd)) {
 			iocb = NULL;
 			INIT_WORK(&req->work, aio_poll_put_work);
 			schedule_work(&req->work);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6dd4f34..2df24f9bada3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,6 +25,8 @@ 
 #include <linux/idr.h>
 #include <linux/uio.h>
 
+#define EVENTFD_WAKE_DEPTH 0
+
 DEFINE_PER_CPU(int, eventfd_wake_count);
 
 static DEFINE_IDA(eventfd_ida);
@@ -42,9 +44,17 @@  struct eventfd_ctx {
 	 */
 	__u64 count;
 	unsigned int flags;
+	int __percpu *wake_count;
 	int id;
 };
 
+bool eventfd_signal_count(struct eventfd_ctx *ctx)
+{
+	return (this_cpu_read(*ctx->wake_count) ||
+		this_cpu_read(eventfd_wake_count) > EVENTFD_WAKE_DEPTH);
+}
+EXPORT_SYMBOL_GPL(eventfd_signal_count);
+
 /**
  * eventfd_signal - Adds @n to the eventfd counter.
  * @ctx: [in] Pointer to the eventfd context.
@@ -71,17 +81,19 @@  __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 	 * it returns true, the eventfd_signal() call should be deferred to a
 	 * safe context.
 	 */
-	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+	if (WARN_ON_ONCE(eventfd_signal_count(ctx)))
 		return 0;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	this_cpu_inc(eventfd_wake_count);
+	this_cpu_inc(*ctx->wake_count);
 	if (ULLONG_MAX - ctx->count < n)
 		n = ULLONG_MAX - ctx->count;
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
 	this_cpu_dec(eventfd_wake_count);
+	this_cpu_dec(*ctx->wake_count);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
@@ -92,6 +104,7 @@  static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
 	if (ctx->id >= 0)
 		ida_simple_remove(&eventfd_ida, ctx->id);
+	free_percpu(ctx->wake_count);
 	kfree(ctx);
 }
 
@@ -423,6 +436,11 @@  static int do_eventfd(unsigned int count, int flags)
 
 	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
+	ctx->wake_count = alloc_percpu(int);
+	if (!ctx->wake_count) {
+		kfree(ctx);
+		return -ENOMEM;
+	}
 	ctx->count = count;
 	ctx->flags = flags;
 	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index fa0a524baed0..1a11ebbd74a9 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -45,10 +45,7 @@  void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
 
 DECLARE_PER_CPU(int, eventfd_wake_count);
 
-static inline bool eventfd_signal_count(void)
-{
-	return this_cpu_read(eventfd_wake_count);
-}
+bool eventfd_signal_count(struct eventfd_ctx *ctx);
 
 #else /* CONFIG_EVENTFD */