[RFC] eventfd: add EFD_AUTORESET flag

Message ID 20200129172010.162215-1-stefanha@redhat.com (mailing list archive)
State New, archived

Commit Message

Stefan Hajnoczi Jan. 29, 2020, 5:20 p.m. UTC
Some applications simply use eventfd for inter-thread notifications
without requiring counter or semaphore semantics.  They wait for the
eventfd to become readable using poll(2)/select(2) and then call read(2)
to reset the counter.

This patch adds the EFD_AUTORESET flag to reset the counter when
f_ops->poll() finds the eventfd is readable, eliminating the need to
call read(2) to reset the counter.

This results in a small but measurable 1% performance improvement with
QEMU virtio-blk emulation.  Each read(2) takes 1 microsecond of
execution time in the event loop, according to perf.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
Does this look like a reasonable thing to do?  I'm not very familiar
with f_ops->poll() or the eventfd internals, so maybe I'm overlooking a
design flaw.

I've tested this with QEMU and it works fine:
https://github.com/stefanha/qemu/commits/eventfd-autoreset
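
For context, a minimal userspace sketch of the intended usage (untested
here; EFD_AUTORESET is the flag proposed by this patch, so it is defined
locally because the uapi headers do not provide it yet):

#include <poll.h>
#include <sys/eventfd.h>
#include <unistd.h>

#ifndef EFD_AUTORESET
#define EFD_AUTORESET (1 << 6)	/* value proposed by this patch */
#endif

int main(void)
{
	int efd = eventfd(0, EFD_CLOEXEC | EFD_AUTORESET);
	struct pollfd pfd = { .fd = efd, .events = POLLIN };

	/* ... another thread calls eventfd_write(efd, 1) to notify us ... */

	poll(&pfd, 1, -1);	/* the counter is reset by poll() itself */
	/* no read(2) is needed before the next poll() */

	close(efd);
	return 0;
}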
---
 fs/eventfd.c            | 99 +++++++++++++++++++++++++----------------
 include/linux/eventfd.h |  3 +-
 2 files changed, 62 insertions(+), 40 deletions(-)

Comments

Stefan Hajnoczi Feb. 4, 2020, 3:40 p.m. UTC | #1
On Wed, Jan 29, 2020 at 05:20:10PM +0000, Stefan Hajnoczi wrote:
> Some applications simply use eventfd for inter-thread notifications
> without requiring counter or semaphore semantics.  They wait for the
> eventfd to become readable using poll(2)/select(2) and then call read(2)
> to reset the counter.
> 
> This patch adds the EFD_AUTORESET flag to reset the counter when
> f_ops->poll() finds the eventfd is readable, eliminating the need to
> call read(2) to reset the counter.
> 
> This results in a small but measurable 1% performance improvement with
> QEMU virtio-blk emulation.  Each read(2) takes 1 microsecond execution
> time in the event loop according to perf.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> Does this look like a reasonable thing to do?  I'm not very familiar
> with f_ops->poll() or the eventfd internals, so maybe I'm overlooking a
> design flaw.

Ping?

> I've tested this with QEMU and it works fine:
> https://github.com/stefanha/qemu/commits/eventfd-autoreset
> ---
>  fs/eventfd.c            | 99 +++++++++++++++++++++++++----------------
>  include/linux/eventfd.h |  3 +-
>  2 files changed, 62 insertions(+), 40 deletions(-)
> 
> [...]
Stefan Hajnoczi Feb. 11, 2020, 9:32 a.m. UTC | #2
On Tue, Feb 04, 2020 at 03:40:35PM +0000, Stefan Hajnoczi wrote:
> On Wed, Jan 29, 2020 at 05:20:10PM +0000, Stefan Hajnoczi wrote:
> > Some applications simply use eventfd for inter-thread notifications
> > without requiring counter or semaphore semantics.  They wait for the
> > eventfd to become readable using poll(2)/select(2) and then call read(2)
> > to reset the counter.
> > 
> > This patch adds the EFD_AUTORESET flag to reset the counter when
> > f_ops->poll() finds the eventfd is readable, eliminating the need to
> > call read(2) to reset the counter.
> > 
> > This results in a small but measurable 1% performance improvement with
> > QEMU virtio-blk emulation.  Each read(2) takes 1 microsecond execution
> > time in the event loop according to perf.
> > 
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> > Does this look like a reasonable thing to do?  I'm not very familiar
> > with f_ops->poll() or the eventfd internals, so maybe I'm overlooking a
> > design flaw.
> 
> Ping?

Ping?

I would appreciate an indication of whether EFD_AUTORESET is an
acceptable new feature.  Thanks!

> > I've tested this with QEMU and it works fine:
> > https://github.com/stefanha/qemu/commits/eventfd-autoreset
> > ---
> >  fs/eventfd.c            | 99 +++++++++++++++++++++++++----------------
> >  include/linux/eventfd.h |  3 +-
> >  2 files changed, 62 insertions(+), 40 deletions(-)
> > 
> > [...]
Paolo Bonzini Feb. 12, 2020, 8:31 a.m. UTC | #3
On 29/01/20 18:20, Stefan Hajnoczi wrote:
> +	/* Semaphore semantics don't make sense when autoreset is enabled */
> +	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
> +		return -EINVAL;
> +

I think they do; you just want to subtract 1 instead of setting the
count to 0.  This way, writing 1 would be the post operation on the
semaphore, while poll() would be the wait operation.
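
Concretely, the EFD_AUTORESET branch in eventfd_poll() from the patch
could do something like this instead (untested sketch):

		/* Reset or, with EFD_SEMAPHORE, decrement on poll for read */
		if (count != 0 && (requested & EPOLLIN)) {
			if (ctx->flags & EFD_SEMAPHORE)
				ctx->count -= 1;	/* semaphore "wait" */
			else
				ctx->count = 0;		/* plain reset */
			events |= EPOLLOUT;
		}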

Paolo
Avi Kivity Feb. 12, 2020, 10:10 a.m. UTC | #4
On 12/02/2020 10.31, Paolo Bonzini wrote:
> On 29/01/20 18:20, Stefan Hajnoczi wrote:
>> +	/* Semaphore semantics don't make sense when autoreset is enabled */
>> +	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
>> +		return -EINVAL;
>> +
> I think they do, you just want to subtract 1 instead of setting the
> count to 0.  This way, writing 1 would be the post operation on the
> semaphore, while poll() would be the wait operation.


poll() is usually idempotent.  Both resetting to zero and subtracting
one go against the grain.


Better to use a uring async read.  This way you get the value just as
you do with poll+read, and the syscall cost is amortized away by uring.
Stefan Hajnoczi Feb. 12, 2020, 10:29 a.m. UTC | #5
On Wed, Feb 12, 2020 at 09:31:32AM +0100, Paolo Bonzini wrote:
> On 29/01/20 18:20, Stefan Hajnoczi wrote:
> > +	/* Semaphore semantics don't make sense when autoreset is enabled */
> > +	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
> > +		return -EINVAL;
> > +
> 
> I think they do, you just want to subtract 1 instead of setting the
> count to 0.  This way, writing 1 would be the post operation on the
> semaphore, while poll() would be the wait operation.

True!  Then EFD_AUTORESET is not a fitting name.  EFD_AUTOREAD or
EFD_POLL_READS?

Stefan
Paolo Bonzini Feb. 12, 2020, 10:47 a.m. UTC | #6
On 12/02/20 11:29, Stefan Hajnoczi wrote:
> On Wed, Feb 12, 2020 at 09:31:32AM +0100, Paolo Bonzini wrote:
>> On 29/01/20 18:20, Stefan Hajnoczi wrote:
>>> +	/* Semaphore semantics don't make sense when autoreset is enabled */
>>> +	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
>>> +		return -EINVAL;
>>> +
>>
>> I think they do, you just want to subtract 1 instead of setting the
>> count to 0.  This way, writing 1 would be the post operation on the
>> semaphore, while poll() would be the wait operation.
> 
> True!  Then EFD_AUTORESET is not a fitting name.  EFD_AUTOREAD or
> EFD_POLL_READS?

Avi's suggestion also makes sense.  Switching the event loop from poll()
to IORING_OP_POLL_ADD would be good on its own, and then you could make
it use IORING_OP_READV for eventfds.

In QEMU parlance, perhaps you need a different abstraction than
EventNotifier (let's call it WakeupNotifier) which would also use
eventfd but it would provide a smaller API.  Thanks to the smaller API,
it would not need EFD_NONBLOCK, unlike the regular EventNotifier, and it
could either set up a poll() handler calling read(), or use
IORING_OP_READV when io_uring is in use.
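
Outside QEMU, a minimal sketch of such a WakeupNotifier with the
poll()-based backend might look like this (purely illustrative; these
names are not existing QEMU APIs):

#include <poll.h>
#include <sys/eventfd.h>

typedef struct {
	int fd;
} WakeupNotifier;

static int wakeup_notifier_init(WakeupNotifier *n)
{
	n->fd = eventfd(0, EFD_CLOEXEC);	/* blocking; no EFD_NONBLOCK needed */
	return n->fd < 0 ? -1 : 0;
}

static void wakeup_notifier_set(WakeupNotifier *n)
{
	eventfd_write(n->fd, 1);		/* post a wakeup */
}

static void wakeup_notifier_wait(WakeupNotifier *n)
{
	struct pollfd pfd = { .fd = n->fd, .events = POLLIN };
	eventfd_t value;

	poll(&pfd, 1, -1);			/* block until a wakeup is pending */
	eventfd_read(n->fd, &value);		/* drain the counter */
}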

Paolo
Avi Kivity Feb. 12, 2020, 10:54 a.m. UTC | #7
On 12/02/2020 12.47, Paolo Bonzini wrote:
> On 12/02/20 11:29, Stefan Hajnoczi wrote:
>> On Wed, Feb 12, 2020 at 09:31:32AM +0100, Paolo Bonzini wrote:
>>> On 29/01/20 18:20, Stefan Hajnoczi wrote:
>>>> +	/* Semaphore semantics don't make sense when autoreset is enabled */
>>>> +	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
>>>> +		return -EINVAL;
>>>> +
>>> I think they do, you just want to subtract 1 instead of setting the
>>> count to 0.  This way, writing 1 would be the post operation on the
>>> semaphore, while poll() would be the wait operation.
>> True!  Then EFD_AUTORESET is not a fitting name.  EFD_AUTOREAD or
>> EFD_POLL_READS?
> Avi's suggestion also makes sense.  Switching the event loop from poll()
> to IORING_OP_POLL_ADD would be good on its own, and then you could make
> it use IORING_OP_READV for eventfds.
>
> In QEMU parlance, perhaps you need a different abstraction than
> EventNotifier (let's call it WakeupNotifier) which would also use
> eventfd but it would provide a smaller API.  Thanks to the smaller API,
> it would not need EFD_NONBLOCK, unlike the regular EventNotifier, and it
> could either set up a poll() handler calling read(), or use
> IORING_OP_READV when io_uring is in use.
>

Just to be clear, for best performance don't use IORING_OP_POLL_ADD, 
just IORING_OP_READ. That's what you say in the second paragraph but the 
first can be misleading.
Stefan Hajnoczi Feb. 19, 2020, 10:37 a.m. UTC | #8
On Wed, Feb 12, 2020 at 12:54:30PM +0200, Avi Kivity wrote:
> 
> On 12/02/2020 12.47, Paolo Bonzini wrote:
> > On 12/02/20 11:29, Stefan Hajnoczi wrote:
> > > On Wed, Feb 12, 2020 at 09:31:32AM +0100, Paolo Bonzini wrote:
> > > > On 29/01/20 18:20, Stefan Hajnoczi wrote:
> > > > > +	/* Semaphore semantics don't make sense when autoreset is enabled */
> > > > > +	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
> > > > > +		return -EINVAL;
> > > > > +
> > > > I think they do, you just want to subtract 1 instead of setting the
> > > > count to 0.  This way, writing 1 would be the post operation on the
> > > > semaphore, while poll() would be the wait operation.
> > > True!  Then EFD_AUTORESET is not a fitting name.  EFD_AUTOREAD or
> > > EFD_POLL_READS?
> > Avi's suggestion also makes sense.  Switching the event loop from poll()
> > to IORING_OP_POLL_ADD would be good on its own, and then you could make
> > it use IORING_OP_READV for eventfds.
> > 
> > In QEMU parlance, perhaps you need a different abstraction than
> > EventNotifier (let's call it WakeupNotifier) which would also use
> > eventfd but it would provide a smaller API.  Thanks to the smaller API,
> > it would not need EFD_NONBLOCK, unlike the regular EventNotifier, and it
> > could either set up a poll() handler calling read(), or use
> > IORING_OP_READV when io_uring is in use.
> > 
> 
> Just to be clear, for best performance don't use IORING_OP_POLL_ADD, just
> IORING_OP_READ. That's what you say in the second paragraph but the first
> can be misleading.

Thanks, that's a nice idea!  I already have experimental io_uring fd
monitoring code written for QEMU and will extend it to use IORING_OP_READ.
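
Roughly, with liburing the idea would be to keep an IORING_OP_READ armed
on the eventfd and re-arm it after each completion, e.g. (untested
sketch, not the actual QEMU code):

#include <liburing.h>
#include <stdint.h>
#include <sys/eventfd.h>

static void arm_eventfd_read(struct io_uring *ring, int efd, uint64_t *buf)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_read(sqe, efd, buf, sizeof(*buf), 0);
	io_uring_submit(ring);
}

int main(void)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	uint64_t value;
	int efd = eventfd(0, EFD_CLOEXEC);

	io_uring_queue_init(8, &ring, 0);
	arm_eventfd_read(&ring, efd, &value);		/* arm the first read */

	for (;;) {
		io_uring_wait_cqe(&ring, &cqe);		/* fires when the eventfd is written */
		io_uring_cqe_seen(&ring, cqe);
		/* 'value' now holds the counter; handle the wakeup here... */
		arm_eventfd_read(&ring, efd, &value);	/* re-arm for the next wakeup */
	}
}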

Stefan
Avi Kivity Feb. 19, 2020, 10:43 a.m. UTC | #9
On 19/02/2020 12.37, Stefan Hajnoczi wrote:
> On Wed, Feb 12, 2020 at 12:54:30PM +0200, Avi Kivity wrote:
>> On 12/02/2020 12.47, Paolo Bonzini wrote:
>>> On 12/02/20 11:29, Stefan Hajnoczi wrote:
>>>> On Wed, Feb 12, 2020 at 09:31:32AM +0100, Paolo Bonzini wrote:
>>>>> On 29/01/20 18:20, Stefan Hajnoczi wrote:
>>>>>> +	/* Semaphore semantics don't make sense when autoreset is enabled */
>>>>>> +	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>> I think they do, you just want to subtract 1 instead of setting the
>>>>> count to 0.  This way, writing 1 would be the post operation on the
>>>>> semaphore, while poll() would be the wait operation.
>>>> True!  Then EFD_AUTORESET is not a fitting name.  EFD_AUTOREAD or
>>>> EFD_POLL_READS?
>>> Avi's suggestion also makes sense.  Switching the event loop from poll()
>>> to IORING_OP_POLL_ADD would be good on its own, and then you could make
>>> it use IORING_OP_READV for eventfds.
>>>
>>> In QEMU parlance, perhaps you need a different abstraction than
>>> EventNotifier (let's call it WakeupNotifier) which would also use
>>> eventfd but it would provide a smaller API.  Thanks to the smaller API,
>>> it would not need EFD_NONBLOCK, unlike the regular EventNotifier, and it
>>> could either set up a poll() handler calling read(), or use
>>> IORING_OP_READV when io_uring is in use.
>>>
>> Just to be clear, for best performance don't use IORING_OP_POLL_ADD, just
>> IORING_OP_READ. That's what you say in the second paragraph but the first
>> can be misleading.


Actually, it turns out that the current io_uring OP_READ implementation
throws the work into a workqueue.  Jens is fixing that now.


> Thanks, that's a nice idea!  I already have experimental io_uring fd
> monitoring code written for QEMU and will extend it to use IORING_OP_READ.


Note that linux-aio can do IOCB_CMD_POLL, starting with Linux 4.19.
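
For reference, a raw-syscall sketch of that approach (one-shot
IOCB_CMD_POLL on the eventfd, re-submitted after each completion;
illustrative only):

#include <linux/aio_abi.h>
#include <poll.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	int efd = eventfd(0, EFD_CLOEXEC);

	syscall(SYS_io_setup, 128, &ctx);

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = efd;
	cb.aio_lio_opcode = IOCB_CMD_POLL;
	cb.aio_buf = POLLIN;				/* events to poll for */

	syscall(SYS_io_submit, ctx, 1, cbs);		/* arm the one-shot poll */
	/* ... eventfd_write(efd, 1) elsewhere completes it ... */
	syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL);
	/* ev.res is the poll mask; read(2) the eventfd, then re-submit */

	syscall(SYS_io_destroy, ctx);
	return 0;
}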
Paolo Bonzini Feb. 19, 2020, 11:10 a.m. UTC | #10
On 19/02/20 11:43, Avi Kivity wrote:
> 
>> Thanks, that's a nice idea!  I already have experimental io_uring fd
>> monitoring code written for QEMU and will extend it to use
>> IORING_OP_READ.
> 
> Note linux-aio can do IOCB_CMD_POLL, starting with 4.19.

That was on the todo list, but io_uring came first. :)

Paolo

Patch

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8aa0ea8c55e8..208f6b9e2234 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -116,45 +116,62 @@  static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &ctx->wqh, wait);
 
-	/*
-	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
-	 * can be done outside ctx->wqh.lock because we know that poll_wait
-	 * takes that lock (through add_wait_queue) if our caller will sleep.
-	 *
-	 * The read _can_ therefore seep into add_wait_queue's critical
-	 * section, but cannot move above it!  add_wait_queue's spin_lock acts
-	 * as an acquire barrier and ensures that the read be ordered properly
-	 * against the writes.  The following CAN happen and is safe:
-	 *
-	 *     poll                               write
-	 *     -----------------                  ------------
-	 *     lock ctx->wqh.lock (in poll_wait)
-	 *     count = ctx->count
-	 *     __add_wait_queue
-	 *     unlock ctx->wqh.lock
-	 *                                        lock ctx->qwh.lock
-	 *                                        ctx->count += n
-	 *                                        if (waitqueue_active)
-	 *                                          wake_up_locked_poll
-	 *                                        unlock ctx->qwh.lock
-	 *     eventfd_poll returns 0
-	 *
-	 * but the following, which would miss a wakeup, cannot happen:
-	 *
-	 *     poll                               write
-	 *     -----------------                  ------------
-	 *     count = ctx->count (INVALID!)
-	 *                                        lock ctx->qwh.lock
-	 *                                        ctx->count += n
-	 *                                        **waitqueue_active is false**
-	 *                                        **no wake_up_locked_poll!**
-	 *                                        unlock ctx->qwh.lock
-	 *     lock ctx->wqh.lock (in poll_wait)
-	 *     __add_wait_queue
-	 *     unlock ctx->wqh.lock
-	 *     eventfd_poll returns 0
-	 */
-	count = READ_ONCE(ctx->count);
+	if (ctx->flags & EFD_AUTORESET) {
+		unsigned long flags;
+		__poll_t requested = poll_requested_events(wait);
+
+		spin_lock_irqsave(&ctx->wqh.lock, flags);
+		count = ctx->count;
+
+		/* Reset counter if caller is polling for read */
+		if (count != 0 && (requested & EPOLLIN)) {
+			ctx->count = 0;
+			events |= EPOLLOUT;
+			/* TODO is an EPOLLOUT wakeup necessary here? */
+		}
+
+		spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+	} else {
+		/*
+		 * All writes to ctx->count occur within ctx->wqh.lock.  This read
+		 * can be done outside ctx->wqh.lock because we know that poll_wait
+		 * takes that lock (through add_wait_queue) if our caller will sleep.
+		 *
+		 * The read _can_ therefore seep into add_wait_queue's critical
+		 * section, but cannot move above it!  add_wait_queue's spin_lock acts
+		 * as an acquire barrier and ensures that the read be ordered properly
+		 * against the writes.  The following CAN happen and is safe:
+		 *
+		 *     poll                               write
+		 *     -----------------                  ------------
+		 *     lock ctx->wqh.lock (in poll_wait)
+		 *     count = ctx->count
+		 *     __add_wait_queue
+		 *     unlock ctx->wqh.lock
+		 *                                        lock ctx->qwh.lock
+		 *                                        ctx->count += n
+		 *                                        if (waitqueue_active)
+		 *                                          wake_up_locked_poll
+		 *                                        unlock ctx->qwh.lock
+		 *     eventfd_poll returns 0
+		 *
+		 * but the following, which would miss a wakeup, cannot happen:
+		 *
+		 *     poll                               write
+		 *     -----------------                  ------------
+		 *     count = ctx->count (INVALID!)
+		 *                                        lock ctx->qwh.lock
+		 *                                        ctx->count += n
+		 *                                        **waitqueue_active is false**
+		 *                                        **no wake_up_locked_poll!**
+		 *                                        unlock ctx->qwh.lock
+		 *     lock ctx->wqh.lock (in poll_wait)
+		 *     __add_wait_queue
+		 *     unlock ctx->wqh.lock
+		 *     eventfd_poll returns 0
+		 */
+		count = READ_ONCE(ctx->count);
+	}
 
 	if (count > 0)
 		events |= EPOLLIN;
@@ -400,6 +417,10 @@  static int do_eventfd(unsigned int count, int flags)
 	if (flags & ~EFD_FLAGS_SET)
 		return -EINVAL;
 
+	/* Semaphore semantics don't make sense when autoreset is enabled */
+	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
+		return -EINVAL;
+
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index ffcc7724ca21..27577fafc553 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -21,11 +21,12 @@ 
  * shared O_* flags.
  */
 #define EFD_SEMAPHORE (1 << 0)
+#define EFD_AUTORESET (1 << 6) /* aliases O_CREAT */
 #define EFD_CLOEXEC O_CLOEXEC
 #define EFD_NONBLOCK O_NONBLOCK
 
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
-#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
+#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_AUTORESET)
 
 struct eventfd_ctx;
 struct file;