diff mbox

[PATCH-RFC,2/2] eventfd: EFD_STATE flag

Message ID 20090728175538.GC21549@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michael S. Tsirkin July 28, 2009, 5:55 p.m. UTC
This implements a new EFD_STATE flag for eventfd.
When set, this flag changes eventfd behaviour in the following way:
- write simply stores the value written, and is always non-blocking
- read unblocks when the value written changes, and
  returns the value written

Motivation: we'd like to use eventfd in qemu to pass interrupts from
(emulated or assigned) devices to guest. For level interrupts, the
counter supported currently by eventfd is not a good match: we really
need to set interrupt to a level, typically 0 or 1, and give the guest
ability to see the last value written.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 fs/eventfd.c            |   41 ++++++++++++++++++++++++++++++++++-------
 include/linux/eventfd.h |    3 ++-
 2 files changed, 36 insertions(+), 8 deletions(-)

Comments

Avi Kivity Aug. 3, 2009, 3:09 p.m. UTC | #1
On 07/28/2009 08:55 PM, Michael S. Tsirkin wrote:
> This implements a new EFD_STATE flag for eventfd.
> When set, this flag changes eventfd behaviour in the following way:
> - write simply stores the value written, and is always non-blocking
> - read unblocks when the value written changes, and
>    returns the value written
>
> Motivation: we'd like to use eventfd in qemu to pass interrupts from
> (emulated or assigned) devices to guest. For level interrupts, the
> counter supported currently by eventfd is not a good match: we really
> need to set interrupt to a level, typically 0 or 1, and give the guest
> ability to see the last value written.
>
>
> @@ -31,37 +31,59 @@ struct eventfd_ctx {
>   	 * issue a wakeup.
>   	 */
>   	__u64 count;
> +	/*
> +	 * When EF_STATE flag is set, eventfd behaves differently:
> +	 * value written gets stored in "count", read will copy
> +	 * "count" to "state".
> +	 */
> +	__u64 state;
>   	unsigned int flags;
>   };
>    

Why not write the new value into ->count directly?
Michael S. Tsirkin Aug. 3, 2009, 3:14 p.m. UTC | #2
On Mon, Aug 03, 2009 at 06:09:38PM +0300, Avi Kivity wrote:
> On 07/28/2009 08:55 PM, Michael S. Tsirkin wrote:
>> This implements a new EFD_STATE flag for eventfd.
>> When set, this flag changes eventfd behaviour in the following way:
>> - write simply stores the value written, and is always non-blocking
>> - read unblocks when the value written changes, and
>>    returns the value written
>>
>> Motivation: we'd like to use eventfd in qemu to pass interrupts from
>> (emulated or assigned) devices to guest. For level interrupts, the
>> counter supported currently by eventfd is not a good match: we really
>> need to set interrupt to a level, typically 0 or 1, and give the guest
>> ability to see the last value written.
>>
>>
>> @@ -31,37 +31,59 @@ struct eventfd_ctx {
>>   	 * issue a wakeup.
>>   	 */
>>   	__u64 count;
>> +	/*
>> +	 * When EF_STATE flag is set, eventfd behaves differently:
>> +	 * value written gets stored in "count", read will copy
>> +	 * "count" to "state".
>> +	 */
>> +	__u64 state;
>>   	unsigned int flags;
>>   };
>>    
>
> Why not write the new value into ->count directly?

That's what it says. state is ther to detect that value was changed
after last read. Makes sense?

> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 3, 2009, 3:29 p.m. UTC | #3
On 08/03/2009 06:14 PM, Michael S. Tsirkin wrote:
> On Mon, Aug 03, 2009 at 06:09:38PM +0300, Avi Kivity wrote:
>    
>> On 07/28/2009 08:55 PM, Michael S. Tsirkin wrote:
>>      
>>> This implements a new EFD_STATE flag for eventfd.
>>> When set, this flag changes eventfd behaviour in the following way:
>>> - write simply stores the value written, and is always non-blocking
>>> - read unblocks when the value written changes, and
>>>     returns the value written
>>>
>>> Motivation: we'd like to use eventfd in qemu to pass interrupts from
>>> (emulated or assigned) devices to guest. For level interrupts, the
>>> counter supported currently by eventfd is not a good match: we really
>>> need to set interrupt to a level, typically 0 or 1, and give the guest
>>> ability to see the last value written.
>>>
>>>
>>> @@ -31,37 +31,59 @@ struct eventfd_ctx {
>>>    	 * issue a wakeup.
>>>    	 */
>>>    	__u64 count;
>>> +	/*
>>> +	 * When EF_STATE flag is set, eventfd behaves differently:
>>> +	 * value written gets stored in "count", read will copy
>>> +	 * "count" to "state".
>>> +	 */
>>> +	__u64 state;
>>>    	unsigned int flags;
>>>    };
>>>
>>>        
>> Why not write the new value into ->count directly?
>>      
>
> That's what it says. state is ther to detect that value was changed
> after last read. Makes sense?
>    

Why not do it at the point of the write?

     if (value != ctx->count) {
         ctx->count = value;
         wake_things_up();
     }
Michael S. Tsirkin Aug. 3, 2009, 4:57 p.m. UTC | #4
On Mon, Aug 03, 2009 at 06:29:36PM +0300, Avi Kivity wrote:
> On 08/03/2009 06:14 PM, Michael S. Tsirkin wrote:
>> On Mon, Aug 03, 2009 at 06:09:38PM +0300, Avi Kivity wrote:
>>    
>>> On 07/28/2009 08:55 PM, Michael S. Tsirkin wrote:
>>>      
>>>> This implements a new EFD_STATE flag for eventfd.
>>>> When set, this flag changes eventfd behaviour in the following way:
>>>> - write simply stores the value written, and is always non-blocking
>>>> - read unblocks when the value written changes, and
>>>>     returns the value written
>>>>
>>>> Motivation: we'd like to use eventfd in qemu to pass interrupts from
>>>> (emulated or assigned) devices to guest. For level interrupts, the
>>>> counter supported currently by eventfd is not a good match: we really
>>>> need to set interrupt to a level, typically 0 or 1, and give the guest
>>>> ability to see the last value written.
>>>>
>>>>
>>>> @@ -31,37 +31,59 @@ struct eventfd_ctx {
>>>>    	 * issue a wakeup.
>>>>    	 */
>>>>    	__u64 count;
>>>> +	/*
>>>> +	 * When EF_STATE flag is set, eventfd behaves differently:
>>>> +	 * value written gets stored in "count", read will copy
>>>> +	 * "count" to "state".
>>>> +	 */
>>>> +	__u64 state;
>>>>    	unsigned int flags;
>>>>    };
>>>>
>>>>        
>>> Why not write the new value into ->count directly?
>>>      
>>
>> That's what it says. state is ther to detect that value was changed
>> after last read. Makes sense?
>>    
>
> Why not do it at the point of the write?
>
>     if (value != ctx->count) {
>         ctx->count = value;
>         wake_things_up();
>     }

What if write comes before read?

> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 4, 2009, 8:53 a.m. UTC | #5
On 08/03/2009 07:57 PM, Michael S. Tsirkin wrote:
>> Why not do it at the point of the write?
>>
>>      if (value != ctx->count) {
>>          ctx->count = value;
>>          wake_things_up();
>>      }
>>      
>
> What if write comes before read?
>    

The read will get the new value.
Michael S. Tsirkin Aug. 4, 2009, 8:54 a.m. UTC | #6
On Tue, Aug 04, 2009 at 11:53:03AM +0300, Avi Kivity wrote:
> On 08/03/2009 07:57 PM, Michael S. Tsirkin wrote:
>>> Why not do it at the point of the write?
>>>
>>>      if (value != ctx->count) {
>>>          ctx->count = value;
>>>          wake_things_up();
>>>      }
>>>      
>>
>> What if write comes before read?
>>    
>
> The read will get the new value.

Yes :) But how does read know it should not block?

> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 4, 2009, 9:17 a.m. UTC | #7
On 08/04/2009 11:54 AM, Michael S. Tsirkin wrote:
> On Tue, Aug 04, 2009 at 11:53:03AM +0300, Avi Kivity wrote:
>    
>> On 08/03/2009 07:57 PM, Michael S. Tsirkin wrote:
>>      
>>>> Why not do it at the point of the write?
>>>>
>>>>       if (value != ctx->count) {
>>>>           ctx->count = value;
>>>>           wake_things_up();
>>>>       }
>>>>
>>>>          
>>> What if write comes before read?
>>>
>>>        
>> The read will get the new value.
>>      
>
> Yes :) But how does read know it should not block?
>    

If a different read comes after the write but after our read, it will 
have transferred the value, resulting in the same situation.

I think reads should never block with a state based mechanism.
Gleb Natapov Aug. 4, 2009, 9:17 a.m. UTC | #8
On Tue, Aug 04, 2009 at 12:17:44PM +0300, Avi Kivity wrote:
> On 08/04/2009 11:54 AM, Michael S. Tsirkin wrote:
>> On Tue, Aug 04, 2009 at 11:53:03AM +0300, Avi Kivity wrote:
>>    
>>> On 08/03/2009 07:57 PM, Michael S. Tsirkin wrote:
>>>      
>>>>> Why not do it at the point of the write?
>>>>>
>>>>>       if (value != ctx->count) {
>>>>>           ctx->count = value;
>>>>>           wake_things_up();
>>>>>       }
>>>>>
>>>>>          
>>>> What if write comes before read?
>>>>
>>>>        
>>> The read will get the new value.
>>>      
>>
>> Yes :) But how does read know it should not block?
>>    
>
> If a different read comes after the write but after our read, it will  
> have transferred the value, resulting in the same situation.
>
> I think reads should never block with a state based mechanism.
>
Reader may want to poll for the status change.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Aug. 4, 2009, 9:23 a.m. UTC | #9
On Tue, Aug 04, 2009 at 12:25:49PM +0300, Avi Kivity wrote:
> On 08/04/2009 12:17 PM, Gleb Natapov wrote:
>>> If a different read comes after the write but after our read, it will
>>> have transferred the value, resulting in the same situation.
>>>
>>> I think reads should never block with a state based mechanism.
>>>
>>>      
>> Reader may want to poll for the status change.
>>    
>
> Without epoll(), it's inherently racy since reads from other processes  
> can clear the status.
>
This is correct for any file descriptor. Multiple readers shouldn't
simultaneously read from the same files descriptor if they expect to
make any sense from a result.

> The "last read value" needs to be maintained for each reader, which is  
> not possible with read().
>
Only one reader scenario is interesting. This is not some multiplexing
device.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 4, 2009, 9:25 a.m. UTC | #10
On 08/04/2009 12:17 PM, Gleb Natapov wrote:
>> If a different read comes after the write but after our read, it will
>> have transferred the value, resulting in the same situation.
>>
>> I think reads should never block with a state based mechanism.
>>
>>      
> Reader may want to poll for the status change.
>    

Without epoll(), it's inherently racy since reads from other processes 
can clear the status.

The "last read value" needs to be maintained for each reader, which is 
not possible with read().
Michael S. Tsirkin Aug. 4, 2009, 9:26 a.m. UTC | #11
On Tue, Aug 04, 2009 at 12:30:29PM +0300, Avi Kivity wrote:
> On 08/04/2009 12:23 PM, Gleb Natapov wrote:
>> On Tue, Aug 04, 2009 at 12:25:49PM +0300, Avi Kivity wrote:
>>    
>>> On 08/04/2009 12:17 PM, Gleb Natapov wrote:
>>>      
>>>>> If a different read comes after the write but after our read, it will
>>>>> have transferred the value, resulting in the same situation.
>>>>>
>>>>> I think reads should never block with a state based mechanism.
>>>>>
>>>>>
>>>>>          
>>>> Reader may want to poll for the status change.
>>>>
>>>>        
>>> Without epoll(), it's inherently racy since reads from other processes
>>> can clear the status.
>>>
>>>      
>> This is correct for any file descriptor. Multiple readers shouldn't
>> simultaneously read from the same files descriptor if they expect to
>> make any sense from a result.
>>    
>
> I think counting eventfd is an exception, but in general you are right.

How is it an exception? It seems that one reader get the counter,
others will block until the next write.

> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 4, 2009, 9:30 a.m. UTC | #12
On 08/04/2009 12:23 PM, Gleb Natapov wrote:
> On Tue, Aug 04, 2009 at 12:25:49PM +0300, Avi Kivity wrote:
>    
>> On 08/04/2009 12:17 PM, Gleb Natapov wrote:
>>      
>>>> If a different read comes after the write but after our read, it will
>>>> have transferred the value, resulting in the same situation.
>>>>
>>>> I think reads should never block with a state based mechanism.
>>>>
>>>>
>>>>          
>>> Reader may want to poll for the status change.
>>>
>>>        
>> Without epoll(), it's inherently racy since reads from other processes
>> can clear the status.
>>
>>      
> This is correct for any file descriptor. Multiple readers shouldn't
> simultaneously read from the same files descriptor if they expect to
> make any sense from a result.
>    

I think counting eventfd is an exception, but in general you are right.
Michael S. Tsirkin Aug. 4, 2009, 9:33 a.m. UTC | #13
On Tue, Aug 04, 2009 at 12:17:44PM +0300, Avi Kivity wrote:
> On 08/04/2009 11:54 AM, Michael S. Tsirkin wrote:
>> On Tue, Aug 04, 2009 at 11:53:03AM +0300, Avi Kivity wrote:
>>    
>>> On 08/03/2009 07:57 PM, Michael S. Tsirkin wrote:
>>>      
>>>>> Why not do it at the point of the write?
>>>>>
>>>>>       if (value != ctx->count) {
>>>>>           ctx->count = value;
>>>>>           wake_things_up();
>>>>>       }
>>>>>
>>>>>          
>>>> What if write comes before read?
>>>>
>>>>        
>>> The read will get the new value.
>>>      
>>
>> Yes :) But how does read know it should not block?
>>    
>
> If a different read comes after the write but after our read, it will  
> have transferred the value, resulting in the same situation.

Not the same: one reader wakes up, others sleep.

Multiple reads from the same fd behave this way for any file I can think
of. Consider regular eventfd, or a pipe, a socket ...  But if we want to
support blocking reads, we probably should not require the readers to
sync with writers.


> I think reads should never block with a state based mechanism.

Yes, with no support for blocking reads, we don't need the state.
In that case, we probably want to error on open out unless O_NONBLOCK is
specified.  But why is this a good idea?


> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 4, 2009, 10:06 a.m. UTC | #14
On 08/04/2009 12:26 PM, Michael S. Tsirkin wrote:
> How is it an exception? It seems that one reader get the counter,
> others will block until the next write.
>    

Right.  I retract my comments....
diff mbox

Patch

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 347a0e0..7b279e3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -31,37 +31,59 @@  struct eventfd_ctx {
 	 * issue a wakeup.
 	 */
 	__u64 count;
+	/*
+	 * When EF_STATE flag is set, eventfd behaves differently:
+	 * value written gets stored in "count", read will copy
+	 * "count" to "state".
+	 */
+	__u64 state;
 	unsigned int flags;
 };
 
 
 static inline int eventfd_readable(struct eventfd_ctx *ctx)
 {
-	return ctx->count > 0;
+	if (ctx->flags & EFD_STATE)
+		return ctx->state != ctx->count;
+	else
+		return ctx->count > 0;
 }
 
 static inline int eventfd_writeable(struct eventfd_ctx *ctx, u64 n)
 {
-	return ULLONG_MAX - n > ctx->count;
+	if (ctx->flags & EFD_STATE)
+		return 1;
+	else
+		return ULLONG_MAX - n > ctx->count;
 }
 
 static inline int eventfd_overflow(struct eventfd_ctx *ctx, u64 cnt)
 {
-	return cnt == ULLONG_MAX;
+	if (ctx->flags & EFD_STATE)
+		return 0;
+	else
+		return cnt == ULLONG_MAX;
 }
 
 static inline void eventfd_dowrite(struct eventfd_ctx *ctx, u64 ucnt)
 {
-	if (eventfd_writeable(ctx, ucnt))
-		ucnt = ULLONG_MAX - ctx->count;
+	if (ctx->flags & EFD_STATE)
+		ctx->count = ucnt;
+	else {
+		if (ULLONG_MAX - ctx->count < ucnt)
+			ucnt = ULLONG_MAX - ctx->count;
 
-	ctx->count += ucnt;
+		ctx->count += ucnt;
+	}
 }
 
 static inline u64 eventfd_doread(struct eventfd_ctx *ctx)
 {
 	u64 ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
-	ctx->count -= ucnt;
+	if (ctx->flags & EFD_STATE)
+		ctx->state = ucnt;
+	else
+		ctx->count -= ucnt;
 	return ucnt;
 }
 
@@ -337,6 +359,10 @@  SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 
 	if (flags & ~EFD_FLAGS_SET)
 		return -EINVAL;
+	/* State together with semaphore does not make sense. */
+	if ((flags & EFD_STATE) && (flags & EFD_SEMAPHORE))
+		return -EINVAL;
+
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
@@ -344,6 +370,7 @@  SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 
 	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
+	ctx->state = count;
 	ctx->count = count;
 	ctx->flags = flags;
 
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 3b85ba6..78ff649 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -19,11 +19,12 @@ 
  * shared O_* flags.
  */
 #define EFD_SEMAPHORE (1 << 0)
+#define EFD_STATE (1 << 1)
 #define EFD_CLOEXEC O_CLOEXEC
 #define EFD_NONBLOCK O_NONBLOCK
 
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
-#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
+#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_STATE)
 
 #ifdef CONFIG_EVENTFD