
[v3,06/13] epoll: introduce helpers for adding/removing events to uring

Message ID 20190516085810.31077-7-rpenyaev@suse.de (mailing list archive)
State New, archived
Headers show
Series epoll: support pollable epoll from userspace | expand

Commit Message

Roman Penyaev May 16, 2019, 8:58 a.m. UTC
Both the add and remove operations are lockless and can be called in
parallel.

ep_add_event_to_uring():
	o user item is marked atomically as ready
	o if on the previous step the user item was observed as not ready,
	  then a new entry is created for the index uring.

ep_remove_user_item():
	o user item is marked as EPOLLREMOVED only if it was ready,
	  thus userspace will observe the previously added entry in the
	  index uring and the correct "removed" state of the item.

Signed-off-by: Roman Penyaev <rpenyaev@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org

Comments

Peter Zijlstra May 31, 2019, 9:55 a.m. UTC | #1
On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
> +#define atomic_set_unless_zero(ptr, flags)			\
> +({								\
> +	typeof(ptr) _ptr = (ptr);				\
> +	typeof(flags) _flags = (flags);				\
> +	typeof(*_ptr) _old, _val = READ_ONCE(*_ptr);		\
> +								\
> +	for (;;) {						\
> +		if (!_val)					\
> +			break;					\
> +		_old = cmpxchg(_ptr, _val, _flags);		\
> +		if (_old == _val)				\
> +			break;					\
> +		_val = _old;					\
> +	}							\
> +	_val;							\
> +})

> +#define atomic_or_with_mask(ptr, flags, mask)			\
> +({								\
> +	typeof(ptr) _ptr = (ptr);				\
> +	typeof(flags) _flags = (flags);				\
> +	typeof(flags) _mask = (mask);				\
> +	typeof(*_ptr) _old, _new, _val = READ_ONCE(*_ptr);	\
> +								\
> +	for (;;) {						\
> +		_new = (_val & ~_mask) | _flags;		\
> +		_old = cmpxchg(_ptr, _val, _new);		\
> +		if (_old == _val)				\
> +			break;					\
> +		_val = _old;					\
> +	}							\
> +	_val;							\
> +})

Don't call them atomic_*() if they're not part of the atomic_t
interface.
Peter Zijlstra May 31, 2019, 9:56 a.m. UTC | #2
On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
> +static inline bool ep_add_event_to_uring(struct epitem *epi, __poll_t pollflags)
> +{
> +	struct eventpoll *ep = epi->ep;
> +	struct epoll_uitem *uitem;
> +	bool added = false;
> +
> +	if (WARN_ON(!pollflags))
> +		return false;
> +
> +	uitem = &ep->user_header->items[epi->bit];
> +	/*
> +	 * Can be represented as:
> +	 *
> +	 *    was_ready = uitem->ready_events;
> +	 *    uitem->ready_events &= ~EPOLLREMOVED;
> +	 *    uitem->ready_events |= pollflags;
> +	 *    if (!was_ready) {
> +	 *         // create index entry
> +	 *    }
> +	 *
> +	 * See the big comment inside ep_remove_user_item(), why it is
> +	 * important to mask EPOLLREMOVED.
> +	 */
> +	if (!atomic_or_with_mask(&uitem->ready_events,
> +				 pollflags, EPOLLREMOVED)) {
> +		unsigned int i, *item_idx, index_mask;
> +
> +		/*
> +		 * Item was not ready before, thus we have to insert
> +		 * new index to the ring.
> +		 */
> +
> +		index_mask = ep_max_index_nr(ep) - 1;
> +		i = __atomic_fetch_add(&ep->user_header->tail, 1,
> +				       __ATOMIC_ACQUIRE);

afaict __atomic_fetch_add() does not exist.

> +		item_idx = &ep->user_index[i & index_mask];
> +
> +		/* Signal with a bit, which is > 0 */
> +		*item_idx = epi->bit + 1;

Did you just increment the user visible tail pointer before you filled
the data? That is, can the concurrent userspace observe the increment
before you put credible data in its place?

> +
> +		/*
> +		 * Want index update be flushed from CPU write buffer and
> +		 * immediately visible on userspace side to avoid long busy
> +		 * loops.
> +		 */
> +		smp_wmb();

That's still complete nonsense.

> +
> +		added = true;
> +	}
> +
> +	return added;
> +}
Roman Penyaev May 31, 2019, 11:15 a.m. UTC | #3
On 2019-05-31 11:56, Peter Zijlstra wrote:
> On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
>> +static inline bool ep_add_event_to_uring(struct epitem *epi, __poll_t 
>> pollflags)
>> +{
>> +	struct eventpoll *ep = epi->ep;
>> +	struct epoll_uitem *uitem;
>> +	bool added = false;
>> +
>> +	if (WARN_ON(!pollflags))
>> +		return false;
>> +
>> +	uitem = &ep->user_header->items[epi->bit];
>> +	/*
>> +	 * Can be represented as:
>> +	 *
>> +	 *    was_ready = uitem->ready_events;
>> +	 *    uitem->ready_events &= ~EPOLLREMOVED;
>> +	 *    uitem->ready_events |= pollflags;
>> +	 *    if (!was_ready) {
>> +	 *         // create index entry
>> +	 *    }
>> +	 *
>> +	 * See the big comment inside ep_remove_user_item(), why it is
>> +	 * important to mask EPOLLREMOVED.
>> +	 */
>> +	if (!atomic_or_with_mask(&uitem->ready_events,
>> +				 pollflags, EPOLLREMOVED)) {
>> +		unsigned int i, *item_idx, index_mask;
>> +
>> +		/*
>> +		 * Item was not ready before, thus we have to insert
>> +		 * new index to the ring.
>> +		 */
>> +
>> +		index_mask = ep_max_index_nr(ep) - 1;
>> +		i = __atomic_fetch_add(&ep->user_header->tail, 1,
>> +				       __ATOMIC_ACQUIRE);
> 
> afaict __atomic_fetch_add() does not exist.

That is a gcc extension.  I did not find any API to just increment a
variable atomically without using/casting to atomic_t.  What is the
proper way to achieve that?

> 
>> +		item_idx = &ep->user_index[i & index_mask];
>> +
>> +		/* Signal with a bit, which is > 0 */
>> +		*item_idx = epi->bit + 1;
> 
> Did you just increment the user visible tail pointer before you filled
> the data? That is, can the concurrent userspace observe the increment
> before you put credible data in its place?

No, the "data" is the "ready_events" mask, which was updated before,
using cmpxchg, atomic_or_with_mask() call.  All I need is to put an
index of just updated item to the uring.

Userspace, in its turn, gets the index from the ring and then checks
the mask.
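
Roughly, the userspace side could look like the sketch below (names are
illustrative, not the exact layout of this series; `ready_events` here
is a flat array of per-item masks, `head` is the consumer-private
position and `nr` is assumed to be a power of two).  The busy-wait on a
zero entry is what makes it tolerable that the tail is published before
the index slot is filled:

static inline int consume_one(unsigned *tail_ptr, unsigned *index,
			      unsigned nr, unsigned *ready_events,
			      unsigned *head)
{
	unsigned tail, idx, events;

	tail = __atomic_load_n(tail_ptr, __ATOMIC_ACQUIRE);
	if (*head == tail)
		return -1;		/* nothing new */

	/* Entry is bit + 1, so zero means the producer is not done yet */
	while (!(idx = __atomic_load_n(&index[*head & (nr - 1)],
				       __ATOMIC_ACQUIRE)))
		;
	(*head)++;

	/*
	 * Consume the event: fetch and clear the ready mask.  (Slot
	 * reuse after the ring wraps is ignored in this sketch.)
	 */
	events = __atomic_exchange_n(&ready_events[idx - 1], 0,
				     __ATOMIC_ACQ_REL);
	return events;			/* may contain EPOLLREMOVED */
}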

> 
>> +
>> +		/*
>> +		 * Want index update be flushed from CPU write buffer and
>> +		 * immediately visible on userspace side to avoid long busy
>> +		 * loops.
>> +		 */
>> +		smp_wmb();
> 
> That's still complete nonsense.

Yes, true.  My confusion came from a simple test, where one thread
swaps pointers in a loop and another thread dereferences the pointer
and increments a variable:

     /* shared between both threads */
     unsigned vvv1 = 0, vvv2 = 0;
     unsigned *thr_level2 = &vvv2;

     THR#0
     -----------

     unsigned *ptr = &vvv1;

     while (!stop) {
         unsigned *tmp = thr_level2;
         thr_level2 = ptr;
         barrier();               <<<< ????
         ptr = tmp;
     }

     THR#1
     -----------

     while (!stop) {
         unsigned *ptr = thr_level2;
         (*ptr)++;
     }


At the end I expect `vvv1` and `vvv2` to be incremented approximately
equally.  But without barrier() only one of the variables gets
incremented.

Now I see that barrier() should be defined as a simple compiler
barrier, i.e. asm volatile("" ::: "memory"), and there is nothing
related to the CPU write buffer, as I wrote in the comment.

So the smp_wmb() is indeed garbage and can be removed.  Thanks.

--
Roman
Roman Penyaev May 31, 2019, 11:24 a.m. UTC | #4
On 2019-05-31 11:55, Peter Zijlstra wrote:
> On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
>> +#define atomic_set_unless_zero(ptr, flags)			\
>> +({								\
>> +	typeof(ptr) _ptr = (ptr);				\
>> +	typeof(flags) _flags = (flags);				\
>> +	typeof(*_ptr) _old, _val = READ_ONCE(*_ptr);		\
>> +								\
>> +	for (;;) {						\
>> +		if (!_val)					\
>> +			break;					\
>> +		_old = cmpxchg(_ptr, _val, _flags);		\
>> +		if (_old == _val)				\
>> +			break;					\
>> +		_val = _old;					\
>> +	}							\
>> +	_val;							\
>> +})
> 
>> +#define atomic_or_with_mask(ptr, flags, mask)			\
>> +({								\
>> +	typeof(ptr) _ptr = (ptr);				\
>> +	typeof(flags) _flags = (flags);				\
>> +	typeof(flags) _mask = (mask);				\
>> +	typeof(*_ptr) _old, _new, _val = READ_ONCE(*_ptr);	\
>> +								\
>> +	for (;;) {						\
>> +		_new = (_val & ~_mask) | _flags;		\
>> +		_old = cmpxchg(_ptr, _val, _new);		\
>> +		if (_old == _val)				\
>> +			break;					\
>> +		_val = _old;					\
>> +	}							\
>> +	_val;							\
>> +})
> 
> Don't call them atomic_*() if they're not part of the atomic_t
> interface.

Can we add those two?  Or is it better to keep them local?

--
Roman
Peter Zijlstra May 31, 2019, 12:53 p.m. UTC | #5
On Fri, May 31, 2019 at 01:15:21PM +0200, Roman Penyaev wrote:
> On 2019-05-31 11:56, Peter Zijlstra wrote:
> > On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:

> > > +		i = __atomic_fetch_add(&ep->user_header->tail, 1,
> > > +				       __ATOMIC_ACQUIRE);
> > 
> > afaict __atomic_fetch_add() does not exist.
> 
> That is a gcc extension.  I did not find any API to just increment a
> variable atomically without using/casting to atomic_t.  What is the
> proper way to achieve that?

That's C11 atomics, and those shall not be used in the kernel. For one
they're not available in the minimally required GCC version (4.6).

The proper and only way is to use atomic_t, but also you cannot share
atomic_t with userspace.

The normal way of doing something like this is to have a kernel private
atomic_t and copy the value out to userspace using smp_store_release().
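
For a single producer that pattern is straightforward; a minimal sketch
with illustrative names (the concurrent-producer case is what the rest
of this thread is about):

struct uring {
	unsigned int	*user_tail;	/* tail field in the shared header */
	unsigned int	*user_index;	/* shared index ring */
	unsigned int	nr;		/* ring size, power of two */
	atomic_t	ktail;		/* kernel-private tail */
};

static void publish_one(struct uring *r, unsigned int bit)
{
	unsigned int i = atomic_read(&r->ktail);

	/* Fill the slot first ... */
	r->user_index[i & (r->nr - 1)] = bit + 1;
	atomic_set(&r->ktail, i + 1);

	/* ... then publish the new tail; the release orders the two */
	smp_store_release(r->user_tail, i + 1);
}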
Peter Zijlstra May 31, 2019, 12:56 p.m. UTC | #6
On Fri, May 31, 2019 at 01:15:21PM +0200, Roman Penyaev wrote:
> On 2019-05-31 11:56, Peter Zijlstra wrote:
> > On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
> > > +static inline bool ep_add_event_to_uring(struct epitem *epi,
> > > __poll_t pollflags)
> > > +{
> > > +	struct eventpoll *ep = epi->ep;
> > > +	struct epoll_uitem *uitem;
> > > +	bool added = false;
> > > +
> > > +	if (WARN_ON(!pollflags))
> > > +		return false;
> > > +
> > > +	uitem = &ep->user_header->items[epi->bit];
> > > +	/*
> > > +	 * Can be represented as:
> > > +	 *
> > > +	 *    was_ready = uitem->ready_events;
> > > +	 *    uitem->ready_events &= ~EPOLLREMOVED;
> > > +	 *    uitem->ready_events |= pollflags;
> > > +	 *    if (!was_ready) {
> > > +	 *         // create index entry
> > > +	 *    }
> > > +	 *
> > > +	 * See the big comment inside ep_remove_user_item(), why it is
> > > +	 * important to mask EPOLLREMOVED.
> > > +	 */
> > > +	if (!atomic_or_with_mask(&uitem->ready_events,
> > > +				 pollflags, EPOLLREMOVED)) {
> > > +		unsigned int i, *item_idx, index_mask;
> > > +
> > > +		/*
> > > +		 * Item was not ready before, thus we have to insert
> > > +		 * new index to the ring.
> > > +		 */
> > > +
> > > +		index_mask = ep_max_index_nr(ep) - 1;
> > > +		i = __atomic_fetch_add(&ep->user_header->tail, 1,
> > > +				       __ATOMIC_ACQUIRE);
> > > +		item_idx = &ep->user_index[i & index_mask];
> > > +
> > > +		/* Signal with a bit, which is > 0 */
> > > +		*item_idx = epi->bit + 1;
> > 
> > Did you just increment the user visible tail pointer before you filled
> > the data? That is, can the concurrent userspace observe the increment
> > before you put credible data in its place?
> 
> No, the "data" is the "ready_events" mask, which was updated before,
> using cmpxchg, atomic_or_with_mask() call.  All I need is to put an
> index of just updated item to the uring.
> 
> Userspace, in its turn, gets the index from the ring and then checks
> the mask.

But where do you write the index into the shared memory? That index
should be written before you publish the new tail.
Peter Zijlstra May 31, 2019, 1:11 p.m. UTC | #7
On Fri, May 31, 2019 at 01:24:07PM +0200, Roman Penyaev wrote:
> On 2019-05-31 11:55, Peter Zijlstra wrote:
> > On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
> > > +#define atomic_set_unless_zero(ptr, flags)			\
> > > +({								\
> > > +	typeof(ptr) _ptr = (ptr);				\
> > > +	typeof(flags) _flags = (flags);				\
> > > +	typeof(*_ptr) _old, _val = READ_ONCE(*_ptr);		\
> > > +								\
> > > +	for (;;) {						\
> > > +		if (!_val)					\
> > > +			break;					\
> > > +		_old = cmpxchg(_ptr, _val, _flags);		\
> > > +		if (_old == _val)				\
> > > +			break;					\
> > > +		_val = _old;					\
> > > +	}							\
> > > +	_val;							\
> > > +})
> > 
> > > +#define atomic_or_with_mask(ptr, flags, mask)			\
> > > +({								\
> > > +	typeof(ptr) _ptr = (ptr);				\
> > > +	typeof(flags) _flags = (flags);				\
> > > +	typeof(flags) _mask = (mask);				\
> > > +	typeof(*_ptr) _old, _new, _val = READ_ONCE(*_ptr);	\
> > > +								\
> > > +	for (;;) {						\
> > > +		_new = (_val & ~_mask) | _flags;		\
> > > +		_old = cmpxchg(_ptr, _val, _new);		\
> > > +		if (_old == _val)				\
> > > +			break;					\
> > > +		_val = _old;					\
> > > +	}							\
> > > +	_val;							\
> > > +})
> > 
> > Don't call them atomic_*() if they're not part of the atomic_t
> > interface.
> 
> Can we add those two?  Or is it better to keep them local?

Afaict you're using them on the user visible values; we should not put
atomic_t into shared memory.

Your interface isn't compatible with those 'funny' architectures like
parisc etc. Since you expect userspace to do atomic ops on these
variables too. It is not a one-way interface ...
Roman Penyaev May 31, 2019, 2:21 p.m. UTC | #8
On 2019-05-31 14:56, Peter Zijlstra wrote:
> On Fri, May 31, 2019 at 01:15:21PM +0200, Roman Penyaev wrote:
>> On 2019-05-31 11:56, Peter Zijlstra wrote:
>> > On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
>> > > +static inline bool ep_add_event_to_uring(struct epitem *epi,
>> > > __poll_t pollflags)
>> > > +{
>> > > +	struct eventpoll *ep = epi->ep;
>> > > +	struct epoll_uitem *uitem;
>> > > +	bool added = false;
>> > > +
>> > > +	if (WARN_ON(!pollflags))
>> > > +		return false;
>> > > +
>> > > +	uitem = &ep->user_header->items[epi->bit];
>> > > +	/*
>> > > +	 * Can be represented as:
>> > > +	 *
>> > > +	 *    was_ready = uitem->ready_events;
>> > > +	 *    uitem->ready_events &= ~EPOLLREMOVED;
>> > > +	 *    uitem->ready_events |= pollflags;
>> > > +	 *    if (!was_ready) {
>> > > +	 *         // create index entry
>> > > +	 *    }
>> > > +	 *
>> > > +	 * See the big comment inside ep_remove_user_item(), why it is
>> > > +	 * important to mask EPOLLREMOVED.
>> > > +	 */
>> > > +	if (!atomic_or_with_mask(&uitem->ready_events,
>> > > +				 pollflags, EPOLLREMOVED)) {
>> > > +		unsigned int i, *item_idx, index_mask;
>> > > +
>> > > +		/*
>> > > +		 * Item was not ready before, thus we have to insert
>> > > +		 * new index to the ring.
>> > > +		 */
>> > > +
>> > > +		index_mask = ep_max_index_nr(ep) - 1;
>> > > +		i = __atomic_fetch_add(&ep->user_header->tail, 1,
>> > > +				       __ATOMIC_ACQUIRE);
>> > > +		item_idx = &ep->user_index[i & index_mask];
>> > > +
>> > > +		/* Signal with a bit, which is > 0 */
>> > > +		*item_idx = epi->bit + 1;
>> >
>> > Did you just increment the user visible tail pointer before you filled
>> > the data? That is, can the concurrent userspace observe the increment
>> > before you put credible data in its place?
>> 
>> No, the "data" is the "ready_events" mask, which was updated before,
>> using cmpxchg, atomic_or_with_mask() call.  All I need is to put an
>> index of just updated item to the uring.
>> 
>> Userspace, in its turn, gets the index from the ring and then checks
>> the mask.
> 
> But where do you write the index into the shared memory? That index
> should be written before you publish the new tail.

The ep_add_event_to_uring() is lockless, thus I can't increase tail
after, I need to reserve the index slot, where to write to.  I can use
shadow tail, which is not seen by userspace, but I have to guarantee
that tail is updated with shadow tail *after* all callers of
ep_add_event_to_uring() are left.  That is possible, please see the
code below, but it adds more complexity:

(code was tested on user side, thus has c11 atomics)

static inline void add_event__kernel(struct ring *ring, unsigned bit)
{
	unsigned i, cntr, commit_cntr, *item_idx, tail, old;

	i = __atomic_fetch_add(&ring->cntr, 1, __ATOMIC_ACQUIRE);
	item_idx = &ring->user_itemsindex[i % ring->nr];

	/* Update data */
	*item_idx = bit;

	commit_cntr = __atomic_add_fetch(&ring->commit_cntr, 1,
					 __ATOMIC_RELEASE);

	tail = ring->user_header->tail;
	rmb();
	do {
		cntr = ring->cntr;
		if (cntr != commit_cntr)
			/* Someone else will advance tail */
			break;

		old = tail;

	} while ((tail = __sync_val_compare_and_swap(&ring->user_header->tail,
						      old, cntr)) != old);
}

Another way (current solution) is to spin on the userspace side in
order to get index > 0 (a valid index is always > 0), i.e.:

	item_idx_ptr = &index[idx & indices_mask];

	/*
	 * Spin here till we see valid index
	 */
	while (!(idx = __atomic_load_n(item_idx_ptr, __ATOMIC_ACQUIRE)))
		;



So of course the tail can be updated afterwards, like you mentioned,
but then I have to introduce locks.  I want to keep it lockless on the
hot event path.

--
Roman
Roman Penyaev May 31, 2019, 2:28 p.m. UTC | #9
On 2019-05-31 14:53, Peter Zijlstra wrote:
> On Fri, May 31, 2019 at 01:15:21PM +0200, Roman Penyaev wrote:
>> On 2019-05-31 11:56, Peter Zijlstra wrote:
>> > On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
> 
>> > > +		i = __atomic_fetch_add(&ep->user_header->tail, 1,
>> > > +				       __ATOMIC_ACQUIRE);
>> >
>> > afaict __atomic_fetch_add() does not exist.
>> 
>> That is a gcc extension.  I did not find any API to just increment a
>> variable atomically without using/casting to atomic_t.  What is the
>> proper way to achieve that?
> 
> That's C11 atomics, and those shall not be used in the kernel. For one
> they're not available in the minimally required GCC version (4.6).
> 
> The proper and only way is to use atomic_t, but also you cannot share
> atomic_t with userspace.

Yes, that is what I tried to avoid by choosing the c11 extension.

> 
> The normal way of doing something like this is to have a kernel private
> atomic_t and copy the value out to userspace using smp_store_release().

Since this path is lockless, unfortunately that won't work.  So it
seems the only way is to do one more cmpxchg (sigh) or give up and
take a lock (sad sigh).

--
Roman
Peter Zijlstra May 31, 2019, 4:51 p.m. UTC | #10
On Fri, May 31, 2019 at 04:21:30PM +0200, Roman Penyaev wrote:

> The ep_add_event_to_uring() is lockless, thus I can't increase tail after,
> I need to reserve the index slot, where to write to.  I can use shadow tail,
> which is not seen by userspace, but I have to guarantee that tail is updated
> with shadow tail *after* all callers of ep_add_event_to_uring() are left.
> That is possible, please see the code below, but it adds more complexity:
> 
> (code was tested on user side, thus has c11 atomics)
> 
> static inline void add_event__kernel(struct ring *ring, unsigned bit)
> {
>         unsigned i, cntr, commit_cntr, *item_idx, tail, old;
> 
>         i = __atomic_fetch_add(&ring->cntr, 1, __ATOMIC_ACQUIRE);
>         item_idx = &ring->user_itemsindex[i % ring->nr];
> 
>         /* Update data */
>         *item_idx = bit;
> 
>         commit_cntr = __atomic_add_fetch(&ring->commit_cntr, 1,
> __ATOMIC_RELEASE);
> 
>         tail = ring->user_header->tail;
>         rmb();
>         do {
>                 cntr = ring->cntr;
>                 if (cntr != commit_cntr)
>                         /* Someone else will advance tail */
>                         break;
> 
>                 old = tail;
> 
>         } while ((tail =
> __sync_val_compare_and_swap(&ring->user_header->tail, old, cntr)) != old);
> }

Yes, I'm well aware of that particular problem (see
kernel/events/ring_buffer.c:perf_output_put_handle for instance). But
like you show, it can be done. It also makes the thing wait-free, as
opposed to merely lockless.
Peter Zijlstra May 31, 2019, 4:53 p.m. UTC | #11
On Fri, May 31, 2019 at 04:28:36PM +0200, Roman Penyaev wrote:
> On 2019-05-31 14:53, Peter Zijlstra wrote:
> > On Fri, May 31, 2019 at 01:15:21PM +0200, Roman Penyaev wrote:
> > > On 2019-05-31 11:56, Peter Zijlstra wrote:
> > > > On Thu, May 16, 2019 at 10:58:03AM +0200, Roman Penyaev wrote:
> > 
> > > > > +		i = __atomic_fetch_add(&ep->user_header->tail, 1,
> > > > > +				       __ATOMIC_ACQUIRE);
> > > >
> > > > afaict __atomic_fetch_add() does not exist.
> > > 
> > > That is a gcc extension.  I did not find any API to just increment a
> > > variable atomically without using/casting to atomic_t.  What is the
> > > proper way to achieve that?
> > 
> > That's C11 atomics, and those shall not be used in the kernel. For one
> > they're not available in the minimally required GCC version (4.6).
> > 
> > The proper and only way is to use atomic_t, but also you cannot share
> > atomic_t with userspace.
> 
> Yes, that is what I tried to avoid by choosing the c11 extension.
> 
> > 
> > The normal way of doing something like this is to have a kernel private
> > atomic_t and copy the value out to userspace using smp_store_release().
> 
> Since this path is lockless, unfortunately that won't work.  So it
> seems the only way is to do one more cmpxchg (sigh) or give up and
> take a lock (sad sigh).

Of course it works; most of the extant buffers already have a shadow
tail and do the update to userspace with a store-release.
Roman Penyaev May 31, 2019, 6:58 p.m. UTC | #12
On 2019-05-31 18:51, Peter Zijlstra wrote:
> On Fri, May 31, 2019 at 04:21:30PM +0200, Roman Penyaev wrote:
> 
>> The ep_add_event_to_uring() is lockless, thus I can't increase tail 
>> after,
>> I need to reserve the index slot, where to write to.  I can use shadow 
>> tail,
>> which is not seen by userspace, but I have to guarantee that tail is 
>> updated
>> with shadow tail *after* all callers of ep_add_event_to_uring() are 
>> left.
>> That is possible, please see the code below, but it adds more 
>> complexity:
>> 
>> (code was tested on user side, thus has c11 atomics)
>> 
>> static inline void add_event__kernel(struct ring *ring, unsigned bit)
>> {
>>         unsigned i, cntr, commit_cntr, *item_idx, tail, old;
>> 
>>         i = __atomic_fetch_add(&ring->cntr, 1, __ATOMIC_ACQUIRE);
>>         item_idx = &ring->user_itemsindex[i % ring->nr];
>> 
>>         /* Update data */
>>         *item_idx = bit;
>> 
>>         commit_cntr = __atomic_add_fetch(&ring->commit_cntr, 1,
>> __ATOMIC_RELEASE);
>> 
>>         tail = ring->user_header->tail;
>>         rmb();
>>         do {
>>                 cntr = ring->cntr;
>>                 if (cntr != commit_cntr)
>>                         /* Someone else will advance tail */
>>                         break;
>> 
>>                 old = tail;
>> 
>>         } while ((tail =
>> __sync_val_compare_and_swap(&ring->user_header->tail, old, cntr)) != 
>> old);
>> }
> 
> Yes, I'm well aware of that particular problem (see
> kernel/events/ring_buffer.c:perf_output_put_handle for instance).

I'll take a look, thanks.

> But like you show, it can be done. It also makes the thing wait-free, 
> as
> opposed to merely lockless.

You think it's better?  I did not like this variant from the very
beginning because of the unnecessary complexity.  But maybe you're
right.  No busy loops on user side makes it wait-free.  And also
I can avoid c11 in kernel using cmpxchg along with atomic_t.
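
Roughly, such a translation could look like the sketch below (same
logic as the userspace prototype above, illustrative names only;
ring->cntr and ring->commit_cntr become kernel-private atomic_t, while
the user-visible tail stays a plain unsigned int that is only advanced
with cmpxchg()):

static inline void add_event__kernel(struct ring *ring, unsigned int bit)
{
	unsigned int i, cntr, commit, old, tail;

	/* Reserve a slot and fill it before anything is published */
	i = atomic_fetch_inc(&ring->cntr);
	ring->user_itemsindex[i % ring->nr] = bit;

	/* Mark the slot as committed; the release orders the data write */
	commit = atomic_inc_return_release(&ring->commit_cntr);

	/*
	 * The producer that observes all reserved slots committed
	 * advances the user-visible tail; everybody else bails out.
	 */
	tail = READ_ONCE(ring->user_header->tail);
	do {
		cntr = atomic_read(&ring->cntr);
		if (cntr != commit)
			/* Someone else will advance tail */
			break;

		old = tail;

	} while ((tail = cmpxchg(&ring->user_header->tail, old,
				 cntr)) != old);
}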

--
Roman
Peter Zijlstra June 3, 2019, 9:09 a.m. UTC | #13
On Fri, May 31, 2019 at 08:58:19PM +0200, Roman Penyaev wrote:
> On 2019-05-31 18:51, Peter Zijlstra wrote:

> > But like you show, it can be done. It also makes the thing wait-free, as
> > opposed to merely lockless.
> 
> You think it's better?  I did not like this variant from the very
> beginning because of the unnecessary complexity.  But maybe you're
> right.  No busy loops on user side makes it wait-free.  And also
> I can avoid c11 in kernel using cmpxchg along with atomic_t.

Imagine the (v)CPU going for an extended nap right between publishing the
new tail and writing the !0 entry. Then your userspace is stuck burning
cycles without getting anything useful done.
Roman Penyaev June 3, 2019, 10:02 a.m. UTC | #14
On 2019-06-03 11:09, Peter Zijlstra wrote:
> On Fri, May 31, 2019 at 08:58:19PM +0200, Roman Penyaev wrote:
>> On 2019-05-31 18:51, Peter Zijlstra wrote:
> 
>> > But like you show, it can be done. It also makes the thing wait-free, as
>> > opposed to merely lockless.
>> 
>> You think it's better?  I did not like this variant from the very
>> beginning because of the unnecessary complexity.  But maybe you're
>> right.  No busy loops on user side makes it wait-free.  And also
>> I can avoid c11 in kernel using cmpxchg along with atomic_t.
> 
> Imagine the (v)CPU going for an extended nap right between publishing 
> the
> new tail and writing the !0 entry. Then your userspace is stuck burning
> cycles without getting anything useful done.

Yes, that is absolutely not nice.  That also worries me.  I wanted
to minimize the number of atomic ops on the hot path, and of course in
that respect this busy-loop is more attractive.

I will polish and switch back to the wait-free variant and resend the
whole patchset.  Could you please take a look?  Will add you to CC.

--
Roman

Patch

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9d3905c0afbf..2f551c005640 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -189,6 +189,9 @@  struct epitem {
 
 	/* Work for offloading event callback */
 	struct work_struct work;
+
+	/* Bit in user bitmap for user polling */
+	unsigned int bit;
 };
 
 /*
@@ -427,6 +430,11 @@  static inline unsigned int ep_to_items_bm_length(unsigned int nr)
 	return PAGE_ALIGN(ALIGN(nr, 8) >> 3);
 }
 
+static inline unsigned int ep_max_index_nr(struct eventpoll *ep)
+{
+	return ep->index_length >> ilog2(sizeof(*ep->user_index));
+}
+
 static inline bool ep_polled_by_user(struct eventpoll *ep)
 {
 	return !!ep->user_header;
@@ -857,6 +865,127 @@  static void epi_rcu_free(struct rcu_head *head)
 	kmem_cache_free(epi_cache, epi);
 }
 
+#define atomic_set_unless_zero(ptr, flags)			\
+({								\
+	typeof(ptr) _ptr = (ptr);				\
+	typeof(flags) _flags = (flags);				\
+	typeof(*_ptr) _old, _val = READ_ONCE(*_ptr);		\
+								\
+	for (;;) {						\
+		if (!_val)					\
+			break;					\
+		_old = cmpxchg(_ptr, _val, _flags);		\
+		if (_old == _val)				\
+			break;					\
+		_val = _old;					\
+	}							\
+	_val;							\
+})
+
+static inline void ep_remove_user_item(struct epitem *epi)
+{
+	struct eventpoll *ep = epi->ep;
+	struct epoll_uitem *uitem;
+
+	lockdep_assert_held(&ep->mtx);
+
+	/* Event should not have any attached queues */
+	WARN_ON(!list_empty(&epi->pwqlist));
+
+	uitem = &ep->user_header->items[epi->bit];
+
+	/*
+	 * User item can be in two states: signaled (ready_events is set
+	 * and userspace has not yet consumed this event) and not signaled
+	 * (no events yet fired or already consumed by userspace).
+	 * We set ready_events to EPOLLREMOVED only if ready_events is in
+	 * the signaled state (we expect that userspace will come soon and
+	 * fetch this event).  In the not signaled case we leave
+	 * ready_events as 0.
+	 *
+	 * Why is it important to mark ready_events as EPOLLREMOVED in the
+	 * already signaled case?  An ep_insert() can be called immediately
+	 * after ep_remove(), thus the same bit can be reused, and then a
+	 * new event comes which corresponds to the same entry inside the
+	 * user items array.  For this particular case ep_add_event_to_uring()
+	 * does not allocate a new index entry but simply masks EPOLLREMOVED,
+	 * and userspace uses the old index entry, while meanwhile the old
+	 * user item has been removed, a new item has been added and the
+	 * event has been updated.
+	 */
+	atomic_set_unless_zero(&uitem->ready_events, EPOLLREMOVED);
+	clear_bit(epi->bit, ep->items_bm);
+}
+
+#define atomic_or_with_mask(ptr, flags, mask)			\
+({								\
+	typeof(ptr) _ptr = (ptr);				\
+	typeof(flags) _flags = (flags);				\
+	typeof(flags) _mask = (mask);				\
+	typeof(*_ptr) _old, _new, _val = READ_ONCE(*_ptr);	\
+								\
+	for (;;) {						\
+		_new = (_val & ~_mask) | _flags;		\
+		_old = cmpxchg(_ptr, _val, _new);		\
+		if (_old == _val)				\
+			break;					\
+		_val = _old;					\
+	}							\
+	_val;							\
+})
+
+static inline bool ep_add_event_to_uring(struct epitem *epi, __poll_t pollflags)
+{
+	struct eventpoll *ep = epi->ep;
+	struct epoll_uitem *uitem;
+	bool added = false;
+
+	if (WARN_ON(!pollflags))
+		return false;
+
+	uitem = &ep->user_header->items[epi->bit];
+	/*
+	 * Can be represented as:
+	 *
+	 *    was_ready = uitem->ready_events;
+	 *    uitem->ready_events &= ~EPOLLREMOVED;
+	 *    uitem->ready_events |= pollflags;
+	 *    if (!was_ready) {
+	 *         // create index entry
+	 *    }
+	 *
+	 * See the big comment inside ep_remove_user_item(), why it is
+	 * important to mask EPOLLREMOVED.
+	 */
+	if (!atomic_or_with_mask(&uitem->ready_events,
+				 pollflags, EPOLLREMOVED)) {
+		unsigned int i, *item_idx, index_mask;
+
+		/*
+		 * Item was not ready before, thus we have to insert
+		 * new index to the ring.
+		 */
+
+		index_mask = ep_max_index_nr(ep) - 1;
+		i = __atomic_fetch_add(&ep->user_header->tail, 1,
+				       __ATOMIC_ACQUIRE);
+		item_idx = &ep->user_index[i & index_mask];
+
+		/* Signal with a bit, which is > 0 */
+		*item_idx = epi->bit + 1;
+
+		/*
+		 * Want index update be flushed from CPU write buffer and
+		 * immediately visible on userspace side to avoid long busy
+		 * loops.
+		 */
+		smp_wmb();
+
+		added = true;
+	}
+
+	return added;
+}
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 95ac0b564529..70aa80ddff25 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -42,6 +42,9 @@ 
 #define EPOLLMSG	(__force __poll_t)0x00000400
 #define EPOLLRDHUP	(__force __poll_t)0x00002000
 
+/* User item marked as removed for EPOLL_USERPOLL */
+#define EPOLLREMOVED	((__force __poll_t)(1U << 27))
+
 /* Set exclusive wakeup mode for the target file descriptor */
 #define EPOLLEXCLUSIVE	((__force __poll_t)(1U << 28))