
[for-next,4/7] io_uring: hide eventfd assumptions in eventfd paths

Message ID 8ac7fbe5ad880990ef498fa09f8de70390836f97.1655637157.git.asml.silence@gmail.com (mailing list archive)
State New
Series: cqe posting cleanups

Commit Message

Pavel Begunkov June 19, 2022, 11:26 a.m. UTC
Some io_uring-eventfd users assume that there won't be spurious wakeups.
That assumption has to be honoured by all io_cqring_ev_posted() callers,
which is inconvenient and from time to time leads to problems, but has
to be maintained so as not to break userspace.

Instead of making the callers track whether a CQE was posted or not,
hide it inside io_eventfd_signal(). It saves the ->cached_cq_tail it saw
last time and triggers the eventfd only when ->cached_cq_tail has changed
since then.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/io_uring_types.h |  2 ++
 io_uring/io_uring.c            | 44 ++++++++++++++++++++--------------
 io_uring/timeout.c             |  3 +--
 3 files changed, 29 insertions(+), 20 deletions(-)
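
For context, here is a minimal userspace sketch of the pattern the commit
message is protecting, assuming liburing's io_uring_register_eventfd() and a
plain eventfd (the wait_and_reap() helper name is made up for illustration):
the application takes every eventfd count increment to mean that at least one
new CQE is in the ring.

#include <liburing.h>
#include <sys/eventfd.h>
#include <unistd.h>
#include <stdint.h>

/* Hypothetical helper: block on the eventfd, then reap everything posted. */
static void wait_and_reap(struct io_uring *ring, int efd)
{
	struct io_uring_cqe *cqe;
	uint64_t cnt;

	/* Blocks until the kernel bumps the registered eventfd count. */
	if (read(efd, &cnt, sizeof(cnt)) != sizeof(cnt))
		return;

	/* The assumption under discussion: a signal implies >= 1 new CQE. */
	while (io_uring_peek_cqe(ring, &cqe) == 0)
		io_uring_cqe_seen(ring, cqe);
}

int main(void)
{
	struct io_uring ring;
	int efd = eventfd(0, 0);

	io_uring_queue_init(8, &ring, 0);
	io_uring_register_eventfd(&ring, efd);
	/* ... submit requests, then call wait_and_reap(&ring, efd) ... */
	io_uring_queue_exit(&ring);
	close(efd);
	return 0;
}

A spurious eventfd signal would leave the io_uring_peek_cqe() loop above with
nothing to reap, which is exactly the guarantee that io_cqring_ev_posted()
callers had to preserve before this patch.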

Comments

Jens Axboe June 19, 2022, 6:18 p.m. UTC | #1
On Sun, Jun 19, 2022 at 5:26 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> Some io_uring-eventfd users assume that there won't be spurious wakeups.
> That assumption has to be honoured by all io_cqring_ev_posted() callers,
> which is inconvenient and from time to time leads to problems, but has
> to be maintained so as not to break userspace.
>
> Instead of making the callers track whether a CQE was posted or not,
> hide it inside io_eventfd_signal(). It saves the ->cached_cq_tail it saw
> last time and triggers the eventfd only when ->cached_cq_tail has changed
> since then.

This one is causing frequent errors with poll-cancel.t:

axboe@m1pro-kvm ~/g/liburing (master)> test/poll-cancel.t
axboe@m1pro-kvm ~/g/liburing (master)> test/poll-cancel.t
Timed out!
axboe@m1pro-kvm ~/g/liburing (master) [1]> test/poll-cancel.t
Timed out!
axboe@m1pro-kvm ~/g/liburing (master) [1]> test/poll-cancel.t
Timed out!

I've dropped this one, and 6-7/7 as they then also throw a bunch of
rejects.
Pavel Begunkov June 19, 2022, 6:49 p.m. UTC | #2
On 6/19/22 19:18, Jens Axboe wrote:
> On Sun, Jun 19, 2022 at 5:26 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> Some io_uring-eventfd users assume that there won't be spurious wakeups.
>> That assumption has to be honoured by all io_cqring_ev_posted() callers,
>> which is inconvenient and from time to time leads to problems, but has
>> to be maintained so as not to break userspace.
>>
>> Instead of making the callers track whether a CQE was posted or not,
>> hide it inside io_eventfd_signal(). It saves the ->cached_cq_tail it saw
>> last time and triggers the eventfd only when ->cached_cq_tail has changed
>> since then.
> 
> This one is causing frequent errors with poll-cancel.t:
> 
> axboe@m1pro-kvm ~/g/liburing (master)> test/poll-cancel.t
> axboe@m1pro-kvm ~/g/liburing (master)> test/poll-cancel.t
> Timed out!
> axboe@m1pro-kvm ~/g/liburing (master) [1]> test/poll-cancel.t
> Timed out!
> axboe@m1pro-kvm ~/g/liburing (master) [1]> test/poll-cancel.t
> Timed out!
> 
> I've dropped this one, and 6-7/7 as they then also throw a bunch of
> rejects.

I mentioned it in the cover letter: the extra wakeups slow down task
exit cancellations, which makes the test hit its timeout (increasing
the alarm time helps). The problem is in cancellation, in particular
because for some reason it spins on the cancellation (including
timeout and poll remove-all).

I'll look into it, but it's more "discovered by accident" than
caused by the patch.
Jens Axboe June 19, 2022, 6:58 p.m. UTC | #3
On 6/19/22 12:49 PM, Pavel Begunkov wrote:
> On 6/19/22 19:18, Jens Axboe wrote:
>> On Sun, Jun 19, 2022 at 5:26 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>>
>>> Some io_uring-eventfd users assume that there won't be spurious wakeups.
>>> That assumption has to be honoured by all io_cqring_ev_posted() callers,
>>> which is inconvenient and from time to time leads to problems, but has
>>> to be maintained so as not to break userspace.
>>>
>>> Instead of making the callers track whether a CQE was posted or not,
>>> hide it inside io_eventfd_signal(). It saves the ->cached_cq_tail it saw
>>> last time and triggers the eventfd only when ->cached_cq_tail has changed
>>> since then.
>>
>> This one is causing frequent errors with poll-cancel.t:
>>
>> axboe@m1pro-kvm ~/g/liburing (master)> test/poll-cancel.t
>> axboe@m1pro-kvm ~/g/liburing (master)> test/poll-cancel.t
>> Timed out!
>> axboe@m1pro-kvm ~/g/liburing (master) [1]> test/poll-cancel.t
>> Timed out!
>> axboe@m1pro-kvm ~/g/liburing (master) [1]> test/poll-cancel.t
>> Timed out!
>>
>> I've dropped this one, and 6-7/7 as they then also throw a bunch of
>> rejects.
> 
> I mentioned it in the cover letter: the extra wakeups slow down task
> exit cancellations, which makes the test hit its timeout (increasing
> the alarm time helps). The problem is in cancellation, in particular
> because for some reason it spins on the cancellation (including
> timeout and poll remove-all).

But that's not really usable at all. Before the patch, the test case
takes 1-2ms, predictably. After the patch, I see lots of 10.0s runs.
That's clearly not going to work.

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 779c72da5b8f..327bc7f0808d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -315,6 +315,8 @@  struct io_ring_ctx {
 
 	struct list_head		defer_list;
 	unsigned			sq_thread_idle;
+	/* protected by ->completion_lock */
+	unsigned			evfd_last_cq_tail;
 };
 
 enum {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 31beb9ccbf12..0875cc649e23 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -473,6 +473,22 @@  static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd;
+	bool skip;
+
+	spin_lock(&ctx->completion_lock);
+	/*
+	 * Eventfd should only get triggered when at least one event has been
+	 * posted. Some applications rely on the eventfd notification count only
+	 * changing IFF a new CQE has been added to the CQ ring. There's no
+	 * dependency on 1:1 relationship between how many times this function is
+	 * called (and hence the eventfd count) and number of CQEs posted to the
+	 * CQ ring.
+	 */
+	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
+	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	spin_unlock(&ctx->completion_lock);
+	if (skip)
+		return;
 
 	rcu_read_lock();
 	/*
@@ -511,13 +527,6 @@  void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 		io_eventfd_signal(ctx);
 }
 
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
 void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
 	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
@@ -530,7 +539,7 @@  void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 /* Returns true if there are no backlogged entries after the flush */
 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
-	bool all_flushed, posted;
+	bool all_flushed;
 	size_t cqe_size = sizeof(struct io_uring_cqe);
 
 	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
@@ -539,7 +548,6 @@  static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	if (ctx->flags & IORING_SETUP_CQE32)
 		cqe_size <<= 1;
 
-	posted = false;
 	spin_lock(&ctx->completion_lock);
 	while (!list_empty(&ctx->cq_overflow_list)) {
 		struct io_uring_cqe *cqe = io_get_cqe(ctx);
@@ -554,7 +562,6 @@  static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		else
 			io_account_cq_overflow(ctx);
 
-		posted = true;
 		list_del(&ocqe->list);
 		kfree(ocqe);
 	}
@@ -567,8 +574,7 @@  static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (posted)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 	return all_flushed;
 }
 
@@ -758,8 +764,7 @@  bool io_post_aux_cqe(struct io_ring_ctx *ctx,
 	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (filled)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 	return filled;
 }
 
@@ -940,14 +945,12 @@  __cold void io_free_req(struct io_kiocb *req)
 static void __io_req_find_next_prep(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	bool posted;
 
 	spin_lock(&ctx->completion_lock);
-	posted = io_disarm_next(req);
+	io_disarm_next(req);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (posted)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 }
 
 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
@@ -2431,6 +2434,11 @@  static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 		kfree(ev_fd);
 		return ret;
 	}
+
+	spin_lock(&ctx->completion_lock);
+	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	spin_unlock(&ctx->completion_lock);
+
 	ev_fd->eventfd_async = eventfd_async;
 	ctx->has_evfd = true;
 	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 557c637af158..4938c1cdcbcd 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -628,7 +628,6 @@  __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
 	spin_unlock_irq(&ctx->timeout_lock);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (canceled != 0)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 	return canceled != 0;
 }