From patchwork Sun Sep 22 08:08:50 2019
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Pavel Begunkov <asml.silence@gmail.com>
X-Patchwork-Id: 11155753
Return-Path: <SRS0=1fuv=XR=vger.kernel.org=linux-block-owner@kernel.org>
Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org
 [172.30.200.123])
	by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 5888A14E5
	for <patchwork-linux-block@patchwork.kernel.org>;
 Sun, 22 Sep 2019 08:09:21 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
	by mail.kernel.org (Postfix) with ESMTP id 33AAF21A4A
	for <patchwork-linux-block@patchwork.kernel.org>;
 Sun, 22 Sep 2019 08:09:21 +0000 (UTC)
Authentication-Results: mail.kernel.org;
	dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b="Ap8pSX2p"
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1727695AbfIVIJT (ORCPT
        <rfc822;patchwork-linux-block@patchwork.kernel.org>);
        Sun, 22 Sep 2019 04:09:19 -0400
Received: from mail-wr1-f67.google.com ([209.85.221.67]:45091 "EHLO
        mail-wr1-f67.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1727741AbfIVIJT (ORCPT
        <rfc822;linux-block@vger.kernel.org>);
        Sun, 22 Sep 2019 04:09:19 -0400
Received: by mail-wr1-f67.google.com with SMTP id r5so10638222wrm.12;
        Sun, 22 Sep 2019 01:09:16 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20161025;
        h=from:to:cc:subject:date:message-id:in-reply-to:references
         :mime-version:content-transfer-encoding;
        bh=E4sz3bMPtRNQeqEEXec0GVjBvdkbRYQeJBiuMJ2oejo=;
        b=Ap8pSX2pFZwrJu+JxUae9LrqgooB/VeL6ghPHBWBmMTOV7tjHQf/R0oB7irc8JudA/
         dlx9v6MvmRaenxI2ddxlbEK4Wt4yDHljolkbGAVZQlIu3m4b62aeYAfA2JnKISu6LvX3
         Ssi2fsS2QGiTpUYz9QVE8gsEOaCgjmqFD3ErR2olqavDMztNZ/8MJ3Fv5X8oteK6wl+2
         7Wlz4tk16AABmMK+jr2HWV29rgEybd07i4z99TR1GTDYeC+Vj89MTfIhUibL8b0isSRZ
         r8320wmUfqysYxbKIu8wH7DS+PZqiVpE7AkHgDlPRCWYsLv4bGU1U4/lyfO4yWhf5pKI
         GKNA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20161025;
        h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
         :references:mime-version:content-transfer-encoding;
        bh=E4sz3bMPtRNQeqEEXec0GVjBvdkbRYQeJBiuMJ2oejo=;
        b=euS6Y3xO0urPf82dvsgpvi2O8lcOdrPudPrU3c6e9xrHYeCtu4ybjvc/88/Nmgkh6p
         T37hqlmsTKZsopjkM8Pl6CZza6I6v8f4rhCzoiBG+woSfeMRZZeoISaAwHF+M3XHw2CE
         yB1IrJ7BLUWOd5S7vtugfvRkeHqrvat3EKitxYv1pug7c570f37TowMaUv4ORTWFutuq
         xa3pvZhLKHLVbyh2Qw+uSL1k2SaLjCkpvhRwsXB39glzx/5UeLWmuQ+m5RfBc6sl5BAT
         TMDGDOH+3BKQjO1441UjQ9EilVg7uJ+W00S5wPSnLVYGJX55D9xF8wL10LKuaqpmGKm5
         8iMw==
X-Gm-Message-State: APjAAAXqf+H4f4Xzky9JTwL74TvTMYg4FemoVgPGzXWzBN1z2aNMZQ6l
        hjbVbOQB5ILFUOtdcGJ4MRI=
X-Google-Smtp-Source: 
 APXvYqz3sNreJH2M+K1VawHH1PfjiIFcBehFodSfWxtAzgXn1LerO90uDeJb+bayz9xXwsSp/2M2OQ==
X-Received: by 2002:a5d:49c3:: with SMTP id t3mr5763670wrs.151.1569139756053;
        Sun, 22 Sep 2019 01:09:16 -0700 (PDT)
Received: from localhost.localdomain ([109.126.147.119])
        by smtp.gmail.com with ESMTPSA id
 x5sm7726983wrt.75.2019.09.22.01.09.14
        (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
        Sun, 22 Sep 2019 01:09:15 -0700 (PDT)
From: "Pavel Begunkov (Silence)" <asml.silence@gmail.com>
To: Jens Axboe <axboe@kernel.dk>, Ingo Molnar <mingo@redhat.com>,
        Peter Zijlstra <peterz@infradead.org>,
        linux-block@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: Pavel Begunkov <asml.silence@gmail.com>
Subject: [PATCH v2 1/2] sched/wait: Add wait_threshold
Date: Sun, 22 Sep 2019 11:08:50 +0300
Message-Id: 
 <d99ce2f7c98d4408aea50f515bbb6b89bc7850e8.1569139018.git.asml.silence@gmail.com>
X-Mailer: git-send-email 2.23.0
In-Reply-To: <cover.1569139018.git.asml.silence@gmail.com>
References: <cover.1569139018.git.asml.silence@gmail.com>
MIME-Version: 1.0
Sender: linux-block-owner@vger.kernel.org
Precedence: bulk
List-ID: <linux-block.vger.kernel.org>
X-Mailing-List: linux-block@vger.kernel.org

From: Pavel Begunkov <asml.silence@gmail.com>

Add wait_threshold -- a custom wait_event derivative that waits until
a value is equal to or greater than the specified threshold.

v2: rebase
1. use full condition instead of event number generator
2. add WQ_THRESHOLD_WAKE_ALWAYS constant

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/wait_threshold.h | 67 ++++++++++++++++++++++++++++++++++
 kernel/sched/Makefile          |  2 +-
 kernel/sched/wait_threshold.c  | 26 +++++++++++++
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/wait_threshold.h
 create mode 100644 kernel/sched/wait_threshold.c

diff --git a/include/linux/wait_threshold.h b/include/linux/wait_threshold.h
new file mode 100644
index 000000000000..d8b054504c26
--- /dev/null
+++ b/include/linux/wait_threshold.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WAIT_THRESHOLD_H
+#define _LINUX_WAIT_THRESHOLD_H
+
+#include <linux/wait.h>
+
+#define WQ_THRESHOLD_WAKE_ALWAYS	(~0U)	/* meets every threshold */
+
+struct wait_threshold_queue_entry {
+	struct wait_queue_entry wq_entry;	/* generic wait-queue entry */
+	unsigned int threshold;			/* ignore wakes below this */
+};
+
+
+void init_wait_threshold_entry(struct wait_threshold_queue_entry *wtq_entry,
+				unsigned int threshold);
+
+static inline void wake_up_threshold(struct wait_queue_head *wq_head,
+					unsigned int val)
+{
+	void *arg = (void *)(unsigned long)val;	/* value travels as pointer */
+
+	__wake_up(wq_head, TASK_NORMAL, 1, arg);
+}
+
+#define ___wait_threshold_event(q, thresh, condition, state,		\
+				exclusive, ret, cmd)			\
+({									\
+	__label__ __out;						\
+	struct wait_queue_head *__wq_head = &q;				\
+	struct wait_threshold_queue_entry __wtq_entry;			\
+	struct wait_queue_entry *__wq_entry = &__wtq_entry.wq_entry;	\
+	long __ret = ret; /* explicit shadow */				\
+	/* NOTE(review): 'exclusive' is accepted but never used here */	\
+	init_wait_threshold_entry(&__wtq_entry, thresh);		\
+	for (;;) {							\
+		long __int = prepare_to_wait_event(__wq_head,		\
+						   __wq_entry,		\
+						   state);		\
+		if (condition)						\
+			break;						\
+		/* Interruptible and __int != 0: stop waiting */	\
+		if (___wait_is_interruptible(state) && __int) {		\
+			__ret = __int;					\
+			goto __out;					\
+		}							\
+		/* Otherwise run cmd and re-check the condition */	\
+		cmd;							\
+	}								\
+	finish_wait(__wq_head, __wq_entry);				\
+__out:	__ret;								\
+})
+
+#define __wait_threshold_interruptible(q, thresh, condition)		\
+	___wait_threshold_event(q, thresh, condition, TASK_INTERRUPTIBLE, 0, 0,\
+			  schedule())
+
+#define wait_threshold_interruptible(q, threshold, condition)	\
+({								\
+	int __ret = 0;						\
+	might_sleep();						\
+	if (!(condition))	/* already true: no wait */	\
+		__ret = __wait_threshold_interruptible(q,	\
+			threshold, condition);			\
+	__ret;							\
+})
+#endif /* _LINUX_WAIT_THRESHOLD_H */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5662b5..bb895a3184f9 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,7 +18,7 @@ endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
+obj-y += wait.o wait_bit.o wait_threshold.o swait.o completion.o
 
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/wait_threshold.c b/kernel/sched/wait_threshold.c
new file mode 100644
index 000000000000..80a027c02ff3
--- /dev/null
+++ b/kernel/sched/wait_threshold.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "sched.h"
+#include <linux/wait_threshold.h>
+
+static int wake_threshold_function(struct wait_queue_entry *wq_entry,
+				   unsigned int mode, int sync, void *arg)
+{
+	struct wait_threshold_queue_entry *waiter;
+	unsigned int posted = (unsigned int)(unsigned long)arg;
+
+	waiter = container_of(wq_entry, struct wait_threshold_queue_entry,
+			      wq_entry);
+	/* Only hand off to the default wake-up once the threshold is met. */
+	if (posted >= waiter->threshold)
+		return default_wake_function(wq_entry, mode, sync, arg);
+	return 0;
+}
+
+void init_wait_threshold_entry(struct wait_threshold_queue_entry *wtq_entry,
+			       unsigned int threshold)
+{
+	init_wait_entry(&wtq_entry->wq_entry, 0);
+	wtq_entry->wq_entry.func = wake_threshold_function;
+	wtq_entry->threshold = threshold;
+}
+EXPORT_SYMBOL(init_wait_threshold_entry);

From patchwork Sun Sep 22 08:08:51 2019
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Pavel Begunkov <asml.silence@gmail.com>
X-Patchwork-Id: 11155755
Return-Path: <SRS0=1fuv=XR=vger.kernel.org=linux-block-owner@kernel.org>
Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org
 [172.30.200.123])
	by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 49D161745
	for <patchwork-linux-block@patchwork.kernel.org>;
 Sun, 22 Sep 2019 08:09:44 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
	by mail.kernel.org (Postfix) with ESMTP id 27FD720830
	for <patchwork-linux-block@patchwork.kernel.org>;
 Sun, 22 Sep 2019 08:09:44 +0000 (UTC)
Authentication-Results: mail.kernel.org;
	dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b="E8AgvgEx"
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1727837AbfIVIJW (ORCPT
        <rfc822;patchwork-linux-block@patchwork.kernel.org>);
        Sun, 22 Sep 2019 04:09:22 -0400
Received: from mail-wm1-f68.google.com ([209.85.128.68]:35126 "EHLO
        mail-wm1-f68.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1727741AbfIVIJW (ORCPT
        <rfc822;linux-block@vger.kernel.org>);
        Sun, 22 Sep 2019 04:09:22 -0400
Received: by mail-wm1-f68.google.com with SMTP id y21so6055824wmi.0;
        Sun, 22 Sep 2019 01:09:18 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20161025;
        h=from:to:cc:subject:date:message-id:in-reply-to:references
         :mime-version:content-transfer-encoding;
        bh=CD5dbN6dWmUlIx4cKSdgxz+b1tGDJ8hFL6z+Q546CZk=;
        b=E8AgvgExUI3gKjb+333Cd12keB8bc0WQFm2GpsOIlfUr7WYthQc8alEPksUTgL+n49
         keuqFaF5/5HPCsxXo+UtFlIijmQ4dK9LJ4RdtGKf4JG7JN/z95IvlHNOq+npitQx+wvr
         YKq+oPNLD+tMkML8nh/tNDZQhq1tVD5YCaRz/LLbqdGxvG/20yagbTWn/3CxPzx1dzYX
         KgvCUw0vn5Ia0QTkpHtz2tyuBz3Z0VQ0L3N0L9O95gbwGFYnQYYhCiQAvs2kW4reVqMQ
         gez8w4veI56zulgySmi4zvpu7dvFmeU3wsr2kL5UzBDjacTuXGtPmh89xH9npx+NqyFB
         8q3g==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20161025;
        h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
         :references:mime-version:content-transfer-encoding;
        bh=CD5dbN6dWmUlIx4cKSdgxz+b1tGDJ8hFL6z+Q546CZk=;
        b=Jc5e+5/3QwM5pWiWF4kI8e1yAynXiAZxEohvGJEKKe+Ocm4QiqDRJo6Y6kzJCEyt5Y
         lXp41tw9QrE/9kLGlrknGSQ0KTLyToWR/P3N8yY4E+EgKBj7XE10sohcGpS1gTvy8zzE
         eqU5uozMY9lFt6ahJZdZrVXHz+6jqabJFTNLGXwrRyMvQAnXGY9n+uO2/EphSZyPvkZO
         Jq9SDj6OgQ46whsUWQWLpugKqUF1Z54eUq+tofvQkG9dG6yEq8jdtk9/97fWT6fgdYes
         4P/dvEOhBjxdnZ17J6nApO4XOartqBUhKkoK7vDWSJNShYXS4YZwpkMX/C4o14EQ5LLl
         RmtA==
X-Gm-Message-State: APjAAAUD3mGUBEfMa2X+bXJzTWne3JwwLcGQ/52k9on7tp5gT6WRKm7b
        nZVnPTWrmR2/Ikv2vPZbczk=
X-Google-Smtp-Source: 
 APXvYqxRF3hjhST0yeu+KSpaHSaKnYdpDROzfGN1inYoTYsKyq521h9oabCKDJNmF7s3rDCOWG6JmQ==
X-Received: by 2002:a7b:c74a:: with SMTP id w10mr9564923wmk.30.1569139757683;
        Sun, 22 Sep 2019 01:09:17 -0700 (PDT)
Received: from localhost.localdomain ([109.126.147.119])
        by smtp.gmail.com with ESMTPSA id
 x5sm7726983wrt.75.2019.09.22.01.09.16
        (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
        Sun, 22 Sep 2019 01:09:17 -0700 (PDT)
From: "Pavel Begunkov (Silence)" <asml.silence@gmail.com>
To: Jens Axboe <axboe@kernel.dk>, Ingo Molnar <mingo@redhat.com>,
        Peter Zijlstra <peterz@infradead.org>,
        linux-block@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: Pavel Begunkov <asml.silence@gmail.com>
Subject: [PATCH v2 2/2] io_uring: Optimise cq waiting with wait_threshold
Date: Sun, 22 Sep 2019 11:08:51 +0300
Message-Id: 
 <321aa8db2bbefb8f4b41d7b2608f629fbd5d3d55.1569139018.git.asml.silence@gmail.com>
X-Mailer: git-send-email 2.23.0
In-Reply-To: <cover.1569139018.git.asml.silence@gmail.com>
References: <cover.1569139018.git.asml.silence@gmail.com>
MIME-Version: 1.0
Sender: linux-block-owner@vger.kernel.org
Precedence: bulk
List-ID: <linux-block.vger.kernel.org>
X-Mailing-List: linux-block@vger.kernel.org

From: Pavel Begunkov <asml.silence@gmail.com>

While waiting for completion events in io_cqring_wait(), the process
will be woken up inside wait_event_interruptible() on any request
completion, check the number of events in the completion queue, and
potentially go to sleep again.

Apparently, there can be many such spurious wakeups, with lots of
overhead. This is especially noticeable when min_events is large and
completions arrive one by one or in small batches (which is usually
the case).

E.g. if device completes requests one by one and io_uring_enter is
waiting for 100 events, then there will be ~99 spurious wakeups.

Use the new wait_threshold_*() instead, which won't wake it up until
the necessary number of events has been collected.

Performance test:
The first thread generates requests (QD=512) one by one, so they will
be completed in a similar pattern. The second thread waits for
128 events to complete.

Tested with null_blk with 5us delay
and 3.8GHz Intel CPU.

throughput before: 270 KIOPS
throughput after:  370 KIOPS
~40% throughput boost, exaggerated, but makes a point.

v2: wake always in io_timeout_fn() with WQ_THRESHOLD_WAKE_ALWAYS

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5c3f2bb81637..05f4391c7bbe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -70,6 +70,7 @@
 #include <linux/nospec.h>
 #include <linux/sizes.h>
 #include <linux/hugetlb.h>
+#include <linux/wait_threshold.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -414,6 +415,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	return ctx;
 }
 
+static unsigned int io_cqring_events(struct io_rings *rings)
+{
+	/* See comment at the top of this file */
+	smp_rmb();
+	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
+}
+
 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 				     struct io_kiocb *req)
 {
@@ -559,16 +567,27 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	}
 }
 
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static void __io_cqring_ev_posted(struct io_ring_ctx *ctx,
+				unsigned int nr_events)
 {
 	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
+		wake_up_threshold(&ctx->wait, nr_events);
 	if (waitqueue_active(&ctx->sqo_wait))
 		wake_up(&ctx->sqo_wait);
 	if (ctx->cq_ev_fd)
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	__io_cqring_ev_posted(ctx, io_cqring_events(ctx->rings));
+}
+
+static inline void io_cqring_timeout_posted(struct io_ring_ctx *ctx)
+{
+	__io_cqring_ev_posted(ctx, WQ_THRESHOLD_WAKE_ALWAYS);
+}
+
 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 				long res)
 {
@@ -587,7 +606,7 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 	percpu_ref_put_many(&ctx->refs, refs);
 
 	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
+		wake_up_threshold(&ctx->wait, io_cqring_events(ctx->rings));
 }
 
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
@@ -722,12 +741,6 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static unsigned io_cqring_events(struct io_rings *rings)
-{
-	/* See comment at the top of this file */
-	smp_rmb();
-	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
-}
 
 /*
  * Find and free completed poll iocbs
@@ -1824,7 +1837,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	io_cqring_ev_posted(ctx);
+	io_cqring_timeout_posted(ctx);
 
 	io_put_req(req);
 	return HRTIMER_NORESTART;
@@ -2723,7 +2736,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	 * we started waiting. For timeouts, we always want to return to
 	 * userspace.
 	 */
-	ret = wait_event_interruptible(ctx->wait,
+	ret = wait_threshold_interruptible(ctx->wait, min_events,
 				io_cqring_events(rings) >= min_events ||
 				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);