diff mbox series

[2/2] eventfd: support delayed wakeup for non-semaphore eventfd to reduce cpu utilization

Message ID tencent_D113D23D99C9FC229F0FAADCA8CF823A2609@qq.com (mailing list archive)
State New, archived
Headers show
Series [1/2] eventfd: use wait_event_interruptible_locked_irq() helper | expand

Commit Message

Wen Yang Jan. 29, 2023, 5:47 p.m. UTC
From: Wen Yang <wenyang.linux@foxmail.com>

For the NON SEMAPHORE eventfd, if its counter has a nonzero value,
then a read(2) returns 8 bytes containing that value, and the counter's
value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
N eventfd_writes vs ONE eventfd_read is possible.

However, the current implementation wakes up the reader thread immediately
on every eventfd_write, so the cpu utilization increases unnecessarily.

By adding a configurable delay after eventfd_write, these unnecessary
wakeup operations are avoided, thereby reducing cpu utilization.

We used the following test code:
https://github.com/w-simon/tests/blob/master/src/test.c
./test_zmq  > /dev/null

The cpu usage is as follows:
12:14:22     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
12:14:24     all   55.46    0.00    4.78    0.00    0.00    0.96    0.00    0.00    0.00   38.80
12:14:26     all   56.29    0.00    4.70    0.00    0.00    1.24    0.00    0.00    0.00   37.76
12:14:28     all   54.97    0.00    5.25    0.00    0.00    0.97    0.00    0.00    0.00   38.81
12:14:30     all   56.02    0.00    5.26    0.00    0.00    1.24    0.00    0.00    0.00   37.48
12:14:32     all   55.31    0.00    5.03    0.00    0.00    1.40    0.00    0.00    0.00   38.27
12:14:34     all   55.46    0.00    5.26    0.00    0.00    1.24    0.00    0.00    0.00   38.04

Then adjust the new control parameter, as follows:
echo 5 > /proc/sys/fs/eventfd_write_wake_delay_ms

The cpu usage was observed to decrease by more than 30%, as follows:
12:14:36     all   28.17    0.00    0.93    0.00    0.00    0.00    0.00    0.00    0.00   70.90
12:14:38     all   24.00    0.00    0.80    0.00    0.00    0.13    0.00    0.00    0.00   75.07
12:14:40     all   23.57    0.00    0.53    0.00    0.00    0.13    0.00    0.00    0.00   75.77
12:14:42     all   23.59    0.00    0.40    0.00    0.00    0.00    0.00    0.00    0.00   76.01
12:14:44     all   23.69    0.00    0.27    0.00    0.00    0.00    0.00    0.00    0.00   76.04
12:14:46     all   23.20    0.00    0.67    0.00    0.00    0.13    0.00    0.00    0.00   76.00
12:14:48     all   24.87    0.00    0.66    0.00    0.00    0.00    0.00    0.00    0.00   74.47
12:14:50     all   24.27    0.00    0.66    0.00    0.00    0.00    0.00    0.00    0.00   75.07

Signed-off-by: Wen Yang <wenyang.linux@foxmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dylan Yudaken <dylany@fb.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 fs/eventfd.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/fs/eventfd.c b/fs/eventfd.c
index c5bda3df4a28..e45436737f9d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -41,6 +41,9 @@  struct eventfd_ctx {
 	__u64 count;
 	unsigned int flags;
 	int id;
+#ifdef CONFIG_SYSCTL
+	struct delayed_work dwork;
+#endif
 };
 
 __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
@@ -95,6 +98,9 @@  static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
 	if (ctx->id >= 0)
 		ida_simple_remove(&eventfd_ida, ctx->id);
+#ifdef CONFIG_SYSCTL
+	flush_delayed_work(&ctx->dwork);
+#endif
 	kfree(ctx);
 }
 
@@ -256,6 +262,28 @@  static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 	return sizeof(ucnt);
 }
 
+#ifdef CONFIG_SYSCTL
+
+static unsigned long sysctl_eventfd_write_wake_delay_ms;
+
+static void eventfd_delayed_workfn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct eventfd_ctx *ctx = container_of(dwork, struct eventfd_ctx, dwork);
+
+	spin_lock_irq(&ctx->wqh.lock);
+	current->in_eventfd = 1;
+	if (ctx->count) {
+		/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+	}
+	current->in_eventfd = 0;
+	spin_unlock_irq(&ctx->wqh.lock);
+}
+
+#endif
+
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 			     loff_t *ppos)
 {
@@ -282,8 +310,26 @@  static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	if (likely(res > 0)) {
 		ctx->count += ucnt;
 		current->in_eventfd = 1;
-		if (waitqueue_active(&ctx->wqh))
+
+		/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+		if (waitqueue_active(&ctx->wqh)) {
+#ifdef CONFIG_SYSCTL
+			if (ctx->flags & EFD_SEMAPHORE)
+				wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+			else {
+				unsigned long delay = sysctl_eventfd_write_wake_delay_ms;
+
+				if (delay) {
+					if (!delayed_work_pending(&ctx->dwork))
+						queue_delayed_work(system_unbound_wq,
+								&ctx->dwork, delay);
+				} else
+					wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+			}
+#else
 			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+#endif
+		}
 		current->in_eventfd = 0;
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
@@ -406,6 +452,9 @@  static int do_eventfd(unsigned int count, int flags)
 	ctx->count = count;
 	ctx->flags = flags;
 	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+#ifdef CONFIG_SYSCTL
+	INIT_DELAYED_WORK(&ctx->dwork, eventfd_delayed_workfn);
+#endif
 
 	flags &= EFD_SHARED_FCNTL_FLAGS;
 	flags |= O_RDWR;
@@ -438,3 +487,31 @@  SYSCALL_DEFINE1(eventfd, unsigned int, count)
 	return do_eventfd(count, 0);
 }
 
+#ifdef CONFIG_SYSCTL
+
+static unsigned long min_wake_delay;
+
+static unsigned long max_wake_delay = HZ / 10;
+
+static struct ctl_table fs_eventfd_ctl[] = {
+	{
+		.procname      = "eventfd_write_wake_delay_ms",
+		.data          = &sysctl_eventfd_write_wake_delay_ms,
+		.maxlen        = sizeof(unsigned long),
+		.mode          = 0644,
+		.proc_handler  = proc_doulongvec_ms_jiffies_minmax,
+		.extra1        = (void *)&min_wake_delay,
+		.extra2        = (void *)&max_wake_delay,
+	},
+	{ }
+};
+
+static int __init init_fs_exec_sysctls(void)
+{
+	register_sysctl_init("fs", fs_eventfd_ctl);
+	return 0;
+}
+
+fs_initcall(init_fs_exec_sysctls);
+
+#endif /* CONFIG_SYSCTL */