@@ -104,6 +104,12 @@ If the ring file descriptor has been registered through use of
then setting this flag will tell the kernel that the
.I ring_fd
passed in is the registered ring offset rather than a normal file descriptor.
+.TP
+.B IORING_ENTER_NO_IOWAIT
+If this flag is set, then waiting on events will not be accounted as iowait for
+the task if
+.BR io_uring_enter (2)
+results in waiting.
.PP
.PP
new file mode 100644
@@ -0,0 +1,52 @@
+.\" Copyright (C) 2024 David Wei <dw@davidwei.uk>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_iowait_toggle 3 "Aug 16, 2024" "liburing-2.8" "liburing Manual"
+.SH NAME
+io_uring_iowait_toggle \- toggle whether waiting for events is accounted as iowait
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_iowait_toggle(struct io_uring *" ring ","
+.BI "                           bool " enabled ");"
+.PP
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_iowait_toggle (3)
+function toggles for a given
+.I ring
+whether waiting for events is accounted as iowait time for the task. When set
+to true, time spent waiting is accounted as iowait time; otherwise, it is
+accounted as idle time. The default behavior is to always account time waiting
+for events as iowait time.
+
+Setting in_iowait achieves two things:
+.TP
+.B 1. Account time spent waiting as iowait time
+.TP
+.B 2. Enable cpufreq optimizations, setting SCHED_CPUFREQ_IOWAIT on the rq
+.PP
+
+The accounting aspect is a relic from the days of uniprocessor systems, where
+iowait indicates that a task is blocked uninterruptibly waiting for IO and
+cannot perform other work. iowait with SMP systems is mostly a bogus
+accounting value, but is set to enable cpufreq boosts for high frequency waits.
+
+Some user tooling attributes iowait time as CPU utilization time, so high
+iowait time can look like apparent high CPU utilization, even though the task
+is not scheduled and the CPU is free to run other tasks.
+.BR io_uring_iowait_toggle (3)
+provides a way to disable this behavior where it makes sense to do so.
+
+Available since 6.12.
+
+.SH RETURN VALUE
+On success
+.BR io_uring_iowait_toggle (3)
+returns 0.
+If the kernel does not support this feature, it returns
+.BR \-EOPNOTSUPP .
@@ -243,6 +243,7 @@ int io_uring_unregister_napi(struct io_uring *ring, struct io_uring_napi *napi);
int io_uring_get_events(struct io_uring *ring);
int io_uring_submit_and_get_events(struct io_uring *ring);
+int io_uring_iowait_toggle(struct io_uring *ring, bool enabled);
/*
* io_uring syscalls.
@@ -504,6 +504,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
+#define IORING_ENTER_NO_IOWAIT (1U << 6)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -539,6 +540,7 @@ struct io_uring_params {
#define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
+#define IORING_FEAT_IOWAIT_TOGGLE (1U << 15)
/*
* io_uring_register(2) opcodes and arguments
@@ -6,6 +6,7 @@ enum {
INT_FLAG_REG_RING = 1,
INT_FLAG_REG_REG_RING = 2,
INT_FLAG_APP_MEM = 4,
+ INT_FLAG_NO_IOWAIT = 8,
};
#endif
@@ -97,4 +97,6 @@ LIBURING_2.7 {
} LIBURING_2.6;
LIBURING_2.8 {
+ global:
+ io_uring_iowait_toggle;
} LIBURING_2.7;
@@ -110,6 +110,8 @@ static int _io_uring_get_cqe(struct io_uring *ring,
if (ring->int_flags & INT_FLAG_REG_RING)
flags |= IORING_ENTER_REGISTERED_RING;
+ if (ring->int_flags & INT_FLAG_NO_IOWAIT)
+ flags |= IORING_ENTER_NO_IOWAIT;
ret = __sys_io_uring_enter2(ring->enter_ring_fd, data->submit,
data->wait_nr, flags, data->arg,
data->sz);
@@ -687,3 +687,15 @@ int io_uring_free_buf_ring(struct io_uring *ring, struct io_uring_buf_ring *br,
__sys_munmap(br, nentries * sizeof(struct io_uring_buf));
return 0;
}
+
+/* Toggle iowait accounting for waits on @ring; -EOPNOTSUPP if unsupported. */
+int io_uring_iowait_toggle(struct io_uring *ring, bool enabled)
+{
+	unsigned int no_iowait = enabled ? 0 : INT_FLAG_NO_IOWAIT;
+
+	if (!(ring->features & IORING_FEAT_IOWAIT_TOGGLE))
+		return -EOPNOTSUPP;
+	/* INT_FLAG_NO_IOWAIT is the inverse of @enabled: clear it, then re-set */
+	ring->int_flags = (ring->int_flags & ~INT_FLAG_NO_IOWAIT) | no_iowait;
+	return 0;
+}
@@ -125,6 +125,7 @@ test_srcs := \
msg-ring-flags.c \
msg-ring-overflow.c \
multicqes_drain.c \
+ no-iowait.c \
no-mmap-inval.c \
nolibc.c \
nop-all-sizes.c \
new file mode 100644
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: test no iowait toggle
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sched.h>
+
+#include "helpers.h"
+#include "liburing.h"
+#include "../src/syscall.h"
+
+#define TIMEOUT_SEC 1
+#define PINNED_CPU 0
+
+/* Pin the caller (pid 0) to PINNED_CPU; returns 0 on success, 1 on failure. */
+static int pin_to_cpu(void)
+{
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(PINNED_CPU, &mask);
+	if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
+		return 1;
+	return 0;
+}
+
+/* iowait ticks of PINNED_CPU from /proc/stat; -1 if unreadable or unparsable */
+static int get_iowait(void)
+{
+	unsigned long long user, nice, system, idle, iowait;
+	char line[1024], cpu[10];
+	int sz, found = 0;
+	FILE *fp;
+
+	sz = snprintf(cpu, sizeof(cpu), "cpu%d", PINNED_CPU);
+	fp = fopen("/proc/stat", "r");
+	if (fp == NULL)
+		return -1;
+
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		if (strncmp(line, cpu, sz) != 0)
+			continue;
+		/* iowait is only valid if all five counters were converted */
+		found = sscanf(line, "%*s %llu %llu %llu %llu %llu", &user,
+			       &nice, &system, &idle, &iowait) == 5;
+		break;
+	}
+	fclose(fp);
+	return found ? (int)iowait : -1;
+}
+
+/*
+ * Arm a TIMEOUT_SEC timeout, block in io_uring_wait_cqe() until it fires and
+ * compare the iowait ticks /proc/stat charged to PINNED_CPU across the wait
+ * with what the @enabled setting should produce.
+ * Returns T_EXIT_SKIP when the toggle is unsupported, else T_EXIT_PASS/FAIL.
+ */
+static int test_iowait(struct io_uring *ring, bool enabled)
+{
+	struct __kernel_timespec ts = { .tv_sec = TIMEOUT_SEC, .tv_nsec = 0 };
+	struct io_uring_sqe *sqe;
+	struct io_uring_cqe *cqe;
+	int ret, before, after, delta;
+	long hz;
+	bool ok;
+
+	if (io_uring_iowait_toggle(ring, enabled) == -EOPNOTSUPP)
+		return T_EXIT_SKIP;
+
+	sqe = io_uring_get_sqe(ring);
+	if (!sqe) {
+		fprintf(stderr, "%s: get sqe failed\n", __func__);
+		return T_EXIT_FAIL;
+	}
+	io_uring_prep_timeout(sqe, &ts, 0, 0);
+	ret = io_uring_submit(ring);
+	if (ret <= 0) {
+		fprintf(stderr, "%s: sqe submit failed: %d\n", __func__, ret);
+		return T_EXIT_FAIL;
+	}
+
+	/* sample the counter on either side of the blocking wait */
+	before = get_iowait();
+	if (before < 0) {
+		fprintf(stderr, "%s: open /proc/stat failed\n", __func__);
+		return T_EXIT_FAIL;
+	}
+	ret = io_uring_wait_cqe(ring, &cqe);
+	if (ret < 0) {
+		fprintf(stderr, "%s: wait completion %d\n", __func__, ret);
+		return T_EXIT_FAIL;
+	}
+	ret = cqe->res;
+	io_uring_cqe_seen(ring, cqe);
+	if (ret != -ETIME) {
+		fprintf(stderr, "%s: Timeout: %s\n", __func__, strerror(-ret));
+		return T_EXIT_FAIL;
+	}
+	after = get_iowait();
+	if (after < 0) {
+		fprintf(stderr, "%s: open /proc/stat failed\n", __func__);
+		return T_EXIT_FAIL;
+	}
+
+	delta = after - before;
+	hz = sysconf(_SC_CLK_TCK);
+	if (enabled)	/* expect the whole wait, +/-10%, charged as iowait */
+		ok = delta > (TIMEOUT_SEC * hz * 9) / 10 &&
+		     delta < (TIMEOUT_SEC * hz * 11) / 10;
+	else		/* expect under a tenth of a second of iowait */
+		ok = delta < hz / 10;
+	return ok ? T_EXIT_PASS : T_EXIT_FAIL;
+}
+
+/*
+ * Pin to PINNED_CPU and run test_iowait() with iowait accounting enabled, then
+ * disabled, on one ring. Extra command line arguments make the test exit 0.
+ */
+int main(int argc, char *argv[])
+{
+	struct io_uring ring;
+	struct io_uring_params p = { };
+	int ret;
+
+	if (argc > 1)
+		return 0;
+
+	if (pin_to_cpu()) {
+		fprintf(stderr, "pinning to cpu%d failed\n", PINNED_CPU);
+		return 1;
+	}
+	if (io_uring_queue_init_params(8, &ring, &p)) {
+		fprintf(stderr, "ring setup failed\n");
+		return 1;
+	}
+
+	ret = test_iowait(&ring, true);
+	if (ret == T_EXIT_SKIP)
+		goto out;
+	if (ret) {
+		fprintf(stderr, "test_iowait with iowait enabled failed\n");
+		goto out;
+	}
+
+	ret = test_iowait(&ring, false);
+	if (ret)
+		fprintf(stderr, "test_iowait with iowait disabled failed\n");
+out:
+	/* tear the ring down on every path, not just on success */
+	io_uring_queue_exit(&ring);
+	return ret;
+}
Add io_uring_iowait_toggle() helper function for the userspace liburing side of IORING_ENTER_NO_IOWAIT flag added in io_uring for 6.12. This function toggles whether a ring sets in_iowait when waiting for completions. This is useful when waiting for multiple batched completions using e.g. io_uring_submit_and_wait_timeout() and userspace treats iowait time as CPU utilization. It works by keeping an internal flag INT_FLAG_NO_IOWAIT, which if set will set IORING_ENTER_NO_IOWAIT on every io_uring_enter(). Manpages are added/modified, a unit test is included, and io_uring.h is synced with the kernel side. Signed-off-by: David Wei <dw@davidwei.uk> --- v2: - edit manpages --- man/io_uring_enter.2 | 6 ++ man/io_uring_iowait_toggle.3 | 52 ++++++++++ src/include/liburing.h | 1 + src/include/liburing/io_uring.h | 2 + src/int_flags.h | 1 + src/liburing.map | 2 + src/queue.c | 2 + src/setup.c | 12 +++ test/Makefile | 1 + test/no-iowait.c | 162 ++++++++++++++++++++++++++++++++ 10 files changed, 241 insertions(+) create mode 100644 man/io_uring_iowait_toggle.3 create mode 100644 test/no-iowait.c