@@ -224,6 +224,12 @@ struct io_alloc_cache {
size_t elem_size;
};
+struct iopoll_info { /* per-device adaptive-iopoll record, stored in ctx->poll_array keyed by the file's i_rdev */
+ bool poll_state; /* value returned by get_poll_queue_state() when the record was created */
+ long last_runtime; /* tv_nsec difference between iopoll_start and iopoll_end of the last completed request */
+ long last_irqtime; /* how much longer than requested the last io_delay() sleep took */
+};
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -421,6 +427,7 @@ struct io_ring_ctx {
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
+ struct xarray poll_array;
};
struct io_tw_state {
@@ -641,6 +648,11 @@ struct io_kiocb {
u64 extra1;
u64 extra2;
} big_cqe;
+ /* for adaptive iopoll */
+ int poll_flag;
+ bool poll_state;
+ struct timespec64 iopoll_start;
+ struct timespec64 iopoll_end;
};
struct io_overflow_cqe {
@@ -198,6 +198,7 @@ enum {
* Removes indirection through the SQ index array.
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
+#define IORING_SETUP_NO_POLLQUEUE (1U << 17) /* adaptive iopoll: track per-device poll-queue state and use a timed sleep for requests on devices without one */
enum io_uring_op {
IORING_OP_NOP,
@@ -79,6 +79,8 @@
#include <uapi/linux/io_uring.h>
+#include <linux/time.h>
+#include <linux/timekeeping.h>
#include "io-wq.h"
#include "io_uring.h"
@@ -122,6 +124,9 @@
#define IO_COMPL_BATCH 32
#define IO_REQ_ALLOC_BATCH 8
+#define IO_POLL_QUEUE 1 /* get_poll_queue_state(): QUEUE_FLAG_POLL is set on the device queue */
+#define IO_NO_POLL_QUEUE 0 /* get_poll_queue_state(): QUEUE_FLAG_POLL is clear on the device queue */
+
enum {
IO_CHECK_CQ_OVERFLOW_BIT,
IO_CHECK_CQ_DROPPED_BIT,
@@ -311,6 +316,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
+ xa_init(&ctx->poll_array);
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
@@ -1875,11 +1881,32 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
return !!req->file;
}
+/*
+ * Report whether the device behind req->file has a poll queue. Returns
+ * IO_NO_POLL_QUEUE when QUEUE_FLAG_POLL is clear, otherwise IO_POLL_QUEUE.
+ * IO_POLL_QUEUE is also returned when i_rdev is zero or the device lookup
+ * fails, so such requests keep the regular iopoll path.
+ */
+int get_poll_queue_state(struct io_kiocb *req)
+{
+	dev_t dev = req->file->f_inode->i_rdev;
+	struct block_device *bdev = dev ? blkdev_get_no_open(dev) : NULL;
+	int state = IO_POLL_QUEUE;
+
+	if (!bdev)	/* no device number, or blkdev_get_no_open() found nothing */
+		return state;
+	if (!test_bit(QUEUE_FLAG_POLL, &bdev->bd_queue->queue_flags))
+		state = IO_NO_POLL_QUEUE;
+	blkdev_put_no_open(bdev);	/* drop the reference taken by the lookup */
+	return state;
+}
+
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
const struct cred *creds = NULL;
+ struct io_ring_ctx *ctx = req->ctx;
int ret;
+ u32 index;
if (unlikely(!io_assign_file(req, def, issue_flags)))
return -EBADF;
@@ -1890,6 +1917,21 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (!def->audit_skip)
audit_uring_entry(req->opcode);
+ if (ctx->flags & IORING_SETUP_NO_POLLQUEUE) {
+ index = req->file->f_inode->i_rdev;
+ struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+
+ if (!entry) {
+ entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
+ entry->poll_state = get_poll_queue_state(req);
+ entry->last_runtime = 0;
+ entry->last_irqtime = 0;
+ xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
+ }
+ req->poll_state = entry->poll_state;
+ ktime_get_ts64(&req->iopoll_start);
+ }
+
ret = def->issue(req, issue_flags);
if (!def->audit_skip)
@@ -2176,6 +2218,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->file = NULL;
req->rsrc_node = NULL;
req->task = current;
+ req->poll_flag = 0;
+ req->poll_state = 1;
if (unlikely(opcode >= IORING_OP_LAST)) {
req->opcode = 0;
@@ -2921,6 +2965,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
+ xa_destroy(&ctx->poll_array);
kfree(ctx);
}
@@ -4050,7 +4095,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
- IORING_SETUP_NO_SQARRAY))
+ IORING_SETUP_NO_SQARRAY | IORING_SETUP_NO_POLLQUEUE))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -125,6 +125,8 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
__io_req_task_work_add(req, 0);
}
+#define LEFT_TIME 3000 /* io_delay() skips sleeping when last_runtime - last_irqtime is below this (same unit as those fields, ns) */
+
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
@@ -1118,6 +1118,44 @@ void io_rw_fail(struct io_kiocb *req)
io_req_set_res(req, res, req->cqe.flags);
}
+/*
+ * Sleep once, for half of (last_runtime - last_irqtime), while waiting for a
+ * request on a device without a poll queue.
+ * @req:   request being waited for; poll_flag is set so that it sleeps at
+ *         most once per request.
+ * @entry: per-device timing record; last_irqtime is updated with how much
+ *         longer than requested the sleep took. NULL means no record: no-op.
+ */
+void io_delay(struct io_kiocb *req, struct iopoll_info *entry)
+{
+	struct hrtimer_sleeper timer;
+	long sleep_ns;
+	u64 start;
+
+	if (!entry || req->poll_flag == 1)
+		return;
+
+	/* Not enough headroom between completion time and wakeup overhead. */
+	if (entry->last_runtime <= entry->last_irqtime ||
+	    (entry->last_runtime - entry->last_irqtime) < LEFT_TIME)
+		return;
+
+	req->poll_flag = 1;
+	sleep_ns = (entry->last_runtime - entry->last_irqtime) / 2;
+	start = ktime_get_ns();
+
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_set_expires(&timer.timer, ktime_set(0, sleep_ns));
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, HRTIMER_MODE_REL);
+	if (timer.task)
+		io_schedule();
+	hrtimer_cancel(&timer.timer);
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+	/* Whole-nanosecond delta; a tv_nsec-only delta wraps at second boundaries. */
+	entry->last_irqtime = (long)(ktime_get_ns() - start) - sleep_ns;
+}
+
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
struct io_wq_work_node *pos, *start, *prev;
@@ -1136,12 +1174,28 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
struct file *file = req->file;
int ret;
+ u32 index = file->f_inode->i_rdev;
/*
* Move completed and retryable entries to our local lists.
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
+
+ if ((ctx->flags & IORING_SETUP_NO_POLLQUEUE) && !req->poll_state) {
+ struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+
+ do {
+ if (READ_ONCE(req->iopoll_completed)) {
+ ktime_get_ts64(&req->iopoll_end);
+ entry->last_runtime = req->iopoll_end.tv_nsec - req->iopoll_start.tv_nsec;
+ break;
+ }
+ io_delay(req, entry);
+ } while (1);
+ goto complete;
+ }
+
if (READ_ONCE(req->iopoll_completed))
break;
@@ -1172,6 +1226,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
else if (!pos)
return 0;
+complete:
prev = start;
wq_list_for_each_resume(pos, prev) {
struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);