Message ID | 20240206162402.643507-2-axboe@kernel.dk (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Misc cleanups / optimizations | expand |
On 2/6/24 9:22 AM, Jens Axboe wrote: > We're out of space here, and none of the flags are easily reclaimable. > Bump it to 64-bits and re-arrange the struct a bit to avoid gaps. > > Add a specific bitwise type for the request flags, io_request_flags_t. > This will help catch violations of casting this value to a smaller type > on 32-bit archs, like unsigned int. > > No functional changes intended in this patch. Looks like I didn't run this through testing after making the 32-bit change, my bad. Here's v2 of it, no changes to the rest. diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 854ad67a5f70..5aef24ef467b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -468,70 +468,73 @@ enum { __REQ_F_LAST_BIT, }; +typedef u64 __bitwise io_req_flags_t; +#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno))) + enum { /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT), /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT), /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), + REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT), /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT), /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT), /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), + REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT), /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), + REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT), /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT), /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT), /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT), /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT), /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT), /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT), /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), + REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT), /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT), /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), + REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT), /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT), /* has creds assigned */ - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT), /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT), /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT), /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT), /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT), /* double poll may active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT), /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + REQ_F_PARTIAL_IO = IO_REQ_FLAG(REQ_F_PARTIAL_IO_BIT), /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), + REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), /* recvmsg special flag, clear EPOLLIN */ - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), + REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT), /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ - REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), + REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ - REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT), + REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); @@ -592,15 +595,14 @@ struct io_kiocb { * and after selection it points to the buffer ID itself. */ u16 buf_index; - unsigned int flags; - struct io_cqe cqe; + atomic_t refs; + + io_req_flags_t flags; struct io_ring_ctx *ctx; struct task_struct *task; - struct io_rsrc_node *rsrc_node; - union { /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; @@ -615,18 +617,23 @@ struct io_kiocb { struct io_buffer_list *buf_list; }; + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ + struct hlist_node hash_node; + union { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; /* cache ->apoll->events */ __poll_t apoll_events; }; - atomic_t refs; - atomic_t poll_refs; + + struct io_rsrc_node *rsrc_node; + + struct io_cqe cqe; + struct io_task_work io_task_work; + atomic_t poll_refs; unsigned nr_tw; - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - struct hlist_node hash_node; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 69454f1f98b0..3d7704a52b73 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -148,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work, __field( void *, req ) __field( u64, user_data ) __field( u8, opcode ) - __field( unsigned int, flags ) + __field( io_req_flags_t, flags ) __field( struct io_wq_work *, work ) __field( int, rw ) @@ -167,10 +167,10 @@ TRACE_EVENT(io_uring_queue_async_work, __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%lx, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, - __get_str(op_str), - __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) + __get_str(op_str), (long) __entry->flags, + __entry->rw ? "hashed" : "normal", __entry->work) ); /** @@ -378,7 +378,7 @@ TRACE_EVENT(io_uring_submit_req, __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) - __field( u32, flags ) + __field( io_req_flags_t, flags ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) @@ -395,10 +395,10 @@ TRACE_EVENT(io_uring_submit_req, __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%lx, " "sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), - __entry->flags, __entry->sq_thread) + (long) __entry->flags, __entry->sq_thread) ); /* diff --git a/io_uring/filetable.h b/io_uring/filetable.h index b47adf170c31..b2435c4dca1f 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -17,7 +17,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); int io_register_file_alloc_range(struct io_ring_ctx *ctx, struct io_uring_file_index_range __user *arg); -unsigned int io_file_get_flags(struct file *file); +io_req_flags_t io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cd9a137ad6ce..360a7ee41d3a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1768,9 +1768,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -unsigned int io_file_get_flags(struct file *file) +io_req_flags_t io_file_get_flags(struct file *file) { - unsigned int res = 0; + io_req_flags_t res = 0; if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; @@ -2171,7 +2171,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); + sqe_flags = READ_ONCE(sqe->flags); + req->flags = (io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->rsrc_node = NULL; @@ -4153,7 +4154,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(u64)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
On 2/6/24 16:22, Jens Axboe wrote: > We're out of space here, and none of the flags are easily reclaimable. > Bump it to 64-bits and re-arrange the struct a bit to avoid gaps. > > Add a specific bitwise type for the request flags, io_request_flags_t. > This will help catch violations of casting this value to a smaller type > on 32-bit archs, like unsigned int. > > No functional changes intended in this patch. > > Signed-off-by: Jens Axboe <axboe@kernel.dk> > --- > include/linux/io_uring_types.h | 87 ++++++++++++++++++--------------- > include/trace/events/io_uring.h | 14 +++--- > io_uring/filetable.h | 2 +- > io_uring/io_uring.c | 9 ++-- > 4 files changed, 60 insertions(+), 52 deletions(-) > > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h > index 854ad67a5f70..5ac18b05d4ee 100644 > --- a/include/linux/io_uring_types.h > +++ b/include/linux/io_uring_types.h > @@ -428,7 +428,7 @@ struct io_tw_state { > bool locked; > }; > > -enum { > +enum io_req_flags { > REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, > REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, > REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, > @@ -468,70 +468,73 @@ enum { > __REQ_F_LAST_BIT, > }; > > +typedef enum io_req_flags __bitwise io_req_flags_t; > +#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno))) > + > enum { > /* ctx owns file */ > - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), > + REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT), > /* drain existing IO first */ > - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), > + REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT), > /* linked sqes */ > - REQ_F_LINK = BIT(REQ_F_LINK_BIT), > + REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT), > /* doesn't sever on completion < 0 */ > - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), > + REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT), > /* IOSQE_ASYNC */ > - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), > + REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT), > /* IOSQE_BUFFER_SELECT */ > - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), > + REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT), > /* IOSQE_CQE_SKIP_SUCCESS */ > - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), > + REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT), > > /* fail rest of links */ > - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), > + REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT), > /* on inflight list, should be cancelled and waited on exit reliably */ > - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), > + REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT), > /* read/write uses file position */ > - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), > + REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT), > /* must not punt to workers */ > - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), > + REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT), > /* has or had linked timeout */ > - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), > + REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT), > /* needs cleanup */ > - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), > + REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT), > /* already went through poll handler */ > - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), > + REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT), > /* buffer already selected */ > - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), > + REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT), > /* buffer selected from ring, needs commit */ > - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), > + REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT), > /* caller should reissue async */ > - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), > + REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT), > /* supports async reads/writes */ > - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), > + REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT), > /* regular file */ > - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), > + REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT), > /* has creds assigned */ > - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), > + REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT), > /* skip refcounting if not set */ > - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), > + REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT), > /* there is a linked timeout that has to be armed */ > - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), > + REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT), > /* ->async_data allocated */ > - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), > + REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT), > /* don't post CQEs while failing linked requests */ > - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), > + REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT), > /* single poll may be active */ > - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), > + REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT), > /* double poll may active */ > - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), > + REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT), > /* request has already done partial IO */ > - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), > + REQ_F_PARTIAL_IO = IO_REQ_FLAG(REQ_F_PARTIAL_IO_BIT), > /* fast poll multishot mode */ > - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), > + REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), > /* recvmsg special flag, clear EPOLLIN */ > - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), > + REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT), > /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ > - REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), > + REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), > /* don't use lazy poll wake for this request */ > - REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT), > + REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), > }; > > typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); > @@ -592,15 +595,14 @@ struct io_kiocb { > * and after selection it points to the buffer ID itself. > */ > u16 buf_index; > - unsigned int flags; > > - struct io_cqe cqe; With the current layout the min number of lines we touch per request is 2 (including the op specific 64B), that's includes setting up cqe at init and using it for completing. Moving cqe down makes it 3. > + atomic_t refs; We're pulling it refs, which is not touched at all in the hot path. Even if there's a hole I'd argue it's better to leave it at the end. > + > + io_req_flags_t flags; > > struct io_ring_ctx *ctx; > struct task_struct *task; > > - struct io_rsrc_node *rsrc_node; It's used in hot paths, registered buffers/files, would be unfortunate to move it to the next line. > - > union { > /* store used ubuf, so we can prevent reloading */ > struct io_mapped_ubuf *imu; > @@ -615,18 +617,23 @@ struct io_kiocb { > struct io_buffer_list *buf_list; > }; > > + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ > + struct hlist_node hash_node; > + And we're pulling hash_node into the hottest line, which is used only when we arm a poll and remove poll. So, it's mostly for networking, sends wouldn't use it much, and multishots wouldn't normally touch it. As for ideas how to find space: 1) iopoll_completed completed can be converted to flags2 2) REQ_F_{SINGLE,DOUBLE}_POLL is a weird duplication. Can probably be combined into one flag, or removed at all. Again, sends are usually not so poll heavy and the hot path for recv is multishot. 3) we can probably move req->task down and replace it with get_task() { if (req->ctx->flags & DEFER_TASKRUN) task = ctx->submitter_task; else task = req->task; } The most common user of it -- task_work_add -- already checks the flag and has separate paths, and init/completion paths can be optimised. > union { > /* used by request caches, completion batching and iopoll */ > struct io_wq_work_node comp_list; > /* cache ->apoll->events */ > __poll_t apoll_events; > }; > - atomic_t refs; > - atomic_t poll_refs; > + > + struct io_rsrc_node *rsrc_node; > + > + struct io_cqe cqe; > + > struct io_task_work io_task_work; > + atomic_t poll_refs; > unsigned nr_tw; > - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ > - struct hlist_node hash_node; > /* internal polling, see IORING_FEAT_FAST_POLL */ > struct async_poll *apoll; > /* opcode allocated if it needs to store data for async defer */ > diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h > index 69454f1f98b0..3d7704a52b73 100644 > --- a/include/trace/events/io_uring.h > +++ b/include/trace/events/io_uring.h > @@ -148,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work, > __field( void *, req ) > __field( u64, user_data ) > __field( u8, opcode ) > - __field( unsigned int, flags ) > + __field( io_req_flags_t, flags ) > __field( struct io_wq_work *, work ) > __field( int, rw ) > > @@ -167,10 +167,10 @@ TRACE_EVENT(io_uring_queue_async_work, > __assign_str(op_str, io_uring_get_opcode(req->opcode)); > ), > > - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", > + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%lx, %s queue, work %p", > __entry->ctx, __entry->req, __entry->user_data, > - __get_str(op_str), > - __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) > + __get_str(op_str), (long) __entry->flags, > + __entry->rw ? "hashed" : "normal", __entry->work) > ); > > /** > @@ -378,7 +378,7 @@ TRACE_EVENT(io_uring_submit_req, > __field( void *, req ) > __field( unsigned long long, user_data ) > __field( u8, opcode ) > - __field( u32, flags ) > + __field( io_req_flags_t, flags ) > __field( bool, sq_thread ) > > __string( op_str, io_uring_get_opcode(req->opcode) ) > @@ -395,10 +395,10 @@ TRACE_EVENT(io_uring_submit_req, > __assign_str(op_str, io_uring_get_opcode(req->opcode)); > ), > > - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " > + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%lx, " > "sq_thread %d", __entry->ctx, __entry->req, > __entry->user_data, __get_str(op_str), > - __entry->flags, __entry->sq_thread) > + (long) __entry->flags, __entry->sq_thread) > ); > > /* > diff --git a/io_uring/filetable.h b/io_uring/filetable.h > index b47adf170c31..b2435c4dca1f 100644 > --- a/io_uring/filetable.h > +++ b/io_uring/filetable.h > @@ -17,7 +17,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); > int io_register_file_alloc_range(struct io_ring_ctx *ctx, > struct io_uring_file_index_range __user *arg); > > -unsigned int io_file_get_flags(struct file *file); > +io_req_flags_t io_file_get_flags(struct file *file); > > static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) > { > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c > index cd9a137ad6ce..360a7ee41d3a 100644 > --- a/io_uring/io_uring.c > +++ b/io_uring/io_uring.c > @@ -1768,9 +1768,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) > } > } > > -unsigned int io_file_get_flags(struct file *file) > +io_req_flags_t io_file_get_flags(struct file *file) > { > - unsigned int res = 0; > + io_req_flags_t res = 0; > > if (S_ISREG(file_inode(file)->i_mode)) > res |= REQ_F_ISREG; > @@ -2171,7 +2171,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, > /* req is partially pre-initialised, see io_preinit_req() */ > req->opcode = opcode = READ_ONCE(sqe->opcode); > /* same numerical values with corresponding REQ_F_*, safe to copy */ > - req->flags = sqe_flags = READ_ONCE(sqe->flags); > + sqe_flags = READ_ONCE(sqe->flags); > + req->flags = (io_req_flags_t) sqe_flags; > req->cqe.user_data = READ_ONCE(sqe->user_data); > req->file = NULL; > req->rsrc_node = NULL; > @@ -4153,7 +4154,7 @@ static int __init io_uring_init(void) > BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); > BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); > > - BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); > + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(u64)); > > BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); >
On 2/6/24 5:43 PM, Pavel Begunkov wrote: > On 2/6/24 16:22, Jens Axboe wrote: >> We're out of space here, and none of the flags are easily reclaimable. >> Bump it to 64-bits and re-arrange the struct a bit to avoid gaps. >> >> Add a specific bitwise type for the request flags, io_request_flags_t. >> This will help catch violations of casting this value to a smaller type >> on 32-bit archs, like unsigned int. >> >> No functional changes intended in this patch. >> >> Signed-off-by: Jens Axboe <axboe@kernel.dk> >> --- >> include/linux/io_uring_types.h | 87 ++++++++++++++++++--------------- >> include/trace/events/io_uring.h | 14 +++--- >> io_uring/filetable.h | 2 +- >> io_uring/io_uring.c | 9 ++-- >> 4 files changed, 60 insertions(+), 52 deletions(-) >> >> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h >> index 854ad67a5f70..5ac18b05d4ee 100644 >> --- a/include/linux/io_uring_types.h >> +++ b/include/linux/io_uring_types.h >> @@ -428,7 +428,7 @@ struct io_tw_state { >> bool locked; >> }; >> -enum { >> +enum io_req_flags { >> REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, >> REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, >> REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, >> @@ -468,70 +468,73 @@ enum { >> __REQ_F_LAST_BIT, >> }; >> +typedef enum io_req_flags __bitwise io_req_flags_t; >> +#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno))) >> + >> enum { >> /* ctx owns file */ >> - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), >> + REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT), >> /* drain existing IO first */ >> - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), >> + REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT), >> /* linked sqes */ >> - REQ_F_LINK = BIT(REQ_F_LINK_BIT), >> + REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT), >> /* doesn't sever on completion < 0 */ >> - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), >> + REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT), >> /* IOSQE_ASYNC */ >> - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), >> + REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT), >> /* IOSQE_BUFFER_SELECT */ >> - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), >> + REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT), >> /* IOSQE_CQE_SKIP_SUCCESS */ >> - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), >> + REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT), >> /* fail rest of links */ >> - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), >> + REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT), >> /* on inflight list, should be cancelled and waited on exit reliably */ >> - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), >> + REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT), >> /* read/write uses file position */ >> - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), >> + REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT), >> /* must not punt to workers */ >> - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), >> + REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT), >> /* has or had linked timeout */ >> - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), >> + REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT), >> /* needs cleanup */ >> - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), >> + REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT), >> /* already went through poll handler */ >> - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), >> + REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT), >> /* buffer already selected */ >> - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), >> + REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT), >> /* buffer selected from ring, needs commit */ >> - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), >> + REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT), >> /* caller should reissue async */ >> - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), >> + REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT), >> /* supports async reads/writes */ >> - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), >> + REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT), >> /* regular file */ >> - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), >> + REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT), >> /* has creds assigned */ >> - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), >> + REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT), >> /* skip refcounting if not set */ >> - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), >> + REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT), >> /* there is a linked timeout that has to be armed */ >> - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), >> + REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT), >> /* ->async_data allocated */ >> - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), >> + REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT), >> /* don't post CQEs while failing linked requests */ >> - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), >> + REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT), >> /* single poll may be active */ >> - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), >> + REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT), >> /* double poll may active */ >> - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), >> + REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT), >> /* request has already done partial IO */ >> - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), >> + REQ_F_PARTIAL_IO = IO_REQ_FLAG(REQ_F_PARTIAL_IO_BIT), >> /* fast poll multishot mode */ >> - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), >> + REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), >> /* recvmsg special flag, clear EPOLLIN */ >> - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), >> + REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT), >> /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ >> - REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), >> + REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), >> /* don't use lazy poll wake for this request */ >> - REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT), >> + REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), >> }; >> typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); >> @@ -592,15 +595,14 @@ struct io_kiocb { >> * and after selection it points to the buffer ID itself. >> */ >> u16 buf_index; >> - unsigned int flags; >> - struct io_cqe cqe; > > With the current layout the min number of lines we touch per > request is 2 (including the op specific 64B), that's includes > setting up cqe at init and using it for completing. Moving cqe > down makes it 3. > >> + atomic_t refs; > > We're pulling it refs, which is not touched at all in the hot > path. Even if there's a hole I'd argue it's better to leave it > at the end. > >> + >> + io_req_flags_t flags; >> struct io_ring_ctx *ctx; >> struct task_struct *task; >> - struct io_rsrc_node *rsrc_node; > > It's used in hot paths, registered buffers/files, would be > unfortunate to move it to the next line. Yep I did feel a bit bad about that one... Let me take another stab at it. >> - >> union { >> /* store used ubuf, so we can prevent reloading */ >> struct io_mapped_ubuf *imu; >> @@ -615,18 +617,23 @@ struct io_kiocb { >> struct io_buffer_list *buf_list; >> }; >> + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ >> + struct hlist_node hash_node; >> + > > And we're pulling hash_node into the hottest line, which is > used only when we arm a poll and remove poll. So, it's mostly > for networking, sends wouldn't use it much, and multishots > wouldn't normally touch it. > > As for ideas how to find space: > 1) iopoll_completed completed can be converted to flags2 That's a good idea, but won't immediately find any space as it'd just leave a hole anyway. But would be good to note in there perhaps, you never know when it needs re-arranging again. > 2) REQ_F_{SINGLE,DOUBLE}_POLL is a weird duplication. Can > probably be combined into one flag, or removed at all. > Again, sends are usually not so poll heavy and the hot > path for recv is multishot. Normal receive is also a hot path, even if multishot should be preferred in general. Ditto on non-sockets but still pollable files, doing eg read for example. > 3) we can probably move req->task down and replace it with > > get_task() { > if (req->ctx->flags & DEFER_TASKRUN) > task = ctx->submitter_task; > else > task = req->task; > } Assuming ctx flags is hot, which is would generally be, that's not a bad idea at all. I'll do another loop over this one.
On 2/7/24 02:18, Jens Axboe wrote: > On 2/6/24 5:43 PM, Pavel Begunkov wrote: >> On 2/6/24 16:22, Jens Axboe wrote: >>> We're out of space here, and none of the flags are easily reclaimable. >>> Bump it to 64-bits and re-arrange the struct a bit to avoid gaps. >>> >>> Add a specific bitwise type for the request flags, io_request_flags_t. >>> This will help catch violations of casting this value to a smaller type >>> on 32-bit archs, like unsigned int. >>> >>> No functional changes intended in this patch. >>> >>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>> --- ... >>> typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); >>> @@ -592,15 +595,14 @@ struct io_kiocb { >>> * and after selection it points to the buffer ID itself. >>> */ >>> u16 buf_index; >>> - unsigned int flags; >>> - struct io_cqe cqe; >> >> With the current layout the min number of lines we touch per >> request is 2 (including the op specific 64B), that's includes >> setting up cqe at init and using it for completing. Moving cqe >> down makes it 3. >> >>> + atomic_t refs; >> >> We're pulling it refs, which is not touched at all in the hot >> path. Even if there's a hole I'd argue it's better to leave it >> at the end. >> >>> + >>> + io_req_flags_t flags; >>> struct io_ring_ctx *ctx; >>> struct task_struct *task; >>> - struct io_rsrc_node *rsrc_node; >> >> It's used in hot paths, registered buffers/files, would be >> unfortunate to move it to the next line. > > Yep I did feel a bit bad about that one... Let me take another stab at > it. > >>> - >>> union { >>> /* store used ubuf, so we can prevent reloading */ >>> struct io_mapped_ubuf *imu; >>> @@ -615,18 +617,23 @@ struct io_kiocb { >>> struct io_buffer_list *buf_list; >>> }; >>> + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ >>> + struct hlist_node hash_node; >>> + >> >> And we're pulling hash_node into the hottest line, which is >> used only when we arm a poll and remove poll. So, it's mostly >> for networking, sends wouldn't use it much, and multishots >> wouldn't normally touch it. >> >> As for ideas how to find space: >> 1) iopoll_completed completed can be converted to flags2 > > That's a good idea, but won't immediately find any space as it'd just > leave a hole anyway. But would be good to note in there perhaps, you > never know when it needs re-arranging again. struct io_kiocb { unsigned flags; ... u8 flags2; }; I rather proposed to have this, which is definitely borderline ugly but certainly an option. >> 2) REQ_F_{SINGLE,DOUBLE}_POLL is a weird duplication. Can >> probably be combined into one flag, or removed at all. >> Again, sends are usually not so poll heavy and the hot >> path for recv is multishot. > > Normal receive is also a hot path, even if multishot should be preferred The degree of hotness is arguable. It's poll, which takes a spinlock (and disables irqs), does an indirect call, goes into he socket internals there touching pretty contended parts like sock_wq. The relative overhead of looking at f_ops should be nothing. But the thought was more about combining them, REQ_F_POLL_ACTIVE, and clear only if it's not double poll. > in general. Ditto on non-sockets but still pollable files, doing eg read > for example. > >> 3) we can probably move req->task down and replace it with >> >> get_task() { >> if (req->ctx->flags & DEFER_TASKRUN) >> task = ctx->submitter_task; >> else >> task = req->task; >> } > > Assuming ctx flags is hot, which is would generally be, that's not a bad > idea at all. As mentioned, task_work_add would be the main user, and there is already a different branch for DEFER_TASKRUN, to it implicitly knows that ctx->submitter_task is correct. > I'll do another loop over this one.
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 854ad67a5f70..5ac18b05d4ee 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -428,7 +428,7 @@ struct io_tw_state { bool locked; }; -enum { +enum io_req_flags { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, @@ -468,70 +468,73 @@ enum { __REQ_F_LAST_BIT, }; +typedef enum io_req_flags __bitwise io_req_flags_t; +#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno))) + enum { /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT), /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT), /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), + REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT), /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT), /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT), /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), + REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT), /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), + REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT), /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT), /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT), /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT), /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT), /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT), /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT), /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), + REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT), /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT), /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), + REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT), /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT), /* has creds assigned */ - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT), /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT), /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT), /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT), /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT), /* double poll may active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT), /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + REQ_F_PARTIAL_IO = IO_REQ_FLAG(REQ_F_PARTIAL_IO_BIT), /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), + REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), /* recvmsg special flag, clear EPOLLIN */ - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), + REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT), /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ - REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), + REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ - REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT), + REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); @@ -592,15 +595,14 @@ struct io_kiocb { * and after selection it points to the buffer ID itself. */ u16 buf_index; - unsigned int flags; - struct io_cqe cqe; + atomic_t refs; + + io_req_flags_t flags; struct io_ring_ctx *ctx; struct task_struct *task; - struct io_rsrc_node *rsrc_node; - union { /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; @@ -615,18 +617,23 @@ struct io_kiocb { struct io_buffer_list *buf_list; }; + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ + struct hlist_node hash_node; + union { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; /* cache ->apoll->events */ __poll_t apoll_events; }; - atomic_t refs; - atomic_t poll_refs; + + struct io_rsrc_node *rsrc_node; + + struct io_cqe cqe; + struct io_task_work io_task_work; + atomic_t poll_refs; unsigned nr_tw; - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - struct hlist_node hash_node; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 69454f1f98b0..3d7704a52b73 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -148,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work, __field( void *, req ) __field( u64, user_data ) __field( u8, opcode ) - __field( unsigned int, flags ) + __field( io_req_flags_t, flags ) __field( struct io_wq_work *, work ) __field( int, rw ) @@ -167,10 +167,10 @@ TRACE_EVENT(io_uring_queue_async_work, __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%lx, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, - __get_str(op_str), - __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) + __get_str(op_str), (long) __entry->flags, + __entry->rw ? "hashed" : "normal", __entry->work) ); /** @@ -378,7 +378,7 @@ TRACE_EVENT(io_uring_submit_req, __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) - __field( u32, flags ) + __field( io_req_flags_t, flags ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) @@ -395,10 +395,10 @@ TRACE_EVENT(io_uring_submit_req, __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%lx, " "sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), - __entry->flags, __entry->sq_thread) + (long) __entry->flags, __entry->sq_thread) ); /* diff --git a/io_uring/filetable.h b/io_uring/filetable.h index b47adf170c31..b2435c4dca1f 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -17,7 +17,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); int io_register_file_alloc_range(struct io_ring_ctx *ctx, struct io_uring_file_index_range __user *arg); -unsigned int io_file_get_flags(struct file *file); +io_req_flags_t io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cd9a137ad6ce..360a7ee41d3a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1768,9 +1768,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -unsigned int io_file_get_flags(struct file *file) +io_req_flags_t io_file_get_flags(struct file *file) { - unsigned int res = 0; + io_req_flags_t res = 0; if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; @@ -2171,7 +2171,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); + sqe_flags = READ_ONCE(sqe->flags); + req->flags = (io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->rsrc_node = NULL; @@ -4153,7 +4154,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(u64)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
We're out of space here, and none of the flags are easily reclaimable. Bump it to 64-bits and re-arrange the struct a bit to avoid gaps. Add a specific bitwise type for the request flags, io_request_flags_t. This will help catch violations of casting this value to a smaller type on 32-bit archs, like unsigned int. No functional changes intended in this patch. Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/io_uring_types.h | 87 ++++++++++++++++++--------------- include/trace/events/io_uring.h | 14 +++--- io_uring/filetable.h | 2 +- io_uring/io_uring.c | 9 ++-- 4 files changed, 60 insertions(+), 52 deletions(-)