diff mbox series

[RFC,v2,09/13] io_uring: separate wq for ring polling

Message ID 0fbee0baf170cbfb8488773e61890fc78ed48d1e.1672713341.git.asml.silence@gmail.com (mailing list archive)
State New
Headers show
Series CQ waiting and wake up optimisations | expand

Commit Message

Pavel Begunkov Jan. 3, 2023, 3:04 a.m. UTC
Don't use ->cq_wait for ring polling but add a separate wait queue for
it. We need it for following patches.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/io_uring_types.h | 1 +
 io_uring/io_uring.c            | 3 ++-
 io_uring/io_uring.h            | 9 +++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

Comments

Jens Axboe Jan. 4, 2023, 6:08 p.m. UTC | #1
On 1/2/23 8:04 PM, Pavel Begunkov wrote:
> Don't use ->cq_wait for ring polling but add a separate wait queue for
> it. We need it for following patches.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  include/linux/io_uring_types.h | 1 +
>  io_uring/io_uring.c            | 3 ++-
>  io_uring/io_uring.h            | 9 +++++++++
>  3 files changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index dcd8a563ab52..cbcd3aaddd9d 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -286,6 +286,7 @@ struct io_ring_ctx {
>  		unsigned		cq_entries;
>  		struct io_ev_fd	__rcu	*io_ev_fd;
>  		struct wait_queue_head	cq_wait;
> +		struct wait_queue_head	poll_wq;
>  		unsigned		cq_extra;
>  	} ____cacheline_aligned_in_smp;
>  

Should we move poll_wq somewhere else, more out of the way? Would need to
gate the check a flag or something.
Pavel Begunkov Jan. 4, 2023, 8:28 p.m. UTC | #2
On 1/4/23 18:08, Jens Axboe wrote:
> On 1/2/23 8:04 PM, Pavel Begunkov wrote:
>> Don't use ->cq_wait for ring polling but add a separate wait queue for
>> it. We need it for following patches.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>>   include/linux/io_uring_types.h | 1 +
>>   io_uring/io_uring.c            | 3 ++-
>>   io_uring/io_uring.h            | 9 +++++++++
>>   3 files changed, 12 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>> index dcd8a563ab52..cbcd3aaddd9d 100644
>> --- a/include/linux/io_uring_types.h
>> +++ b/include/linux/io_uring_types.h
>> @@ -286,6 +286,7 @@ struct io_ring_ctx {
>>   		unsigned		cq_entries;
>>   		struct io_ev_fd	__rcu	*io_ev_fd;
>>   		struct wait_queue_head	cq_wait;
>> +		struct wait_queue_head	poll_wq;
>>   		unsigned		cq_extra;
>>   	} ____cacheline_aligned_in_smp;
>>   
> 
> Should we move poll_wq somewhere else, more out of the way?

If we care about polling perf and cache collisions with
cq_wait, yeah we can. In any case it's a good idea to at
least move it after cq_extra.

> Would need to gate the check a flag or something.

Not sure I follow
Jens Axboe Jan. 4, 2023, 8:34 p.m. UTC | #3
On 1/4/23 1:28?PM, Pavel Begunkov wrote:
> On 1/4/23 18:08, Jens Axboe wrote:
>> On 1/2/23 8:04?PM, Pavel Begunkov wrote:
>>> Don't use ->cq_wait for ring polling but add a separate wait queue for
>>> it. We need it for following patches.
>>>
>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>> ---
>>>   include/linux/io_uring_types.h | 1 +
>>>   io_uring/io_uring.c            | 3 ++-
>>>   io_uring/io_uring.h            | 9 +++++++++
>>>   3 files changed, 12 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>>> index dcd8a563ab52..cbcd3aaddd9d 100644
>>> --- a/include/linux/io_uring_types.h
>>> +++ b/include/linux/io_uring_types.h
>>> @@ -286,6 +286,7 @@ struct io_ring_ctx {
>>>           unsigned        cq_entries;
>>>           struct io_ev_fd    __rcu    *io_ev_fd;
>>>           struct wait_queue_head    cq_wait;
>>> +        struct wait_queue_head    poll_wq;
>>>           unsigned        cq_extra;
>>>       } ____cacheline_aligned_in_smp;
>>>   
>>
>> Should we move poll_wq somewhere else, more out of the way?
> 
> If we care about polling perf and cache collisions with
> cq_wait, yeah we can. In any case it's a good idea to at
> least move it after cq_extra.
> 
>> Would need to gate the check a flag or something.
> 
> Not sure I follow

I guess I could've been a bit more verbose... If we consider poll on the
io_uring rather uncommon, then moving the poll_wq outside of the hotter
cq_wait cacheline(s) would make sense. Each wait_queue_head is more than
a cacheline. Then we could have a flag in a spot that's hot anyway
whether to check it or not, eg in that same section as cq_wait.

Looking at the layout right now, we're at 116 bytes for that section, or
two cachelines with 12 bytes to spare. If we add poll_wq, then we'll be
at 196 bytes, which is 4 bytes over the next cacheline. So it'd
essentially double the size of that section. If we moved it outside of
the aligned sections, then it'd pack better.
Pavel Begunkov Jan. 4, 2023, 8:45 p.m. UTC | #4
On 1/4/23 20:34, Jens Axboe wrote:
> On 1/4/23 1:28?PM, Pavel Begunkov wrote:
>> On 1/4/23 18:08, Jens Axboe wrote:
>>> On 1/2/23 8:04?PM, Pavel Begunkov wrote:
>>>> Don't use ->cq_wait for ring polling but add a separate wait queue for
>>>> it. We need it for following patches.
>>>>
>>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>> ---
>>>>    include/linux/io_uring_types.h | 1 +
>>>>    io_uring/io_uring.c            | 3 ++-
>>>>    io_uring/io_uring.h            | 9 +++++++++
>>>>    3 files changed, 12 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>>>> index dcd8a563ab52..cbcd3aaddd9d 100644
>>>> --- a/include/linux/io_uring_types.h
>>>> +++ b/include/linux/io_uring_types.h
>>>> @@ -286,6 +286,7 @@ struct io_ring_ctx {
>>>>            unsigned        cq_entries;
>>>>            struct io_ev_fd    __rcu    *io_ev_fd;
>>>>            struct wait_queue_head    cq_wait;
>>>> +        struct wait_queue_head    poll_wq;
>>>>            unsigned        cq_extra;
>>>>        } ____cacheline_aligned_in_smp;
>>>>    
>>>
>>> Should we move poll_wq somewhere else, more out of the way?
>>
>> If we care about polling perf and cache collisions with
>> cq_wait, yeah we can. In any case it's a good idea to at
>> least move it after cq_extra.
>>
>>> Would need to gate the check a flag or something.
>>
>> Not sure I follow
> 
> I guess I could've been a bit more verbose... If we consider poll on the
> io_uring rather uncommon, then moving the poll_wq outside of the hotter
> cq_wait cacheline(s) would make sense. Each wait_queue_head is more than
> a cacheline.

Looks it's 24B, and wait_queue_entry is uncomfortable 40B.

> Then we could have a flag in a spot that's hot anyway
> whether to check it or not, eg in that same section as cq_wait.
> Looking at the layout right now, we're at 116 bytes for that section, or
> two cachelines with 12 bytes to spare. If we add poll_wq, then we'll be
> at 196 bytes, which is 4 bytes over the next cacheline. So it'd
> essentially double the size of that section. If we moved it outside of
> the aligned sections, then it'd pack better.

Than it's not about hotness and caches but rather memory
consumption due to padding, which is still a good argument.
Jens Axboe Jan. 4, 2023, 8:52 p.m. UTC | #5
On 1/4/23 1:34?PM, Jens Axboe wrote:
> On 1/4/23 1:28?PM, Pavel Begunkov wrote:
>> On 1/4/23 18:08, Jens Axboe wrote:
>>> On 1/2/23 8:04?PM, Pavel Begunkov wrote:
>>>> Don't use ->cq_wait for ring polling but add a separate wait queue for
>>>> it. We need it for following patches.
>>>>
>>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>> ---
>>>>   include/linux/io_uring_types.h | 1 +
>>>>   io_uring/io_uring.c            | 3 ++-
>>>>   io_uring/io_uring.h            | 9 +++++++++
>>>>   3 files changed, 12 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>>>> index dcd8a563ab52..cbcd3aaddd9d 100644
>>>> --- a/include/linux/io_uring_types.h
>>>> +++ b/include/linux/io_uring_types.h
>>>> @@ -286,6 +286,7 @@ struct io_ring_ctx {
>>>>           unsigned        cq_entries;
>>>>           struct io_ev_fd    __rcu    *io_ev_fd;
>>>>           struct wait_queue_head    cq_wait;
>>>> +        struct wait_queue_head    poll_wq;
>>>>           unsigned        cq_extra;
>>>>       } ____cacheline_aligned_in_smp;
>>>>   
>>>
>>> Should we move poll_wq somewhere else, more out of the way?
>>
>> If we care about polling perf and cache collisions with
>> cq_wait, yeah we can. In any case it's a good idea to at
>> least move it after cq_extra.
>>
>>> Would need to gate the check a flag or something.
>>
>> Not sure I follow
> 
> I guess I could've been a bit more verbose... If we consider poll on the
> io_uring rather uncommon, then moving the poll_wq outside of the hotter
> cq_wait cacheline(s) would make sense. Each wait_queue_head is more than
> a cacheline. Then we could have a flag in a spot that's hot anyway
> whether to check it or not, eg in that same section as cq_wait.
> 
> Looking at the layout right now, we're at 116 bytes for that section, or
> two cachelines with 12 bytes to spare. If we add poll_wq, then we'll be
> at 196 bytes, which is 4 bytes over the next cacheline. So it'd
> essentially double the size of that section. If we moved it outside of
> the aligned sections, then it'd pack better.

Just after writing this, I noticed that a spinlock took 64 bytes... In
other words, I have LOCKDEP enabled. The correct number is 24 bytes for
wait_queue_head which is obviously a lot more reasonable. It'd still
make that section one more cacheline since it's now at 60 bytes and
would grow to 84 bytes. But it's obviously not as big of a problem as I
had originally assumed.
Jens Axboe Jan. 4, 2023, 8:53 p.m. UTC | #6
On 1/4/23 1:45?PM, Pavel Begunkov wrote:
> On 1/4/23 20:34, Jens Axboe wrote:
>> On 1/4/23 1:28?PM, Pavel Begunkov wrote:
>>> On 1/4/23 18:08, Jens Axboe wrote:
>>>> On 1/2/23 8:04?PM, Pavel Begunkov wrote:
>>>>> Don't use ->cq_wait for ring polling but add a separate wait queue for
>>>>> it. We need it for following patches.
>>>>>
>>>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>>> ---
>>>>>    include/linux/io_uring_types.h | 1 +
>>>>>    io_uring/io_uring.c            | 3 ++-
>>>>>    io_uring/io_uring.h            | 9 +++++++++
>>>>>    3 files changed, 12 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>>>>> index dcd8a563ab52..cbcd3aaddd9d 100644
>>>>> --- a/include/linux/io_uring_types.h
>>>>> +++ b/include/linux/io_uring_types.h
>>>>> @@ -286,6 +286,7 @@ struct io_ring_ctx {
>>>>>            unsigned        cq_entries;
>>>>>            struct io_ev_fd    __rcu    *io_ev_fd;
>>>>>            struct wait_queue_head    cq_wait;
>>>>> +        struct wait_queue_head    poll_wq;
>>>>>            unsigned        cq_extra;
>>>>>        } ____cacheline_aligned_in_smp;
>>>>>    
>>>>
>>>> Should we move poll_wq somewhere else, more out of the way?
>>>
>>> If we care about polling perf and cache collisions with
>>> cq_wait, yeah we can. In any case it's a good idea to at
>>> least move it after cq_extra.
>>>
>>>> Would need to gate the check a flag or something.
>>>
>>> Not sure I follow
>>
>> I guess I could've been a bit more verbose... If we consider poll on the
>> io_uring rather uncommon, then moving the poll_wq outside of the hotter
>> cq_wait cacheline(s) would make sense. Each wait_queue_head is more than
>> a cacheline.
> 
> Looks it's 24B, and wait_queue_entry is uncomfortable 40B.

(also see followup email). Yes, it's only 24 bytes indeed.

>> Then we could have a flag in a spot that's hot anyway
>> whether to check it or not, eg in that same section as cq_wait.
>> Looking at the layout right now, we're at 116 bytes for that section, or
>> two cachelines with 12 bytes to spare. If we add poll_wq, then we'll be
>> at 196 bytes, which is 4 bytes over the next cacheline. So it'd
>> essentially double the size of that section. If we moved it outside of
>> the aligned sections, then it'd pack better.
> 
> Than it's not about hotness and caches but rather memory
> consumption due to padding, which is still a good argument.

Right, it's nice to not keep io_ring_ctx bigger than it needs to be. And
if moved out-of-line, then it'd pack better and we would not "waste"
another cacheline on adding this wait_queue_head for polling.
diff mbox series

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index dcd8a563ab52..cbcd3aaddd9d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -286,6 +286,7 @@  struct io_ring_ctx {
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
 		struct wait_queue_head	cq_wait;
+		struct wait_queue_head	poll_wq;
 		unsigned		cq_extra;
 	} ____cacheline_aligned_in_smp;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 682f4b086f09..42f512c42099 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -316,6 +316,7 @@  static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->cq_wait);
+	init_waitqueue_head(&ctx->poll_wq);
 	spin_lock_init(&ctx->completion_lock);
 	spin_lock_init(&ctx->timeout_lock);
 	INIT_WQ_LIST(&ctx->iopoll_list);
@@ -2768,7 +2769,7 @@  static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	struct io_ring_ctx *ctx = file->private_data;
 	__poll_t mask = 0;
 
-	poll_wait(file, &ctx->cq_wait, wait);
+	poll_wait(file, &ctx->poll_wq, wait);
 	/*
 	 * synchronizes with barrier from wq_has_sleeper call in
 	 * io_commit_cqring
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 9b7baeff5a1c..645ace377d7e 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -207,9 +207,18 @@  static inline void io_commit_cqring(struct io_ring_ctx *ctx)
 	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
 }
 
+static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->poll_wq))
+		__wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
+				poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+}
+
 /* requires smb_mb() prior, see wq_has_sleeper() */
 static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
 {
+	io_poll_wq_wake(ctx);
+
 	/*
 	 * Trigger waitqueue handler on all waiters on our waitqueue. This
 	 * won't necessarily wake up all the tasks, io_should_wake() will make