[2/3] io_uring/msg_ring: avoid double indirection task_work for data messages

Message ID 20240524230501.20178-3-axboe@kernel.dk (mailing list archive)
State New
Series Improve MSG_RING SINGLE_ISSUER performance

Commit Message

Jens Axboe May 24, 2024, 10:58 p.m. UTC
If IORING_SETUP_SINGLE_ISSUER is set, then we can't post CQEs remotely
to the target ring. Instead, task_work is queued for the target ring,
which is used to post the CQE. To make matters worse, once the target
CQE has been posted, task_work is then queued with the originator to
fill the completion.

This obviously adds a bunch of overhead and latency. Instead of relying
on generic kernel task_work for this, fill an overflow entry on the
target ring and flag it such that the target ring will flush it. This
avoids the task_work for posting the target CQE, and it means that the
originator CQE can be filled inline as well.

In local testing, this reduces the latency on the sender side by 5-6x.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/msg_ring.c | 77 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 74 insertions(+), 3 deletions(-)
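
For readers less familiar with MSG_RING, here is a minimal, hypothetical
liburing sketch of the scenario being optimized: a sender ring posting a data
message into a target ring set up with IORING_SETUP_SINGLE_ISSUER plus
IORING_SETUP_DEFER_TASKRUN, which is what forces the remote posting path.
Error handling is omitted, the two rings would normally be owned by different
threads, and it assumes a liburing recent enough to provide
io_uring_prep_msg_ring().

#include <liburing.h>

int main(void)
{
	struct io_uring sender, target;
	struct io_uring_params p = { };
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	/* Target ring: only its owner task may post completions */
	p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
	io_uring_queue_init_params(8, &target, &p);

	/* Plain sender ring */
	io_uring_queue_init(8, &sender);

	/*
	 * Queue a data message: 0x1234 and 42 show up in the target's CQE
	 * as cqe->user_data and cqe->res.
	 */
	sqe = io_uring_get_sqe(&sender);
	io_uring_prep_msg_ring(sqe, target.ring_fd, 42, 0x1234, 0);
	io_uring_submit(&sender);

	/* The sender reaps its own completion for the MSG_RING request... */
	io_uring_wait_cqe(&sender, &cqe);
	io_uring_cqe_seen(&sender, cqe);

	/*
	 * ...and the target (normally its owner thread) reaps the message
	 * CQE, which this patch fills via the overflow list instead of
	 * bouncing through task_work.
	 */
	io_uring_wait_cqe(&target, &cqe);
	io_uring_cqe_seen(&target, cqe);

	io_uring_queue_exit(&sender);
	io_uring_queue_exit(&target);
	return 0;
}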

Comments

Pavel Begunkov May 28, 2024, 1:18 p.m. UTC | #1
On 5/24/24 23:58, Jens Axboe wrote:
> [...]
> +static void io_msg_add_overflow(struct io_msg *msg,
> +				struct io_ring_ctx *target_ctx,
> +				struct io_overflow_cqe *ocqe, int ret)
> +	__releases(target_ctx->uring_lock)
> +{
> +	spin_lock(&target_ctx->completion_lock);
> +
> +	if (list_empty(&target_ctx->cq_overflow_list)) {
> +		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &target_ctx->check_cq);
> +		atomic_or(IORING_SQ_TASKRUN, &target_ctx->rings->sq_flags);

TASKRUN? The normal overflow path sets IORING_SQ_CQ_OVERFLOW


Pavel Begunkov May 28, 2024, 1:32 p.m. UTC | #2
On 5/24/24 23:58, Jens Axboe wrote:
> [...]
> +static struct io_overflow_cqe *io_alloc_overflow(struct io_ring_ctx *target_ctx)
> +{
> +	bool is_cqe32 = target_ctx->flags & IORING_SETUP_CQE32;
> +	size_t cqe_size = sizeof(struct io_overflow_cqe);
> +	struct io_overflow_cqe *ocqe;
> +
> +	if (is_cqe32)
> +		cqe_size += sizeof(struct io_uring_cqe);
> +
> +	ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);

__GFP_ACCOUNT looks painful

Jens Axboe May 28, 2024, 2:23 p.m. UTC | #3
On 5/28/24 7:32 AM, Pavel Begunkov wrote:
> On 5/24/24 23:58, Jens Axboe wrote:
>> [...]
>> +    ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);
> 
> __GFP_ACCOUNT looks painful

It always is - I did add the usual alloc cache for this after posting
this series, which makes it a no-op basically:

https://git.kernel.dk/cgit/linux/commit/?h=io_uring-msg_ring&id=c39ead262b60872d6d7daf55e9fc7d76dc09b29d

Just haven't posted a v2 yet.
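
For context, the cache's job is simply to recycle previously allocated
overflow entries so the common case skips kmalloc(GFP_ATOMIC | __GFP_ACCOUNT)
entirely. The sketch below only illustrates that idea with made-up names; the
linked commit builds on io_uring's existing alloc cache infrastructure rather
than a hand-rolled list, and a real version also has to account for the two
CQE sizes.

struct msg_ocqe_cache {
	spinlock_t		lock;
	struct list_head	free_list;	/* cached struct io_overflow_cqe */
	unsigned int		nr_free;
};

/* Grab a cached entry if one exists, otherwise fall back to kmalloc() */
static struct io_overflow_cqe *msg_cache_get(struct msg_ocqe_cache *cache,
					     size_t cqe_size)
{
	struct io_overflow_cqe *ocqe = NULL;

	spin_lock(&cache->lock);
	if (!list_empty(&cache->free_list)) {
		ocqe = list_first_entry(&cache->free_list,
					struct io_overflow_cqe, list);
		list_del(&ocqe->list);
		cache->nr_free--;
	}
	spin_unlock(&cache->lock);

	if (!ocqe)
		ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);
	return ocqe;
}

/* Called once the target has consumed the entry; keep a bounded number */
static void msg_cache_put(struct msg_ocqe_cache *cache,
			  struct io_overflow_cqe *ocqe)
{
	spin_lock(&cache->lock);
	if (cache->nr_free < 32) {
		list_add(&ocqe->list, &cache->free_list);
		cache->nr_free++;
		ocqe = NULL;
	}
	spin_unlock(&cache->lock);
	kfree(ocqe);		/* kfree(NULL) is a no-op */
}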
Jens Axboe May 28, 2024, 2:23 p.m. UTC | #4
On 5/28/24 7:18 AM, Pavel Begunkov wrote:
> On 5/24/24 23:58, Jens Axboe wrote:
>> [...]
>> +static void io_msg_add_overflow(struct io_msg *msg,
>> +                struct io_ring_ctx *target_ctx,
>> +                struct io_overflow_cqe *ocqe, int ret)
>> +    __releases(target_ctx->uring_lock)
>> +{
>> +    spin_lock(&target_ctx->completion_lock);
>> +
>> +    if (list_empty(&target_ctx->cq_overflow_list)) {
>> +        set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &target_ctx->check_cq);
>> +        atomic_or(IORING_SQ_TASKRUN, &target_ctx->rings->sq_flags);
> 
> TASKRUN? The normal overflow path sets IORING_SQ_CQ_OVERFLOW

Was a bit split on it - we want it run as part of waiting, but I also
wasn't super interested in exposing it to userspace as an overflow
condition, since as it stands it's more of an internal implementation
detail.
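
For reference, the distinction being discussed is just which userspace-visible
hint accompanies the internal overflow bit. The generic overflow path in
io_uring.c does roughly the following, advertising an overflow condition,
while the patch reuses the task_work hint so the target only learns that it
has work to process on its next enter/wait:

/* Generic CQE overflow path (io_cqring_event_overflow(), roughly) */
if (list_empty(&ctx->cq_overflow_list)) {
	set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
	atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}

/* This patch: same internal bit, different userspace-visible hint */
if (list_empty(&target_ctx->cq_overflow_list)) {
	set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &target_ctx->check_cq);
	atomic_or(IORING_SQ_TASKRUN, &target_ctx->rings->sq_flags);
}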
Pavel Begunkov May 28, 2024, 4:23 p.m. UTC | #5
On 5/28/24 15:23, Jens Axboe wrote:
> On 5/28/24 7:32 AM, Pavel Begunkov wrote:
>> On 5/24/24 23:58, Jens Axboe wrote:
>>> [...]
>>> +    ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);
>>
>> __GFP_ACCOUNT looks painful
> 
> It always is - I did add the usual alloc cache for this after posting
> this series, which makes it a no-op basically:

Simple ring private cache wouldn't work so well with non
uniform transfer distributions. One way messaging, userspace
level batching, etc., but the main question is in the other
email, i.e. maybe it's better to go with the 2 tw hop model,
which returns memory back where it came from.

Jens Axboe May 28, 2024, 5:59 p.m. UTC | #6
On 5/28/24 10:23 AM, Pavel Begunkov wrote:
> On 5/28/24 15:23, Jens Axboe wrote:
>> On 5/28/24 7:32 AM, Pavel Begunkov wrote:
>>> On 5/24/24 23:58, Jens Axboe wrote:
>>>> [...]
>>>> +    ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);
>>>
>>> __GFP_ACCOUNT looks painful
>>
>> It always is - I did add the usual alloc cache for this after posting
>> this series, which makes it a no-op basically:
> 
> Simple ring private cache wouldn't work so well with non
> uniform transfer distributions. One way messaging, userspace
> level batching, etc., but the main question is in the other
> email, i.e. maybe it's better to go with the 2 tw hop model,
> which returns memory back where it came from.

The cache is local to the ring, so anyone that sends messages to that
ring gets to use it. So I believe it should in fact work really well. If
messaging is bidirectional, then caching on the target will apply in
both directions.
Pavel Begunkov May 29, 2024, 2:04 a.m. UTC | #7
On 5/28/24 18:59, Jens Axboe wrote:
> On 5/28/24 10:23 AM, Pavel Begunkov wrote:
>> On 5/28/24 15:23, Jens Axboe wrote:
>>> On 5/28/24 7:32 AM, Pavel Begunkov wrote:
>>>> On 5/24/24 23:58, Jens Axboe wrote:
>>>>> [...]
>>>>> +    ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);
>>>>
>>>> __GFP_ACCOUNT looks painful
>>>
>>> It always is - I did add the usual alloc cache for this after posting
>>> this series, which makes it a no-op basically:
>>
>> Simple ring private cache wouldn't work so well with non
>> uniform transfer distributions. One way messaging, userspace
>> level batching, etc., but the main question is in the other
>> email, i.e. maybe it's better to go with the 2 tw hop model,
>> which returns memory back where it came from.
> 
> The cache is local to the ring, so anyone that sends messages to that
> ring gets to use it. So I believe it should in fact work really well. If
> messaging is bidirectional, then caching on the target will apply in
> both directions.

*taking a look at the patch* it gets the entry from the target's
ring, so indeed not a problem. Taking the target lock for that,
however, is not the best, I ranted before about inter dependencies
b/w rings. E.g. requests messaging a ring run by a task CPU bound
in submission / tw execution would be directed to iowq and occupy
a worker thread for the time being.
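
To make the io-wq concern concrete: in the non-blocking issue path the sender
can only trylock the target's uring_lock, and a failed trylock becomes
-EAGAIN, which punts the whole request to an io-wq worker. Paraphrasing the
existing helper in msg_ring.c:

static int io_double_lock_ctx(struct io_ring_ctx *octx,
			      unsigned int issue_flags)
{
	/*
	 * To ensure proper ordering between the two ctxs, only a trylock
	 * on the target is attempted while already holding our own ring's
	 * lock. If that fails, punt to io-wq, where blocking on the target
	 * lock is fine.
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		if (!mutex_trylock(&octx->uring_lock))
			return -EAGAIN;
		return 0;
	}
	mutex_lock(&octx->uring_lock);
	return 0;
}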
Jens Axboe May 29, 2024, 2:43 a.m. UTC | #8
On 5/28/24 8:04 PM, Pavel Begunkov wrote:
> On 5/28/24 18:59, Jens Axboe wrote:
>> On 5/28/24 10:23 AM, Pavel Begunkov wrote:
>>> On 5/28/24 15:23, Jens Axboe wrote:
>>>> On 5/28/24 7:32 AM, Pavel Begunkov wrote:
>>>>> On 5/24/24 23:58, Jens Axboe wrote:
>>>>>> [...]
>>>>>> +    ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);
>>>>>
>>>>> __GFP_ACCOUNT looks painful
>>>>
>>>> It always is - I did add the usual alloc cache for this after posting
>>>> this series, which makes it a no-op basically:
>>>
>>> Simple ring private cache wouldn't work so well with non
>>> uniform transfer distributions. One way messaging, userspace
>>> level batching, etc., but the main question is in the other
>>> email, i.e. maybe it's better to go with the 2 tw hop model,
>>> which returns memory back where it came from.
>>
>> The cache is local to the ring, so anyone that sends messages to that
>> ring gets to use it. So I believe it should in fact work really well. If
>> messaging is bidirectional, then caching on the target will apply in
>> both directions.
> 
> *taking a look at the patch* it gets the entry from the target's
> ring, so indeed not a problem. Taking the target lock for that,
> however, is not the best, I ranted before about inter dependencies
> b/w rings. E.g. requests messaging a ring run by a task CPU bound
> in submission / tw execution would be directed to iowq and occupy
> a worker thread for the time being.

I can try and do some stats on io-wq bouncing; it can indeed be a risk.
It might even be possible to only retain the ring lock for flushing,
which is less of an issue since it happens locally, and have the
overflow entries locked separately. For now I just kept the overflow
backend that we already have, and the locking that MSG_RING already
does.
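
Purely to illustrate that last idea (names and fields below are hypothetical,
not part of this series): message CQEs could sit on their own
spinlock-protected list, so the sender never takes the target's uring_lock,
and only the target's flush path - which already runs locally - hands them to
the existing overflow machinery.

/* Hypothetical: per-ring list of remotely posted message CQEs */
struct io_msg_overflow {
	spinlock_t		lock;
	struct list_head	list;	/* of struct io_overflow_cqe */
};

/* Sender side: no target uring_lock required */
static void io_msg_queue_remote(struct io_ring_ctx *target_ctx,
				struct io_overflow_cqe *ocqe)
{
	/* msg_overflow would be a new (hypothetical) io_ring_ctx member */
	struct io_msg_overflow *mo = &target_ctx->msg_overflow;

	spin_lock(&mo->lock);
	list_add_tail(&ocqe->list, &mo->list);
	spin_unlock(&mo->lock);

	set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &target_ctx->check_cq);
	wake_up_state(target_ctx->submitter_task, TASK_INTERRUPTIBLE);
}

/* Target side: hooked into its existing overflow flush, run locally */
static void io_msg_flush_remote(struct io_ring_ctx *ctx)
{
	struct io_msg_overflow *mo = &ctx->msg_overflow;
	LIST_HEAD(local);

	spin_lock(&mo->lock);
	list_splice_init(&mo->list, &local);
	spin_unlock(&mo->lock);

	/* splice onto the regular overflow list under the usual lock */
	spin_lock(&ctx->completion_lock);
	list_splice_tail(&local, &ctx->cq_overflow_list);
	spin_unlock(&ctx->completion_lock);
}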

Patch

diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index feff2b0822cf..3f89ff3a40ad 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -123,6 +123,69 @@  static void io_msg_tw_complete(struct callback_head *head)
 	io_req_queue_tw_complete(req, ret);
 }
 
+static struct io_overflow_cqe *io_alloc_overflow(struct io_ring_ctx *target_ctx)
+{
+	bool is_cqe32 = target_ctx->flags & IORING_SETUP_CQE32;
+	size_t cqe_size = sizeof(struct io_overflow_cqe);
+	struct io_overflow_cqe *ocqe;
+
+	if (is_cqe32)
+		cqe_size += sizeof(struct io_uring_cqe);
+
+	ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);
+	if (!ocqe)
+		return NULL;
+
+	if (is_cqe32)
+		ocqe->cqe.big_cqe[0] = ocqe->cqe.big_cqe[1] = 0;
+
+	return ocqe;
+}
+
+/*
+ * Entered with the target uring_lock held, and will drop it before
+ * returning. Adds a previously allocated ocqe to the overflow list on
+ * the target, and marks it appropriately for flushing.
+ */
+static void io_msg_add_overflow(struct io_msg *msg,
+				struct io_ring_ctx *target_ctx,
+				struct io_overflow_cqe *ocqe, int ret)
+	__releases(target_ctx->uring_lock)
+{
+	spin_lock(&target_ctx->completion_lock);
+
+	if (list_empty(&target_ctx->cq_overflow_list)) {
+		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &target_ctx->check_cq);
+		atomic_or(IORING_SQ_TASKRUN, &target_ctx->rings->sq_flags);
+	}
+
+	ocqe->cqe.user_data = msg->user_data;
+	ocqe->cqe.res = ret;
+	list_add_tail(&ocqe->list, &target_ctx->cq_overflow_list);
+	spin_unlock(&target_ctx->completion_lock);
+	mutex_unlock(&target_ctx->uring_lock);
+	wake_up_state(target_ctx->submitter_task, TASK_INTERRUPTIBLE);
+}
+
+static bool io_msg_fill_remote(struct io_msg *msg, unsigned int issue_flags,
+			       struct io_ring_ctx *target_ctx, u32 flags)
+{
+	struct io_overflow_cqe *ocqe;
+
+	ocqe = io_alloc_overflow(target_ctx);
+	if (!ocqe)
+		return false;
+
+	if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) {
+		kfree(ocqe);
+		return false;
+	}
+
+	ocqe->cqe.flags = flags;
+	io_msg_add_overflow(msg, target_ctx, ocqe, msg->len);
+	return true;
+}
+
 static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
@@ -137,12 +200,20 @@  static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 	if (target_ctx->flags & IORING_SETUP_R_DISABLED)
 		return -EBADFD;
 
-	if (io_msg_need_remote(target_ctx))
-		return io_msg_exec_remote(req, io_msg_tw_complete);
-
 	if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
 		flags = msg->cqe_flags;
 
+	if (io_msg_need_remote(target_ctx)) {
+		/*
+		 * Try adding an overflow entry to the target, and only if
+		 * that fails, resort to using more expensive task_work to
+		 * have the target_ctx owner fill the CQE.
+		 */
+		if (!io_msg_fill_remote(msg, issue_flags, target_ctx, flags))
+			return io_msg_exec_remote(req, io_msg_tw_complete);
+		return 0;
+	}
+
 	ret = -EOVERFLOW;
 	if (target_ctx->flags & IORING_SETUP_IOPOLL) {
 		if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))