
[RFC,v4,13/16] io_uring: add io_recvzc request

Message ID 20240312214430.2923019-14-dw@davidwei.uk (mailing list archive)
State RFC
Series Zero copy Rx using io_uring

Checks

netdev/tree_selection: success (Guessing tree name failed - patch did not apply, async)

Commit Message

David Wei March 12, 2024, 9:44 p.m. UTC
Add an io_uring opcode OP_RECV_ZC for doing ZC reads from a socket that
is set up for ZC Rx. The request reads skbs from a socket. Completions
are posted into the main CQ for each page frag read.

Big CQEs (CQE32) are required as the OP_RECV_ZC specific metadata (ZC
region, offset, len) is stored in the extended 16 bytes as a
struct io_uring_rbuf_cqe.
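
For illustration, a userspace consumer of these completions might look
roughly like the sketch below. The io_uring_rbuf_cqe field names
(region, off, len) match how this patch fills them in zc_rx_queue_cqe();
the local struct definition and the zc_region mapping are assumptions
about the rest of the series, not settled uapi.

#include <stdio.h>
#include <linux/io_uring.h>

/* Local copy for illustration only; the real definition lives in an
 * earlier patch of this series and may differ. */
struct io_uring_rbuf_cqe {
	__u32 region;
	__u32 off;
	__u32 len;
	__u32 __pad;
};

/* zc_region is assumed to be the mmap'd zero-copy buffer area. */
static void handle_recvzc_cqe(const struct io_uring_cqe *cqe, void *zc_region)
{
	/* with CQE32, the opcode-specific 16 bytes follow the base CQE */
	const struct io_uring_rbuf_cqe *rcqe =
			(const struct io_uring_rbuf_cqe *)(cqe + 1);
	const char *data = (const char *)zc_region + rcqe->off;

	fwrite(data, 1, rcqe->len, stdout);

	if (!(cqe->flags & IORING_CQE_F_MORE)) {
		/* multishot terminated; the request needs re-issuing */
	}
}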

For now there is no limit as to how much work each OP_RECV_ZC request
does. It will attempt to drain a socket of all available data.

Multishot requests are also supported. The first time an io_recvzc
request completes, EAGAIN is returned which arms an async poll. Then, in
subsequent runs in task work, IOU_ISSUE_SKIP_COMPLETE is returned to
continue async polling.
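
For completeness, preparing a multishot OP_RECV_ZC SQE by hand might
look like the sketch below, mirroring the checks io_recvzc_prep()
enforces in this patch (len, addr2, addr3, file_index and msg_flags
must be zero; request flags travel in sqe->ioprio). liburing has no
helper for this opcode yet, so raw SQE filling and uapi headers from
this series are assumed.

#include <string.h>
#include <linux/io_uring.h>

static void prep_recv_zc(struct io_uring_sqe *sqe, int sockfd, __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RECV_ZC;
	sqe->fd = sockfd;
	/* request flags go in ioprio; MULTISHOT keeps the request armed */
	sqe->ioprio = IORING_RECV_MULTISHOT;
	/* msg_flags, len, addr2, addr3 and file_index must stay zero */
	sqe->user_data = user_data;
}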

Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/uapi/linux/io_uring.h |   1 +
 io_uring/io_uring.h           |  10 ++
 io_uring/net.c                |  94 +++++++++++++++++-
 io_uring/opdef.c              |  16 +++
 io_uring/zc_rx.c              | 177 +++++++++++++++++++++++++++++++++-
 io_uring/zc_rx.h              |  11 +++
 6 files changed, 302 insertions(+), 7 deletions(-)

Comments

Jens Axboe March 13, 2024, 8:25 p.m. UTC | #1
On 3/12/24 3:44 PM, David Wei wrote:
> Add an io_uring opcode OP_RECV_ZC for doing ZC reads from a socket that
> is set up for ZC Rx. The request reads skbs from a socket. Completions
> are posted into the main CQ for each page frag read.
> 
> Big CQEs (CQE32) is required as the OP_RECV_ZC specific metadata (ZC
> region, offset, len) are stored in the extended 16 bytes as a
> struct io_uring_rbuf_cqe.
> 
> For now there is no limit as to how much work each OP_RECV_ZC request
> does. It will attempt to drain a socket of all available data.
> 
> Multishot requests are also supported. The first time an io_recvzc
> request completes, EAGAIN is returned which arms an async poll. Then, in
> subsequent runs in task work, IOU_ISSUE_SKIP_COMPLETE is returned to
> continue async polling.

I'd probably drop that last paragraph; this is how all multishot
requests work and is an implementation detail that need not go in the
commit message. It probably suffices just to say it supports multishot.

> @@ -695,7 +701,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>  	unsigned int cflags;
>  
>  	cflags = io_put_kbuf(req, issue_flags);
> -	if (msg->msg_inq && msg->msg_inq != -1)
> +	if (msg && msg->msg_inq && msg->msg_inq != -1)
>  		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>  
>  	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
> @@ -723,7 +729,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>  			goto enobufs;
>  
>  		/* Known not-empty or unknown state, retry */
> -		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
> +		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || (msg && msg->msg_inq == -1)) {
>  			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
>  				return false;
>  			/* mshot retries exceeded, force a requeue */

Maybe refactor this a bit so that you don't need to add these NULL
checks? That seems pretty fragile, hard to read, and should be doable
without extra checks.

> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>  	return ifq;
>  }
>  
> +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
> +
> +	/* non-iopoll defer_taskrun only */
> +	if (!req->ctx->task_complete)
> +		return -EINVAL;

What's the reasoning behind this?

> +	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
> +	struct io_zc_rx_ifq *ifq;
> +	struct socket *sock;
> +	int ret;
> +
> +	/*
> +	 * We're posting CQEs deeper in the stack, and to avoid taking CQ locks
> +	 * we serialise by having only the master thread modifying the CQ with
> +	 * DEFER_TASkRUN checked earlier and forbidding executing it from io-wq.
> +	 * That's similar to io_check_multishot() for multishot CQEs.
> +	 */
> +	if (issue_flags & IO_URING_F_IOWQ)
> +		return -EAGAIN;
> +	if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_NONBLOCK)))
> +		return -EAGAIN;

If rebased on the current tree, does this go away?
Pavel Begunkov March 13, 2024, 8:26 p.m. UTC | #2
On 3/13/24 20:25, Jens Axboe wrote:
> On 3/12/24 3:44 PM, David Wei wrote:
>> Add an io_uring opcode OP_RECV_ZC for doing ZC reads from a socket that
>> is set up for ZC Rx. The request reads skbs from a socket. Completions
>> are posted into the main CQ for each page frag read.
>>
>> Big CQEs (CQE32) is required as the OP_RECV_ZC specific metadata (ZC
>> region, offset, len) are stored in the extended 16 bytes as a
>> struct io_uring_rbuf_cqe.
>>
>> For now there is no limit as to how much work each OP_RECV_ZC request
>> does. It will attempt to drain a socket of all available data.
>>
>> Multishot requests are also supported. The first time an io_recvzc
>> request completes, EAGAIN is returned which arms an async poll. Then, in
>> subsequent runs in task work, IOU_ISSUE_SKIP_COMPLETE is returned to
>> continue async polling.
> 
> I'd probably drop that last paragraph, this is how all multishot
> requests work and is implementation detail that need not go in the
> commit message. Probably suffices just to say it supports multishot.
> 
>> @@ -695,7 +701,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>   	unsigned int cflags;
>>   
>>   	cflags = io_put_kbuf(req, issue_flags);
>> -	if (msg->msg_inq && msg->msg_inq != -1)
>> +	if (msg && msg->msg_inq && msg->msg_inq != -1)
>>   		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>>   
>>   	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
>> @@ -723,7 +729,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>   			goto enobufs;
>>   
>>   		/* Known not-empty or unknown state, retry */
>> -		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
>> +		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || (msg && msg->msg_inq == -1)) {
>>   			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
>>   				return false;
>>   			/* mshot retries exceeded, force a requeue */
> 
> Maybe refactor this a bit so that you don't need to add these NULL
> checks? That seems pretty fragile, hard to read, and should be doable
> without extra checks.

That chunk can be completely thrown away, we're not using
io_recv_finish() here anymore


>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>   	return ifq;
>>   }
>>   
>> +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>> +
>> +	/* non-iopoll defer_taskrun only */
>> +	if (!req->ctx->task_complete)
>> +		return -EINVAL;
> 
> What's the reasoning behind this?

CQ locking, see the comment a couple lines below


>> +	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>> +	struct io_zc_rx_ifq *ifq;
>> +	struct socket *sock;
>> +	int ret;
>> +
>> +	/*
>> +	 * We're posting CQEs deeper in the stack, and to avoid taking CQ locks
>> +	 * we serialise by having only the master thread modifying the CQ with
>> +	 * DEFER_TASkRUN checked earlier and forbidding executing it from io-wq.
>> +	 * That's similar to io_check_multishot() for multishot CQEs.
>> +	 */

This one ^^, though it doesn't read well, I should reword it for
next time.

>> +	if (issue_flags & IO_URING_F_IOWQ)
>> +		return -EAGAIN;
>> +	if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_NONBLOCK)))
>> +		return -EAGAIN;
> 
> If rebased on the current tree, does this go away?

The patchset is just a little behind and doesn't have that change yet
Jens Axboe March 13, 2024, 9:03 p.m. UTC | #3
On 3/13/24 2:26 PM, Pavel Begunkov wrote:
> On 3/13/24 20:25, Jens Axboe wrote:
>> On 3/12/24 3:44 PM, David Wei wrote:
>>> Add an io_uring opcode OP_RECV_ZC for doing ZC reads from a socket that
>>> is set up for ZC Rx. The request reads skbs from a socket. Completions
>>> are posted into the main CQ for each page frag read.
>>>
>>> Big CQEs (CQE32) is required as the OP_RECV_ZC specific metadata (ZC
>>> region, offset, len) are stored in the extended 16 bytes as a
>>> struct io_uring_rbuf_cqe.
>>>
>>> For now there is no limit as to how much work each OP_RECV_ZC request
>>> does. It will attempt to drain a socket of all available data.
>>>
>>> Multishot requests are also supported. The first time an io_recvzc
>>> request completes, EAGAIN is returned which arms an async poll. Then, in
>>> subsequent runs in task work, IOU_ISSUE_SKIP_COMPLETE is returned to
>>> continue async polling.
>>
>> I'd probably drop that last paragraph, this is how all multishot
>> requests work and is implementation detail that need not go in the
>> commit message. Probably suffices just to say it supports multishot.
>>
>>> @@ -695,7 +701,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>       unsigned int cflags;
>>>         cflags = io_put_kbuf(req, issue_flags);
>>> -    if (msg->msg_inq && msg->msg_inq != -1)
>>> +    if (msg && msg->msg_inq && msg->msg_inq != -1)
>>>           cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>>>         if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
>>> @@ -723,7 +729,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>               goto enobufs;
>>>             /* Known not-empty or unknown state, retry */
>>> -        if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
>>> +        if (cflags & IORING_CQE_F_SOCK_NONEMPTY || (msg && msg->msg_inq == -1)) {
>>>               if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
>>>                   return false;
>>>               /* mshot retries exceeded, force a requeue */
>>
>> Maybe refactor this a bit so that you don't need to add these NULL
>> checks? That seems pretty fragile, hard to read, and should be doable
>> without extra checks.
> 
> That chunk can be completely thrown away, we're not using
> io_recv_finish() here anymore
> 
> 
>>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>>       return ifq;
>>>   }
>>>   +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>> +{
>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>> +
>>> +    /* non-iopoll defer_taskrun only */
>>> +    if (!req->ctx->task_complete)
>>> +        return -EINVAL;
>>
>> What's the reasoning behind this?
> 
> CQ locking, see the comment a couple lines below
> 
> 
>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>> +    struct io_zc_rx_ifq *ifq;
>>> +    struct socket *sock;
>>> +    int ret;
>>> +
>>> +    /*
>>> +     * We're posting CQEs deeper in the stack, and to avoid taking CQ locks
>>> +     * we serialise by having only the master thread modifying the CQ with
>>> +     * DEFER_TASkRUN checked earlier and forbidding executing it from io-wq.
>>> +     * That's similar to io_check_multishot() for multishot CQEs.
>>> +     */
> 
> This one ^^, though it doesn't read well, I should reword it for
> next time.
> 
>>> +    if (issue_flags & IO_URING_F_IOWQ)
>>> +        return -EAGAIN;
>>> +    if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_NONBLOCK)))
>>> +        return -EAGAIN;
>>
>> If rebased on the current tree, does this go away?
> 
> It's just a little behind not to have that change
>
Jens Axboe March 14, 2024, 4:14 p.m. UTC | #4
(Apparently this went out without my comments attached, only one thing
worth noting so repeating that)

>>> @@ -695,7 +701,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>       unsigned int cflags;
>>>         cflags = io_put_kbuf(req, issue_flags);
>>> -    if (msg->msg_inq && msg->msg_inq != -1)
>>> +    if (msg && msg->msg_inq && msg->msg_inq != -1)
>>>           cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>>>         if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
>>> @@ -723,7 +729,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>               goto enobufs;
>>>             /* Known not-empty or unknown state, retry */
>>> -        if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
>>> +        if (cflags & IORING_CQE_F_SOCK_NONEMPTY || (msg && msg->msg_inq == -1)) {
>>>               if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
>>>                   return false;
>>>               /* mshot retries exceeded, force a requeue */
>>
>> Maybe refactor this a bit so that you don't need to add these NULL
>> checks? That seems pretty fragile, hard to read, and should be doable
>> without extra checks.
> 
> That chunk can be completely thrown away, we're not using
> io_recv_finish() here anymore

OK good!

>>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>>       return ifq;
>>>   }
>>>   +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>> +{
>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>> +
>>> +    /* non-iopoll defer_taskrun only */
>>> +    if (!req->ctx->task_complete)
>>> +        return -EINVAL;
>>
>> What's the reasoning behind this?
> 
> CQ locking, see the comment a couple lines below

My question here was more towards "is this something we want to do".
Maybe this is just a temporary work-around and it's nothing to discuss,
but I'm not sure we want to have opcodes only work on certain ring
setups.
Pavel Begunkov March 15, 2024, 5:34 p.m. UTC | #5
On 3/14/24 16:14, Jens Axboe wrote:
[...]
>>>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>>>        return ifq;
>>>>    }
>>>>    +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>> +{
>>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>>> +
>>>> +    /* non-iopoll defer_taskrun only */
>>>> +    if (!req->ctx->task_complete)
>>>> +        return -EINVAL;
>>>
>>> What's the reasoning behind this?
>>
>> CQ locking, see the comment a couple lines below
> 
> My question here was more towards "is this something we want to do".
> Maybe this is just a temporary work-around and it's nothing to discuss,
> but I'm not sure we want to have opcodes only work on certain ring
> setups.

I don't think it's that unreasonable restricting it. It's hard to
care about !DEFER_TASKRUN for net workloads, it makes CQE posting a bit
cleaner, and who knows where the single task part would become handy.
Thinking about ifq termination, which should better cancel and wait
for all corresponding zc requests, it should be easier without
parallel threads. E.g. what if another thread is in the enter syscall
using ifq, or running task_work and not cancellable. Then apart
from (non-atomic) refcounting, we'd need to somehow wait for it,
doing wake ups on the zc side, and so on.

The CQ side is easy to support though, put conditional locking
around the posting like fill/post_cqe does with today's
patchset.
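
As a rough illustration of that last point, the conditional locking
could be a thin wrapper around the posting helper in this patch, along
the lines of the sketch below. The wrapper itself is hypothetical and
glosses over the deferred-commit details; zc_rx_queue_cqe() and
ctx->task_complete are from the series.

/* Hypothetical sketch, meant to sit next to zc_rx_queue_cqe() in
 * io_uring/zc_rx.c: for !DEFER_TASKRUN rings take the completion lock
 * around the CQE posting, otherwise rely on the single-task
 * serialisation this patch depends on. The real thing would also need
 * a non-deferred CQE commit path. */
static bool zc_rx_post_cqe(struct io_kiocb *req, struct io_zc_rx_buf *buf,
			   struct io_zc_rx_ifq *ifq, int off, int len)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool posted;

	if (!ctx->task_complete)
		spin_lock(&ctx->completion_lock);
	posted = zc_rx_queue_cqe(req, buf, ifq, off, len);
	if (!ctx->task_complete)
		spin_unlock(&ctx->completion_lock);
	return posted;
}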
Jens Axboe March 15, 2024, 6:38 p.m. UTC | #6
On 3/15/24 11:34 AM, Pavel Begunkov wrote:
> On 3/14/24 16:14, Jens Axboe wrote:
> [...]
>>>>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>>>>        return ifq;
>>>>>    }
>>>>>    +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>> +{
>>>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>>>> +
>>>>> +    /* non-iopoll defer_taskrun only */
>>>>> +    if (!req->ctx->task_complete)
>>>>> +        return -EINVAL;
>>>>
>>>> What's the reasoning behind this?
>>>
>>> CQ locking, see the comment a couple lines below
>>
>> My question here was more towards "is this something we want to do".
>> Maybe this is just a temporary work-around and it's nothing to discuss,
>> but I'm not sure we want to have opcodes only work on certain ring
>> setups.
> 
> I don't think it's that unreasonable restricting it. It's hard to
> care about !DEFER_TASKRUN for net workloads, it makes CQE posting a bit

I think there's a distinction between "not reasonable to support because
it's complicated/impossible to do so", and "we prefer not to support
it". I agree, as a developer it's hard to care about !DEFER_TASKRUN for
networking workloads, but as a user, they will just setup a default
queue until they wise up. And maybe this can be a good thing in that
they'd be nudged toward DEFER_TASKRUN, but I can also see some head
scratching when something just returns (the worst of all error codes)
-EINVAL when they attempt to use it.

> cleaner, and who knows where the single task part would become handy.

But you can still take advantage of single task, since you know if
that's going to be true or not. It just can't be unconditional.

> Thinking about ifq termination, which should better cancel and wait
> for all corresponding zc requests, it's should be easier without
> parallel threads. E.g. what if another thread is in the enter syscall
> using ifq, or running task_work and not cancellable. Then apart
> from (non-atomic) refcounting, we'd need to somehow wait for it,
> doing wake ups on the zc side, and so on.

I don't know, not seeing a lot of strong arguments for making it
DEFER_TASKRUN only. My worry is that once we start doing that, then
more will follow. And honestly I think that would be a shame.

For ifq termination, surely these things are referenced, and termination
would need to wait for the last reference to drop? And if that isn't an
expected condition (it should not be), then a percpu ref would suffice.
Nobody cares if the teardown side is more expensive, as long as the fast
path is efficient.
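
A minimal sketch of that percpu ref idea, assuming hypothetical fields
on the ifq side (none of these names exist in the series):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/completion.h>

/* Hypothetical illustration only; the series' io_zc_rx_ifq does not
 * carry these fields today. */
struct ifq_refs {
	struct percpu_ref	refs;
	struct completion	done;
};

static void ifq_refs_release(struct percpu_ref *ref)
{
	struct ifq_refs *r = container_of(ref, struct ifq_refs, refs);

	complete(&r->done);
}

static int ifq_refs_init(struct ifq_refs *r)
{
	init_completion(&r->done);
	return percpu_ref_init(&r->refs, ifq_refs_release, 0, GFP_KERNEL);
}

/* fast path: cheap per-cpu get/put around each zc request */
static inline bool ifq_tryget(struct ifq_refs *r)
{
	return percpu_ref_tryget(&r->refs);
}

static inline void ifq_put(struct ifq_refs *r)
{
	percpu_ref_put(&r->refs);
}

/* teardown: kill switches to atomic mode, then wait for the last put */
static void ifq_wait_for_users(struct ifq_refs *r)
{
	percpu_ref_kill(&r->refs);
	wait_for_completion(&r->done);
	percpu_ref_exit(&r->refs);
}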

Dunno - anyway, for now let's just leave it as-is, it's just something
to consider once we get closer to a more finished patchset.

> The CQ side is easy to support though, put conditional locking
> around the posting like fill/post_cqe does with the todays
> patchset.

Yep, which is one of the reasons why I was hopeful this could go away!
Pavel Begunkov March 15, 2024, 11:52 p.m. UTC | #7
On 3/15/24 18:38, Jens Axboe wrote:
> On 3/15/24 11:34 AM, Pavel Begunkov wrote:
>> On 3/14/24 16:14, Jens Axboe wrote:
>> [...]
>>>>>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>>>>>         return ifq;
>>>>>>     }
>>>>>>     +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>> +{
>>>>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>>>>> +
>>>>>> +    /* non-iopoll defer_taskrun only */
>>>>>> +    if (!req->ctx->task_complete)
>>>>>> +        return -EINVAL;
>>>>>
>>>>> What's the reasoning behind this?
>>>>
>>>> CQ locking, see the comment a couple lines below
>>>
>>> My question here was more towards "is this something we want to do".
>>> Maybe this is just a temporary work-around and it's nothing to discuss,
>>> but I'm not sure we want to have opcodes only work on certain ring
>>> setups.
>>
>> I don't think it's that unreasonable restricting it. It's hard to
>> care about !DEFER_TASKRUN for net workloads, it makes CQE posting a bit
> 
> I think there's a distinction between "not reasonable to support because
> it's complicated/impossible to do so", and "we prefer not to support
> it". I agree, as a developer it's hard to care about !DEFER_TASKRUN for
> networking workloads, but as a user, they will just setup a default
> queue until they wise up. And maybe this can be a good thing in that

They'd still need to find a supported NIC and do all the other
setup; compared to that, it doesn't add much trouble. And my
usual argument is that io_uring is a low-level API: it's expected
that people interacting with it directly are experienced enough,
expect to spend some time to get it right, and are likely library
devs.

> they'd be nudged toward DEFER_TASKRUN, but I can also see some head
> scratching when something just returns (the worst of all error codes)
> -EINVAL when they attempt to use it.

Yeah, we should try to find a better error code, and the check
should migrate to ifq registration.

>> cleaner, and who knows where the single task part would become handy.
> 
> But you can still take advantage of single task, since you know if
> that's going to be true or not. It just can't be unconditional.
> 
>> Thinking about ifq termination, which should better cancel and wait
>> for all corresponding zc requests, it's should be easier without
>> parallel threads. E.g. what if another thread is in the enter syscall
>> using ifq, or running task_work and not cancellable. Then apart
>> from (non-atomic) refcounting, we'd need to somehow wait for it,
>> doing wake ups on the zc side, and so on.
> 
> I don't know, not seeing a lot of strong arguments for making it
> DEFER_TASKRUN only. My worry is that once we starting doing that, then
> more will follow. And honestly I think that would be a shame.
> 
> For ifq termination, surely these things are referenced, and termination
> would need to wait for the last reference to drop? And if that isn't an
> expected condition (it should not be), then a percpu ref would suffice.
> Nobody cares if the teardown side is more expensive, as long as the fast
> path is efficient.

You can solve any of that, it's true; the question is how much crap
you'd need to add in hot paths and diffstat-wise. Just take a look
at what a nice function io_recvmsg() is together with its helpers
like io_recvmsg_multishot().

The biggest concern is optimisations and quirks that we can't
predict at the moment. DEFER_TASKRUN/SINGLE_ISSUER provide a simpler
model; I'd rather keep recvzc simple than have tens of conditional
optimisations with different execution flavours and contexts.
Especially since it can be implemented later; it wouldn't work the
other way around.

> Dunno - anyway, for now let's just leave it as-is, it's just something
> to consider once we get closer to a more finished patchset.
> 
>> The CQ side is easy to support though, put conditional locking
>> around the posting like fill/post_cqe does with the todays
>> patchset.
> 
> Yep, which is one of the reasons why I was hopeful this could go away!
>
Jens Axboe March 16, 2024, 4:59 p.m. UTC | #8
On 3/15/24 5:52 PM, Pavel Begunkov wrote:
> On 3/15/24 18:38, Jens Axboe wrote:
>> On 3/15/24 11:34 AM, Pavel Begunkov wrote:
>>> On 3/14/24 16:14, Jens Axboe wrote:
>>> [...]
>>>>>>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>>>>>>         return ifq;
>>>>>>>     }
>>>>>>>     +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>> +{
>>>>>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>>>>>> +
>>>>>>> +    /* non-iopoll defer_taskrun only */
>>>>>>> +    if (!req->ctx->task_complete)
>>>>>>> +        return -EINVAL;
>>>>>>
>>>>>> What's the reasoning behind this?
>>>>>
>>>>> CQ locking, see the comment a couple lines below
>>>>
>>>> My question here was more towards "is this something we want to do".
>>>> Maybe this is just a temporary work-around and it's nothing to discuss,
>>>> but I'm not sure we want to have opcodes only work on certain ring
>>>> setups.
>>>
>>> I don't think it's that unreasonable restricting it. It's hard to
>>> care about !DEFER_TASKRUN for net workloads, it makes CQE posting a bit
>>
>> I think there's a distinction between "not reasonable to support because
>> it's complicated/impossible to do so", and "we prefer not to support
>> it". I agree, as a developer it's hard to care about !DEFER_TASKRUN for
>> networking workloads, but as a user, they will just setup a default
>> queue until they wise up. And maybe this can be a good thing in that
> 
> They'd still need to find a supported NIC and do all the other
> setup, comparably to that it doesn't add much trouble. And my

Hopefully down the line, it'll work on more NICs, and configuration will
be less of a nightmare than it is now.

> usual argument is that io_uring is a low-level api, it's expected
> that people interacting with it directly are experienced enough,
> expect to spend some time to make it right and likely library
> devs.

Have you seen some of the code that has gone into libraries for
io_uring support? I have, and I don't think that statement is true at
all for that side.

It should work out of the box even with a naive approach, while the best
approach may require some knowledge. At least I think that's the sanest
stance on that.

>> they'd be nudged toward DEFER_TASKRUN, but I can also see some head
>> scratching when something just returns (the worst of all error codes)
>> -EINVAL when they attempt to use it.
> 
> Yeah, we should try to find a better error code, and the check
> should migrate to ifq registration.

Wasn't really a jab at the code in question, just more that -EINVAL is
the ubiquitous error code for all kinds of things and it's hard to
diagnose in general for a user. You just have to start guessing...

>>> cleaner, and who knows where the single task part would become handy.
>>
>> But you can still take advantage of single task, since you know if
>> that's going to be true or not. It just can't be unconditional.
>>
>>> Thinking about ifq termination, which should better cancel and wait
>>> for all corresponding zc requests, it's should be easier without
>>> parallel threads. E.g. what if another thread is in the enter syscall
>>> using ifq, or running task_work and not cancellable. Then apart
>>> from (non-atomic) refcounting, we'd need to somehow wait for it,
>>> doing wake ups on the zc side, and so on.
>>
>> I don't know, not seeing a lot of strong arguments for making it
>> DEFER_TASKRUN only. My worry is that once we starting doing that, then
>> more will follow. And honestly I think that would be a shame.
>>
>> For ifq termination, surely these things are referenced, and termination
>> would need to wait for the last reference to drop? And if that isn't an
>> expected condition (it should not be), then a percpu ref would suffice.
>> Nobody cares if the teardown side is more expensive, as long as the fast
>> path is efficient.
> 
> You can solve any of that, it's true, the question how much crap
> you'd need to add in hot paths and diffstat wise. Just take a look
> at what a nice function io_recvmsg() is together with its helpers
> like io_recvmsg_multishot().

That is true, and I guess my real question is "what would it look like
if we supported !DEFER_TASKRUN". Which I think is a valid question.

> The biggest concern is optimisations and quirks that we can't
> predict at the moment. DEFER_TASKRUN/SINGLE_ISSUER provide a simpler
> model, I'd rather keep recvzc simple than having tens of conditional
> optimisations with different execution flavours and contexts.
> Especially, since it can be implemented later, wouldn't work the
> other way around.

Yes me too, and I'd hate to have two variants just because of that. But
comparing to eg io_recv() and helpers, it's really not that bad. Hence
my question on how much would it take, and how nasty would it be, to
support !DEFER_TASKRUN.
Pavel Begunkov March 17, 2024, 9:22 p.m. UTC | #9
On 3/16/24 16:59, Jens Axboe wrote:
> On 3/15/24 5:52 PM, Pavel Begunkov wrote:
>> On 3/15/24 18:38, Jens Axboe wrote:
>>> On 3/15/24 11:34 AM, Pavel Begunkov wrote:
>>>> On 3/14/24 16:14, Jens Axboe wrote:
>>>> [...]
>>>>>>>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>>>>>>>          return ifq;
>>>>>>>>      }
>>>>>>>>      +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>>> +{
>>>>>>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>>>>>>> +
>>>>>>>> +    /* non-iopoll defer_taskrun only */
>>>>>>>> +    if (!req->ctx->task_complete)
>>>>>>>> +        return -EINVAL;
>>>>>>>
>>>>>>> What's the reasoning behind this?
>>>>>>
>>>>>> CQ locking, see the comment a couple lines below
>>>>>
>>>>> My question here was more towards "is this something we want to do".
>>>>> Maybe this is just a temporary work-around and it's nothing to discuss,
>>>>> but I'm not sure we want to have opcodes only work on certain ring
>>>>> setups.
>>>>
>>>> I don't think it's that unreasonable restricting it. It's hard to
>>>> care about !DEFER_TASKRUN for net workloads, it makes CQE posting a bit
>>>
>>> I think there's a distinction between "not reasonable to support because
>>> it's complicated/impossible to do so", and "we prefer not to support
>>> it". I agree, as a developer it's hard to care about !DEFER_TASKRUN for
>>> networking workloads, but as a user, they will just setup a default
>>> queue until they wise up. And maybe this can be a good thing in that
>>
>> They'd still need to find a supported NIC and do all the other
>> setup, comparably to that it doesn't add much trouble. And my
> 
> Hopefully down the line, it'll work on more NICs,

I wouldn't hope all necessary features will be seen in consumer
cards

> and configuration will be less of a nightmare than it is now.

I'm already assuming steering will be taken care of by the kernel,
but you have to choose your nic, allocate an ifq, mmap a ring,
and then you're getting scattered chunks instead of

recv((void *)one_large_buffer);

My point is that it requires more involvement from the user by design.
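
To make the contrast concrete, instead of one recv() into a single
large buffer, the application ends up building a gather list out of
the per-CQE chunks, roughly like the sketch below. The chunk
descriptor is a made-up stand-in for the (region, off, len) triple in
the CQE32, and zc_base is the assumed mmap'd buffer area.

#include <stddef.h>
#include <sys/uio.h>

/* Made-up chunk descriptor standing in for the (off, len) pair pulled
 * out of each RECV_ZC completion; nothing here is uapi. */
struct zc_chunk {
	unsigned int off;
	unsigned int len;
};

/* Build an iovec gather list over the scattered chunks so a parser
 * that accepts iovecs can consume the data in place. */
static size_t chunks_to_iov(void *zc_base, const struct zc_chunk *chunks,
			    size_t nr_chunks, struct iovec *iov, size_t max_iov)
{
	size_t i, n = nr_chunks < max_iov ? nr_chunks : max_iov;

	for (i = 0; i < n; i++) {
		iov[i].iov_base = (char *)zc_base + chunks[i].off;
		iov[i].iov_len = chunks[i].len;
	}
	return n;
}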
  
>> usual argument is that io_uring is a low-level api, it's expected
>> that people interacting with it directly are experienced enough,
>> expect to spend some time to make it right and likely library
>> devs.
> 
> Have you seen some of the code that has gone in to libraries for
> io_uring support? I have, and I don't think that statement is true at
> all for that side.

Well, some implementations are crappy, some are ok, some are
learning and improving what they have.

> 
> It should work out of the box even with a naive approach, while the best
> approach may require some knowledge. At least I think that's the sanest
> stance on that.
> 
>>> they'd be nudged toward DEFER_TASKRUN, but I can also see some head
>>> scratching when something just returns (the worst of all error codes)
>>> -EINVAL when they attempt to use it.
>>
>> Yeah, we should try to find a better error code, and the check
>> should migrate to ifq registration.
> 
> Wasn't really a jab at the code in question, just more that -EINVAL is
> the ubiqitious error code for all kinds of things and it's hard to
> diagnose in general for a user. You just have to start guessing...
> 
>>>> cleaner, and who knows where the single task part would become handy.
>>>
>>> But you can still take advantage of single task, since you know if
>>> that's going to be true or not. It just can't be unconditional.
>>>
>>>> Thinking about ifq termination, which should better cancel and wait
>>>> for all corresponding zc requests, it's should be easier without
>>>> parallel threads. E.g. what if another thread is in the enter syscall
>>>> using ifq, or running task_work and not cancellable. Then apart
>>>> from (non-atomic) refcounting, we'd need to somehow wait for it,
>>>> doing wake ups on the zc side, and so on.
>>>
>>> I don't know, not seeing a lot of strong arguments for making it
>>> DEFER_TASKRUN only. My worry is that once we starting doing that, then
>>> more will follow. And honestly I think that would be a shame.
>>>
>>> For ifq termination, surely these things are referenced, and termination
>>> would need to wait for the last reference to drop? And if that isn't an
>>> expected condition (it should not be), then a percpu ref would suffice.
>>> Nobody cares if the teardown side is more expensive, as long as the fast
>>> path is efficient.
>>
>> You can solve any of that, it's true, the question how much crap
>> you'd need to add in hot paths and diffstat wise. Just take a look
>> at what a nice function io_recvmsg() is together with its helpers
>> like io_recvmsg_multishot().
> 
> That is true, and I guess my real question is "what would it look like
> if we supported !DEFER_TASKRUN". Which I think is a valid question.
> 
>> The biggest concern is optimisations and quirks that we can't
>> predict at the moment. DEFER_TASKRUN/SINGLE_ISSUER provide a simpler
>> model, I'd rather keep recvzc simple than having tens of conditional
>> optimisations with different execution flavours and contexts.
>> Especially, since it can be implemented later, wouldn't work the
>> other way around.
> 
> Yes me too, and I'd hate to have two variants just because of that. But
> comparing to eg io_recv() and helpers, it's really not that bad. Hence
> my question on how much would it take, and how nasty would it be, to
> support !DEFER_TASKRUN.

It might look bearable... at first, but when does it stop there?
There will definitely be fixes and optimisations, whereas in my
mind it's something that is not even needed. I guess I'm too
traumatised by the amount of uapi-binding features I wish I
could axe out and never see again.
Jens Axboe March 17, 2024, 9:30 p.m. UTC | #10
On 3/17/24 3:22 PM, Pavel Begunkov wrote:
> On 3/16/24 16:59, Jens Axboe wrote:
>> On 3/15/24 5:52 PM, Pavel Begunkov wrote:
>>> On 3/15/24 18:38, Jens Axboe wrote:
>>>> On 3/15/24 11:34 AM, Pavel Begunkov wrote:
>>>>> On 3/14/24 16:14, Jens Axboe wrote:
>>>>> [...]
>>>>>>>>> @@ -1053,6 +1058,85 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
>>>>>>>>>          return ifq;
>>>>>>>>>      }
>>>>>>>>>      +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>>>> +{
>>>>>>>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>>>>>>>> +
>>>>>>>>> +    /* non-iopoll defer_taskrun only */
>>>>>>>>> +    if (!req->ctx->task_complete)
>>>>>>>>> +        return -EINVAL;
>>>>>>>>
>>>>>>>> What's the reasoning behind this?
>>>>>>>
>>>>>>> CQ locking, see the comment a couple lines below
>>>>>>
>>>>>> My question here was more towards "is this something we want to do".
>>>>>> Maybe this is just a temporary work-around and it's nothing to discuss,
>>>>>> but I'm not sure we want to have opcodes only work on certain ring
>>>>>> setups.
>>>>>
>>>>> I don't think it's that unreasonable restricting it. It's hard to
>>>>> care about !DEFER_TASKRUN for net workloads, it makes CQE posting a bit
>>>>
>>>> I think there's a distinction between "not reasonable to support because
>>>> it's complicated/impossible to do so", and "we prefer not to support
>>>> it". I agree, as a developer it's hard to care about !DEFER_TASKRUN for
>>>> networking workloads, but as a user, they will just setup a default
>>>> queue until they wise up. And maybe this can be a good thing in that
>>>
>>> They'd still need to find a supported NIC and do all the other
>>> setup, comparably to that it doesn't add much trouble. And my
>>
>> Hopefully down the line, it'll work on more NICs,
> 
> I wouldn't hope all necessary features will be seen in consumer
> cards

Nah that would never be the case, but normal users aren't going to be
interested in zerocopy rx. If they are, then it's power users, and they
can pick an appropriate NIC rather than just rely on what's in their
laptop or desktop PC. But hopefully on the server production front,
there will be more NICs that support it. It's all driven by demand. If
it's a useful feature, then customers will ask for it.

>> and configuration will be less of a nightmare than it is now.
> 
> I'm already assuming steering will be taken care by the kernel,
> but you have to choose your nic, allocate an ifq, mmap a ring,
> and then you're getting scattered chunks instead of
> 
> recv((void *)one_large_buffer);
> 
> My point is that it requires more involvement from user by design.

For sure, it's more complicated than non-zerocopy, that's unavoidable.

>>> usual argument is that io_uring is a low-level api, it's expected
>>> that people interacting with it directly are experienced enough,
>>> expect to spend some time to make it right and likely library
>>> devs.
>>
>> Have you seen some of the code that has gone in to libraries for
>> io_uring support? I have, and I don't think that statement is true at
>> all for that side.
> 
> Well, some implementations are crappy, some are ok, some are
> learning and improving what they have.

Right, it wasn't a jab at them in general, it's natural to start
somewhere simple and then improve things as they go along. I don't
expect folks to have the level of knowledge of the internals that we do,
nor should they need to.

>> It should work out of the box even with a naive approach, while the best
>> approach may require some knowledge. At least I think that's the sanest
>> stance on that.
>>
>>>> they'd be nudged toward DEFER_TASKRUN, but I can also see some head
>>>> scratching when something just returns (the worst of all error codes)
>>>> -EINVAL when they attempt to use it.
>>>
>>> Yeah, we should try to find a better error code, and the check
>>> should migrate to ifq registration.
>>
>> Wasn't really a jab at the code in question, just more that -EINVAL is
>> the ubiqitious error code for all kinds of things and it's hard to
>> diagnose in general for a user. You just have to start guessing...
>>
>>>>> cleaner, and who knows where the single task part would become handy.
>>>>
>>>> But you can still take advantage of single task, since you know if
>>>> that's going to be true or not. It just can't be unconditional.
>>>>
>>>>> Thinking about ifq termination, which should better cancel and wait
>>>>> for all corresponding zc requests, it's should be easier without
>>>>> parallel threads. E.g. what if another thread is in the enter syscall
>>>>> using ifq, or running task_work and not cancellable. Then apart
>>>>> from (non-atomic) refcounting, we'd need to somehow wait for it,
>>>>> doing wake ups on the zc side, and so on.
>>>>
>>>> I don't know, not seeing a lot of strong arguments for making it
>>>> DEFER_TASKRUN only. My worry is that once we starting doing that, then
>>>> more will follow. And honestly I think that would be a shame.
>>>>
>>>> For ifq termination, surely these things are referenced, and termination
>>>> would need to wait for the last reference to drop? And if that isn't an
>>>> expected condition (it should not be), then a percpu ref would suffice.
>>>> Nobody cares if the teardown side is more expensive, as long as the fast
>>>> path is efficient.
>>>
>>> You can solve any of that, it's true, the question how much crap
>>> you'd need to add in hot paths and diffstat wise. Just take a look
>>> at what a nice function io_recvmsg() is together with its helpers
>>> like io_recvmsg_multishot().
>>
>> That is true, and I guess my real question is "what would it look like
>> if we supported !DEFER_TASKRUN". Which I think is a valid question.
>>
>>> The biggest concern is optimisations and quirks that we can't
>>> predict at the moment. DEFER_TASKRUN/SINGLE_ISSUER provide a simpler
>>> model, I'd rather keep recvzc simple than having tens of conditional
>>> optimisations with different execution flavours and contexts.
>>> Especially, since it can be implemented later, wouldn't work the
>>> other way around.
>>
>> Yes me too, and I'd hate to have two variants just because of that. But
>> comparing to eg io_recv() and helpers, it's really not that bad. Hence
>> my question on how much would it take, and how nasty would it be, to
>> support !DEFER_TASKRUN.
> 
> It might look bearable... at first, but when it stops on that?
> There will definitely be fixes and optimisations, whenever in my
> mind it's something that is not even needed. I guess I'm too
> traumatised by the amount of uapi binding features I wish I
> could axe out and never see again.

But that's the real world though, particularly for the kernel. We'd all love
to restart things from scratch, and sometimes that'd lead to something
better, which then down the line you'd inevitably love to redo again.

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 26e945e6258d..ad2ec60b0390 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -256,6 +256,7 @@  enum io_uring_op {
 	IORING_OP_FUTEX_WAITV,
 	IORING_OP_FIXED_FD_INSTALL,
 	IORING_OP_FTRUNCATE,
+	IORING_OP_RECV_ZC,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 6426ee382276..cd1b3da96f62 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -180,6 +180,16 @@  static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret
 	return io_get_cqe_overflow(ctx, ret, false);
 }
 
+static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
+					       struct io_uring_cqe **cqe_ret)
+{
+	io_lockdep_assert_cq_locked(ctx);
+
+	ctx->cq_extra++;
+	ctx->submit_state.flush_cqes = true;
+	return io_get_cqe(ctx, cqe_ret);
+}
+
 static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 					    struct io_kiocb *req)
 {
diff --git a/io_uring/net.c b/io_uring/net.c
index 1fa7c1fa6b5d..56172335387e 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -79,6 +79,12 @@  struct io_sr_msg {
  */
 #define MULTISHOT_MAX_RETRY	32
 
+struct io_recvzc {
+	struct file			*file;
+	unsigned			msg_flags;
+	u16				flags;
+};
+
 static inline bool io_check_multishot(struct io_kiocb *req,
 				      unsigned int issue_flags)
 {
@@ -695,7 +701,7 @@  static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 	unsigned int cflags;
 
 	cflags = io_put_kbuf(req, issue_flags);
-	if (msg->msg_inq && msg->msg_inq != -1)
+	if (msg && msg->msg_inq && msg->msg_inq != -1)
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
 
 	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
@@ -723,7 +729,7 @@  static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 			goto enobufs;
 
 		/* Known not-empty or unknown state, retry */
-		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
+		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || (msg && msg->msg_inq == -1)) {
 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
 				return false;
 			/* mshot retries exceeded, force a requeue */
@@ -1034,9 +1040,8 @@  int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	return ret;
 }
 
-static __maybe_unused
-struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
-					struct socket *sock)
+static struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
+					      struct socket *sock)
 {
 	unsigned token = READ_ONCE(sock->zc_rx_idx);
 	unsigned ifq_idx = token >> IO_ZC_IFQ_IDX_OFFSET;
@@ -1053,6 +1058,85 @@  struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
 	return ifq;
 }
 
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+
+	/* non-iopoll defer_taskrun only */
+	if (!req->ctx->task_complete)
+		return -EINVAL;
+	if (unlikely(sqe->file_index || sqe->addr2))
+		return -EINVAL;
+	if (READ_ONCE(sqe->len) || READ_ONCE(sqe->addr3))
+		return -EINVAL;
+
+	zc->flags = READ_ONCE(sqe->ioprio);
+	zc->msg_flags = READ_ONCE(sqe->msg_flags);
+
+	if (zc->msg_flags)
+		return -EINVAL;
+	if (zc->flags & ~RECVMSG_FLAGS)
+		return -EINVAL;
+	if (zc->flags & IORING_RECV_MULTISHOT)
+		req->flags |= REQ_F_APOLL_MULTISHOT;
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		zc->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+	return 0;
+}
+
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	struct io_zc_rx_ifq *ifq;
+	struct socket *sock;
+	int ret;
+
+	/*
+	 * We're posting CQEs deeper in the stack, and to avoid taking CQ locks
+	 * we serialise by having only the master thread modifying the CQ with
+	 * DEFER_TASkRUN checked earlier and forbidding executing it from io-wq.
+	 * That's similar to io_check_multishot() for multishot CQEs.
+	 */
+	if (issue_flags & IO_URING_F_IOWQ)
+		return -EAGAIN;
+	if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_NONBLOCK)))
+		return -EAGAIN;
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
+		return -EAGAIN;
+
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+	ifq = io_zc_verify_sock(req, sock);
+	if (!ifq)
+		return -EINVAL;
+
+	ret = io_zc_rx_recv(req, ifq, sock, zc->msg_flags | MSG_DONTWAIT);
+	if (unlikely(ret <= 0)) {
+		if (ret == -EAGAIN) {
+			if (issue_flags & IO_URING_F_MULTISHOT)
+				return IOU_ISSUE_SKIP_COMPLETE;
+			return -EAGAIN;
+		}
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+
+		req_set_fail(req);
+		io_req_set_res(req, ret, 0);
+
+		if (issue_flags & IO_URING_F_MULTISHOT)
+			return IOU_STOP_MULTISHOT;
+		return IOU_OK;
+	}
+
+	if (issue_flags & IO_URING_F_MULTISHOT)
+		return IOU_ISSUE_SKIP_COMPLETE;
+	return -EAGAIN;
+}
+
 void io_send_zc_cleanup(struct io_kiocb *req)
 {
 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9c080aadc5a6..78ec5197917e 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -36,6 +36,7 @@ 
 #include "waitid.h"
 #include "futex.h"
 #include "truncate.h"
+#include "zc_rx.h"
 
 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
 {
@@ -481,6 +482,18 @@  const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_ftruncate_prep,
 		.issue			= io_ftruncate,
 	},
+	[IORING_OP_RECV_ZC] = {
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollin			= 1,
+		.ioprio			= 1,
+#if defined(CONFIG_NET)
+		.prep			= io_recvzc_prep,
+		.issue			= io_recvzc,
+#else
+		.prep			= io_eopnotsupp_prep,
+#endif
+	},
 };
 
 const struct io_cold_def io_cold_defs[] = {
@@ -722,6 +735,9 @@  const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_FTRUNCATE] = {
 		.name			= "FTRUNCATE",
 	},
+	[IORING_OP_RECV_ZC] = {
+		.name			= "RECV_ZC",
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 4bd27eda4bc9..bb9251111735 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -6,10 +6,12 @@ 
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
 #include <linux/nospec.h>
+
+#include <net/page_pool/helpers.h>
 #include <net/tcp.h>
 #include <net/af_unix.h>
+
 #include <trace/events/page_pool.h>
-#include <net/page_pool/helpers.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -18,6 +20,12 @@ 
 #include "zc_rx.h"
 #include "rsrc.h"
 
+struct io_zc_rx_args {
+	struct io_kiocb		*req;
+	struct io_zc_rx_ifq	*ifq;
+	struct socket		*sock;
+};
+
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 
 static int __io_queue_mgmt(struct net_device *dev, struct io_zc_rx_ifq *ifq,
@@ -371,7 +379,7 @@  static inline unsigned io_buf_pgid(struct io_zc_rx_pool *pool,
 	return buf - pool->bufs;
 }
 
-static __maybe_unused void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
+static void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
 {
 	atomic_long_add(IO_ZC_RX_UREF, &buf->niov.pp_ref_count);
 }
@@ -640,5 +648,170 @@  const struct memory_provider_ops io_uring_pp_zc_ops = {
 };
 EXPORT_SYMBOL(io_uring_pp_zc_ops);
 
+static bool zc_rx_queue_cqe(struct io_kiocb *req, struct io_zc_rx_buf *buf,
+			   struct io_zc_rx_ifq *ifq, int off, int len)
+{
+	struct io_uring_rbuf_cqe *rcqe;
+	struct io_uring_cqe *cqe;
+
+	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
+		return false;
+
+	cqe->user_data = req->cqe.user_data;
+	cqe->res = 0;
+	cqe->flags = IORING_CQE_F_MORE;
+
+	rcqe = (struct io_uring_rbuf_cqe *)(cqe + 1);
+	rcqe->region = 0;
+	rcqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
+	rcqe->len = len;
+	memset(rcqe->__pad, 0, sizeof(rcqe->__pad));
+	return true;
+}
+
+static int zc_rx_recv_frag(struct io_kiocb *req, struct io_zc_rx_ifq *ifq,
+			   const skb_frag_t *frag, int off, int len)
+{
+	off += skb_frag_off(frag);
+
+	if (likely(skb_frag_is_net_iov(frag))) {
+		struct io_zc_rx_buf *buf;
+		struct net_iov *niov;
+
+		niov = netmem_to_net_iov(frag->netmem);
+		if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
+		    niov->pp->mp_priv != ifq)
+			return -EFAULT;
+
+		buf = io_niov_to_buf(niov);
+		if (!zc_rx_queue_cqe(req, buf, ifq, off, len))
+			return -ENOSPC;
+		io_zc_rx_get_buf_uref(buf);
+	} else {
+		return -EOPNOTSUPP;
+	}
+
+	return len;
+}
+
+static int
+zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+	       unsigned int offset, size_t len)
+{
+	struct io_zc_rx_args *args = desc->arg.data;
+	struct io_zc_rx_ifq *ifq = args->ifq;
+	struct io_kiocb *req = args->req;
+	struct sk_buff *frag_iter;
+	unsigned start, start_off;
+	int i, copy, end, off;
+	int ret = 0;
+
+	start = skb_headlen(skb);
+	start_off = offset;
+
+	if (offset < start)
+		return -EOPNOTSUPP;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag;
+
+		if (WARN_ON(start > offset + len))
+			return -EFAULT;
+
+		frag = &skb_shinfo(skb)->frags[i];
+		end = start + skb_frag_size(frag);
+
+		if (offset < end) {
+			copy = end - offset;
+			if (copy > len)
+				copy = len;
+
+			off = offset - start;
+			ret = zc_rx_recv_frag(req, ifq, frag, off, copy);
+			if (ret < 0)
+				goto out;
+
+			offset += ret;
+			len -= ret;
+			if (len == 0 || ret != copy)
+				goto out;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		if (WARN_ON(start > offset + len))
+			return -EFAULT;
+
+		end = start + frag_iter->len;
+		if (offset < end) {
+			copy = end - offset;
+			if (copy > len)
+				copy = len;
+
+			off = offset - start;
+			ret = zc_rx_recv_skb(desc, frag_iter, off, copy);
+			if (ret < 0)
+				goto out;
+
+			offset += ret;
+			len -= ret;
+			if (len == 0 || ret != copy)
+				goto out;
+		}
+		start = end;
+	}
+
+out:
+	if (offset == start_off)
+		return ret;
+	return offset - start_off;
+}
+
+static int io_zc_rx_tcp_recvmsg(struct io_kiocb *req, struct io_zc_rx_ifq *ifq,
+				struct sock *sk, int flags)
+{
+	struct io_zc_rx_args args = {
+		.req = req,
+		.ifq = ifq,
+		.sock = sk->sk_socket,
+	};
+	read_descriptor_t rd_desc = {
+		.count = 1,
+		.arg.data = &args,
+	};
+	int ret;
+
+	lock_sock(sk);
+	ret = tcp_read_sock(sk, &rd_desc, zc_rx_recv_skb);
+	if (ret <= 0) {
+		if (ret < 0 || sock_flag(sk, SOCK_DONE))
+			goto out;
+		if (sk->sk_err)
+			ret = sock_error(sk);
+		else if (sk->sk_shutdown & RCV_SHUTDOWN)
+			goto out;
+		else if (sk->sk_state == TCP_CLOSE)
+			ret = -ENOTCONN;
+		else
+			ret = -EAGAIN;
+	}
+out:
+	release_sock(sk);
+	return ret;
+}
+
+int io_zc_rx_recv(struct io_kiocb *req, struct io_zc_rx_ifq *ifq,
+		  struct socket *sock, unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+	if (prot->recvmsg != tcp_recvmsg)
+		return -EPROTONOSUPPORT;
+
+	sock_rps_record_flow(sk);
+	return io_zc_rx_tcp_recvmsg(req, ifq, sk, flags);
+}
 
 #endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index c02bf8cabc6c..c14ea3cf544a 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -50,6 +50,8 @@  void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx);
 void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx);
 int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
 			   struct io_uring_zc_rx_sock_reg __user *arg);
+int io_zc_rx_recv(struct io_kiocb *req, struct io_zc_rx_ifq *ifq,
+		  struct socket *sock, unsigned int flags);
 #else
 static inline int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zc_rx_ifq_reg __user *arg)
@@ -67,6 +69,15 @@  static inline int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline int io_zc_rx_recv(struct io_kiocb *req, struct io_zc_rx_ifq *ifq,
+				struct socket *sock, unsigned int flags)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+
 #endif