[RFC,v3,15/20] io_uring: add io_recvzc request

Message ID 20231219210357.4029713-16-dw@davidwei.uk (mailing list archive)
State RFC
Series Zero copy Rx using io_uring

Checks

Context: netdev/tree_selection
Check: success
Description: Guessing tree name failed - patch did not apply, async

Commit Message

David Wei Dec. 19, 2023, 9:03 p.m. UTC
From: David Wei <davidhwei@meta.com>

This patch adds an io_uring opcode IORING_OP_RECV_ZC for doing ZC reads
from a socket that is set up for ZC Rx. The request reads skbs from the
socket; their page frags are tagged with a magic cookie in their page
private field. For each frag, an entry is written into the ifq rbuf
completion ring, and the total number of bytes read is returned to the
user as an io_uring completion event.

Multishot requests work. There is no need to specify provided buffers,
as data is returned via the ifq rbuf completion ring.

Userspace is expected to look into the ifq rbuf completion ring when it
receives an io_uring completion event.
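
For illustration only (not part of this patch), a consumer loop could
look roughly like the below; the rbuf CQ layout comes from earlier
patches in this series, and ring (the mapped struct io_rbuf_ring), cqes,
cq_entries, buf_region and process_data() are hypothetical userspace
names:

	unsigned head = ring->cq.head;
	unsigned tail = __atomic_load_n(&ring->cq.tail, __ATOMIC_ACQUIRE);

	while (head != tail) {
		struct io_uring_rbuf_cqe *cqe = &cqes[head++ & (cq_entries - 1)];

		/* cqe->off is a byte offset into the registered buffer region */
		process_data(buf_region + cqe->off, cqe->len, cqe->sock);
	}
	__atomic_store_n(&ring->cq.head, head, __ATOMIC_RELEASE);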

The addr3 field is used to encode params in the following format:

  addr3 = (readlen << 32);

readlen is the max amount of data to read from the socket. The lower bits
are reserved for the interface queue id (ifq_id); currently only 0 is
supported, and a non-zero value is rejected with -EINVAL.
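
As a rough submission-side illustration (not part of this patch):
assuming liburing, a socket already registered for ZC Rx, and a ring
created with IORING_SETUP_DEFER_TASKRUN, queueing a multishot zerocopy
receive could look like the below, where sockfd and readlen stand in
for the application's values (error handling omitted):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RECV_ZC;
	sqe->fd = sockfd;
	sqe->ioprio = IORING_RECV_MULTISHOT;	/* optional */
	/* readlen in the upper 32 bits, ifq id (must be 0) in the lower bits */
	sqe->addr3 = (__u64)readlen << 32;
	io_uring_submit(&ring);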

Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/uapi/linux/io_uring.h |   1 +
 io_uring/net.c                | 119 ++++++++++++++++-
 io_uring/opdef.c              |  16 +++
 io_uring/zc_rx.c              | 240 +++++++++++++++++++++++++++++++++-
 io_uring/zc_rx.h              |   5 +
 5 files changed, 375 insertions(+), 6 deletions(-)

Comments

Jens Axboe Dec. 20, 2023, 4:27 p.m. UTC | #1
On 12/19/23 2:03 PM, David Wei wrote:
> diff --git a/io_uring/net.c b/io_uring/net.c
> index 454ba301ae6b..7a2aadf6962c 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -637,7 +647,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>  	unsigned int cflags;
>  
>  	cflags = io_put_kbuf(req, issue_flags);
> -	if (msg->msg_inq && msg->msg_inq != -1)
> +	if (msg && msg->msg_inq && msg->msg_inq != -1)
>  		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>  
>  	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
> @@ -652,7 +662,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>  			io_recv_prep_retry(req);
>  			/* Known not-empty or unknown state, retry */
>  			if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
> -			    msg->msg_inq == -1)
> +			    (msg && msg->msg_inq == -1))
>  				return false;
>  			if (issue_flags & IO_URING_F_MULTISHOT)
>  				*ret = IOU_ISSUE_SKIP_COMPLETE;

These are a bit ugly, just pass in a dummy msg for this?

> +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
> +{
> +	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
> +	struct socket *sock;
> +	unsigned flags;
> +	int ret, min_ret = 0;
> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
> +	struct io_zc_rx_ifq *ifq;

Eg
	struct msghdr dummy_msg;

	dummy_msg.msg_inq = -1;

which will eat some stack, but probably not really an issue.
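
Spelled out against io_recvzc() in this patch (just a sketch of the
idea, untested), the tail end would become:

	struct msghdr dummy_msg;

	dummy_msg.msg_inq = -1;
	...
	if (!io_recv_finish(req, &ret, &dummy_msg, ret <= 0, issue_flags))
		goto retry_multishot;

and the msg NULL checks added to io_recv_finish() above could then be
dropped again.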


> +	if (issue_flags & IO_URING_F_UNLOCKED)
> +		return -EAGAIN;

This seems odd, why? If we're called with IO_URING_F_UNLOCKED set, then
it's from io-wq. And returning -EAGAIN there will not do anything to
change that. Usually this check is done to lock if we don't have it
already, eg with io_ring_submit_unlock(). Maybe I'm missing something
here!

> @@ -590,5 +603,230 @@ const struct pp_memory_provider_ops io_uring_pp_zc_ops = {
>  };
>  EXPORT_SYMBOL(io_uring_pp_zc_ops);
>  
> +static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *ifq)
> +{
> +	struct io_uring_rbuf_cqe *cqe;
> +	unsigned int cq_idx, queued, free, entries;
> +	unsigned int mask = ifq->cq_entries - 1;
> +
> +	cq_idx = ifq->cached_cq_tail & mask;
> +	smp_rmb();
> +	queued = min(io_zc_rx_cqring_entries(ifq), ifq->cq_entries);
> +	free = ifq->cq_entries - queued;
> +	entries = min(free, ifq->cq_entries - cq_idx);
> +	if (!entries)
> +		return NULL;
> +
> +	cqe = &ifq->cqes[cq_idx];
> +	ifq->cached_cq_tail++;
> +	return cqe;
> +}

smp_rmb() here needs a good comment on what the matching smp_wmb() is,
and why it's needed. Or maybe it should be an smp_load_acquire()?
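
For reference, the acquire variant would fold the barrier into the head
read in io_zc_rx_cqring_entries() (sketch only, pairing with the
release-store of cq.head on the consumer side):

	static inline u32 io_zc_rx_cqring_entries(struct io_zc_rx_ifq *ifq)
	{
		struct io_rbuf_ring *ring = ifq->ring;

		return ifq->cached_cq_tail - smp_load_acquire(&ring->cq.head);
	}

and the bare smp_rmb() above would then go away.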

> +
> +static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
> +			   int off, int len, unsigned sock_idx)
> +{
> +	off += skb_frag_off(frag);
> +
> +	if (likely(page_is_page_pool_iov(frag->bv_page))) {
> +		struct io_uring_rbuf_cqe *cqe;
> +		struct io_zc_rx_buf *buf;
> +		struct page_pool_iov *ppiov;
> +
> +		ppiov = page_to_page_pool_iov(frag->bv_page);
> +		if (ppiov->pp->p.memory_provider != PP_MP_IOU_ZCRX ||
> +		    ppiov->pp->mp_priv != ifq)
> +			return -EFAULT;
> +
> +		cqe = io_zc_get_rbuf_cqe(ifq);
> +		if (!cqe)
> +			return -ENOBUFS;
> +
> +		buf = io_iov_to_buf(ppiov);
> +		io_zc_rx_get_buf_uref(buf);
> +
> +		cqe->region = 0;
> +		cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
> +		cqe->len = len;
> +		cqe->sock = sock_idx;
> +		cqe->flags = 0;
> +	} else {
> +		return -EOPNOTSUPP;
> +	}
> +
> +	return len;
> +}

I think this would read a lot better as:

	if (unlikely(!page_is_page_pool_iov(frag->bv_page)))
		return -EOPNOTSUPP;

	...
	return len;

> +zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
> +	       unsigned int offset, size_t len)
> +{
> +	struct io_zc_rx_args *args = desc->arg.data;
> +	struct io_zc_rx_ifq *ifq = args->ifq;
> +	struct socket *sock = args->sock;
> +	unsigned sock_idx = sock->zc_rx_idx & IO_ZC_IFQ_IDX_MASK;
> +	struct sk_buff *frag_iter;
> +	unsigned start, start_off;
> +	int i, copy, end, off;
> +	int ret = 0;
> +
> +	start = skb_headlen(skb);
> +	start_off = offset;
> +
> +	if (offset < start)
> +		return -EOPNOTSUPP;
> +
> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> +		const skb_frag_t *frag;
> +
> +		WARN_ON(start > offset + len);

This probably can't happen, but should it abort if it did?
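
If it is worth acting on, one option (sketch) would be to bail out while
still publishing whatever was already queued:

	if (WARN_ON_ONCE(start > offset + len)) {
		ret = -EFAULT;
		goto out;
	}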

> +
> +		frag = &skb_shinfo(skb)->frags[i];
> +		end = start + skb_frag_size(frag);
> +
> +		if (offset < end) {
> +			copy = end - offset;
> +			if (copy > len)
> +				copy = len;
> +
> +			off = offset - start;
> +			ret = zc_rx_recv_frag(ifq, frag, off, copy, sock_idx);
> +			if (ret < 0)
> +				goto out;
> +
> +			offset += ret;
> +			len -= ret;
> +			if (len == 0 || ret != copy)
> +				goto out;
> +		}
> +		start = end;
> +	}
> +
> +	skb_walk_frags(skb, frag_iter) {
> +		WARN_ON(start > offset + len);
> +
> +		end = start + frag_iter->len;
> +		if (offset < end) {
> +			copy = end - offset;
> +			if (copy > len)
> +				copy = len;
> +
> +			off = offset - start;
> +			ret = zc_rx_recv_skb(desc, frag_iter, off, copy);
> +			if (ret < 0)
> +				goto out;
> +
> +			offset += ret;
> +			len -= ret;
> +			if (len == 0 || ret != copy)
> +				goto out;
> +		}
> +		start = end;
> +	}
> +
> +out:
> +	smp_store_release(&ifq->ring->cq.tail, ifq->cached_cq_tail);
> +	if (offset == start_off)
> +		return ret;
> +	return offset - start_off;
> +}
> +
> +static int io_zc_rx_tcp_read(struct io_zc_rx_ifq *ifq, struct sock *sk)
> +{
> +	struct io_zc_rx_args args = {
> +		.ifq = ifq,
> +		.sock = sk->sk_socket,
> +	};
> +	read_descriptor_t rd_desc = {
> +		.count = 1,
> +		.arg.data = &args,
> +	};
> +
> +	return tcp_read_sock(sk, &rd_desc, zc_rx_recv_skb);
> +}
> +
> +static int io_zc_rx_tcp_recvmsg(struct io_zc_rx_ifq *ifq, struct sock *sk,
> +				unsigned int recv_limit,
> +				int flags, int *addr_len)
> +{
> +	size_t used;
> +	long timeo;
> +	int ret;
> +
> +	ret = used = 0;
> +
> +	lock_sock(sk);
> +
> +	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
> +	while (recv_limit) {
> +		ret = io_zc_rx_tcp_read(ifq, sk);
> +		if (ret < 0)
> +			break;
> +		if (!ret) {
> +			if (used)
> +				break;
> +			if (sock_flag(sk, SOCK_DONE))
> +				break;
> +			if (sk->sk_err) {
> +				ret = sock_error(sk);
> +				break;
> +			}
> +			if (sk->sk_shutdown & RCV_SHUTDOWN)
> +				break;
> +			if (sk->sk_state == TCP_CLOSE) {
> +				ret = -ENOTCONN;
> +				break;
> +			}
> +			if (!timeo) {
> +				ret = -EAGAIN;
> +				break;
> +			}
> +			if (!skb_queue_empty(&sk->sk_receive_queue))
> +				break;
> +			sk_wait_data(sk, &timeo, NULL);
> +			if (signal_pending(current)) {
> +				ret = sock_intr_errno(timeo);
> +				break;
> +			}
> +			continue;
> +		}
> +		recv_limit -= ret;
> +		used += ret;
> +
> +		if (!timeo)
> +			break;
> +		release_sock(sk);
> +		lock_sock(sk);
> +
> +		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
> +		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
> +		    signal_pending(current))
> +			break;
> +	}
> +	release_sock(sk);
> +	/* TODO: handle timestamping */
> +	return used ? used : ret;
> +}
> +
> +int io_zc_rx_recv(struct io_zc_rx_ifq *ifq, struct socket *sock,
> +		  unsigned int limit, unsigned int flags)
> +{
> +	struct sock *sk = sock->sk;
> +	const struct proto *prot;
> +	int addr_len = 0;
> +	int ret;
> +
> +	if (flags & MSG_ERRQUEUE)
> +		return -EOPNOTSUPP;
> +
> +	prot = READ_ONCE(sk->sk_prot);
> +	if (prot->recvmsg != tcp_recvmsg)
> +		return -EPROTONOSUPPORT;
> +
> +	sock_rps_record_flow(sk);
> +
> +	ret = io_zc_rx_tcp_recvmsg(ifq, sk, limit, flags, &addr_len);
> +
> +	return ret;
> +}


return io_zc_rx_tcp_recvmsg(ifq, sk, limit, flags, &addr_len);

and then you can remove 'int ret' as well.
Pavel Begunkov Dec. 20, 2023, 5:04 p.m. UTC | #2
On 12/20/23 16:27, Jens Axboe wrote:
> On 12/19/23 2:03 PM, David Wei wrote:
>> diff --git a/io_uring/net.c b/io_uring/net.c
>> index 454ba301ae6b..7a2aadf6962c 100644
>> --- a/io_uring/net.c
>> +++ b/io_uring/net.c
>> @@ -637,7 +647,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>   	unsigned int cflags;
>>   
>>   	cflags = io_put_kbuf(req, issue_flags);
>> -	if (msg->msg_inq && msg->msg_inq != -1)
>> +	if (msg && msg->msg_inq && msg->msg_inq != -1)
>>   		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>>   
>>   	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
>> @@ -652,7 +662,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>   			io_recv_prep_retry(req);
>>   			/* Known not-empty or unknown state, retry */
>>   			if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
>> -			    msg->msg_inq == -1)
>> +			    (msg && msg->msg_inq == -1))
>>   				return false;
>>   			if (issue_flags & IO_URING_F_MULTISHOT)
>>   				*ret = IOU_ISSUE_SKIP_COMPLETE;
> 
> These are a bit ugly, just pass in a dummy msg for this?
> 
>> +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>> +{
>> +	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>> +	struct socket *sock;
>> +	unsigned flags;
>> +	int ret, min_ret = 0;
>> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>> +	struct io_zc_rx_ifq *ifq;
> 
> Eg
> 	struct msghdr dummy_msg;
> 
> 	dummy_msg.msg_inq = -1;
> 
> which will eat some stack, but probably not really an issue.
> 
> 
>> +	if (issue_flags & IO_URING_F_UNLOCKED)
>> +		return -EAGAIN;
> 
> This seems odd, why? If we're called with IO_URING_F_UNLOCKED set, then

It's my addition, let me explain.

io_recvzc() -> io_zc_rx_recv() -> ... -> zc_rx_recv_frag()

This chain posts completions to a buffer completion queue, and
we don't want extra locking to share it with io-wq threads. In
some sense it's quite similar to the CQ locking, considering
we restrict zc to DEFER_TASKRUN. And it doesn't change anything
anyway, because multishots cannot post completions from io-wq
and are executed from the poll callback in task work.

> it's from io-wq. And returning -EAGAIN there will not do anything to

It will. It's supposed to just requeue for polling (it's not
IOPOLL, so it won't keep retrying -EAGAIN), just like multishots do.

Double checking the code, it can actually terminate the request,
which doesn't make much difference for us because multishots
should normally never end up in io-wq anyway, but I guess we
can improve it a little bit.

And it should also use IO_URING_F_IOWQ, forgot I split it out
from *F_UNLOCK.
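
Something along those lines then (sketch; assumes the IO_URING_F_IOWQ
split is applied, plus an explanatory comment):

	/*
	 * The rbuf completion queue is only written from task context,
	 * since the ifq requires DEFER_TASKRUN, so don't run from io-wq.
	 * Returning -EAGAIN pushes the request back towards poll/task work.
	 */
	if (issue_flags & IO_URING_F_IOWQ)
		return -EAGAIN;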

> change that. Usually this check is done to lock if we don't have it
> already, eg with io_ring_submit_unlock(). Maybe I'm missing something
> here!
> 
>> @@ -590,5 +603,230 @@ const struct pp_memory_provider_ops io_uring_pp_zc_ops = {
>>   };
>>   EXPORT_SYMBOL(io_uring_pp_zc_ops);
>>   
>> +static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *ifq)
>> +{
>> +	struct io_uring_rbuf_cqe *cqe;
>> +	unsigned int cq_idx, queued, free, entries;
>> +	unsigned int mask = ifq->cq_entries - 1;
>> +
>> +	cq_idx = ifq->cached_cq_tail & mask;
>> +	smp_rmb();
>> +	queued = min(io_zc_rx_cqring_entries(ifq), ifq->cq_entries);
>> +	free = ifq->cq_entries - queued;
>> +	entries = min(free, ifq->cq_entries - cq_idx);
>> +	if (!entries)
>> +		return NULL;
>> +
>> +	cqe = &ifq->cqes[cq_idx];
>> +	ifq->cached_cq_tail++;
>> +	return cqe;
>> +}
> 
> smp_rmb() here needs a good comment on what the matching smp_wmb() is,
> and why it's needed. Or maybe it should be an smp_load_acquire()?
> 
>> +
>> +static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
>> +			   int off, int len, unsigned sock_idx)
>> +{
>> +	off += skb_frag_off(frag);
>> +
>> +	if (likely(page_is_page_pool_iov(frag->bv_page))) {
>> +		struct io_uring_rbuf_cqe *cqe;
>> +		struct io_zc_rx_buf *buf;
>> +		struct page_pool_iov *ppiov;
>> +
>> +		ppiov = page_to_page_pool_iov(frag->bv_page);
>> +		if (ppiov->pp->p.memory_provider != PP_MP_IOU_ZCRX ||
>> +		    ppiov->pp->mp_priv != ifq)
>> +			return -EFAULT;
>> +
>> +		cqe = io_zc_get_rbuf_cqe(ifq);
>> +		if (!cqe)
>> +			return -ENOBUFS;
>> +
>> +		buf = io_iov_to_buf(ppiov);
>> +		io_zc_rx_get_buf_uref(buf);
>> +
>> +		cqe->region = 0;
>> +		cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
>> +		cqe->len = len;
>> +		cqe->sock = sock_idx;
>> +		cqe->flags = 0;
>> +	} else {
>> +		return -EOPNOTSUPP;
>> +	}
>> +
>> +	return len;
>> +}
> 
> I think this would read a lot better as:
> 
> 	if (unlikely(!page_is_page_pool_iov(frag->bv_page)))
> 		return -EOPNOTSUPP;

That's a bit of oracle coding, this branch is implemented in
a later patch.

> 
> 	...
> 	return len;
>
Jens Axboe Dec. 20, 2023, 6:09 p.m. UTC | #3
On 12/20/23 10:04 AM, Pavel Begunkov wrote:
> On 12/20/23 16:27, Jens Axboe wrote:
>> On 12/19/23 2:03 PM, David Wei wrote:
>>> diff --git a/io_uring/net.c b/io_uring/net.c
>>> index 454ba301ae6b..7a2aadf6962c 100644
>>> --- a/io_uring/net.c
>>> +++ b/io_uring/net.c
>>> @@ -637,7 +647,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>       unsigned int cflags;
>>>         cflags = io_put_kbuf(req, issue_flags);
>>> -    if (msg->msg_inq && msg->msg_inq != -1)
>>> +    if (msg && msg->msg_inq && msg->msg_inq != -1)
>>>           cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>>>         if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
>>> @@ -652,7 +662,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>               io_recv_prep_retry(req);
>>>               /* Known not-empty or unknown state, retry */
>>>               if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
>>> -                msg->msg_inq == -1)
>>> +                (msg && msg->msg_inq == -1))
>>>                   return false;
>>>               if (issue_flags & IO_URING_F_MULTISHOT)
>>>                   *ret = IOU_ISSUE_SKIP_COMPLETE;
>>
>> These are a bit ugly, just pass in a dummy msg for this?
>>
>>> +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>>> +{
>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>> +    struct socket *sock;
>>> +    unsigned flags;
>>> +    int ret, min_ret = 0;
>>> +    bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>>> +    struct io_zc_rx_ifq *ifq;
>>
>> Eg
>>     struct msghdr dummy_msg;
>>
>>     dummy_msg.msg_inq = -1;
>>
>> which will eat some stack, but probably not really an issue.
>>
>>
>>> +    if (issue_flags & IO_URING_F_UNLOCKED)
>>> +        return -EAGAIN;
>>
>> This seems odd, why? If we're called with IO_URING_F_UNLOCKED set, then
> 
> It's my addition, let me explain.
> 
> io_recvzc() -> io_zc_rx_recv() -> ... -> zc_rx_recv_frag()
> 
> This chain posts completions to a buffer completion queue, and
> we don't want extra locking to share it with io-wq threads. In
> some sense it's quite similar to the CQ locking, considering
> we restrict zc to DEFER_TASKRUN. And doesn't change anything
> anyway because multishot cannot post completions from io-wq
> and are executed from the poll callback in task work.
> 
>> it's from io-wq. And returning -EAGAIN there will not do anything to
> 
> It will. It's supposed to just requeue for polling (it's not
> IOPOLL to keep retrying -EAGAIN), just like multishots do.

It definitely needs a good comment, as it's highly non-obvious when
reading the code!

> Double checking the code, it can actually terminate the request,
> which doesn't make much difference for us because multishots
> should normally never end up in io-wq anyway, but I guess we
> can improve it a little bit.

Right, the assumption seems to be that -EAGAIN will lead to poll arm,
which seems a bit fragile.

> And it should also use IO_URING_F_IOWQ, forgot I split out
> it from *F_UNLOCK.

Yep, that'd be clearer.

>>> +static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
>>> +               int off, int len, unsigned sock_idx)
>>> +{
>>> +    off += skb_frag_off(frag);
>>> +
>>> +    if (likely(page_is_page_pool_iov(frag->bv_page))) {
>>> +        struct io_uring_rbuf_cqe *cqe;
>>> +        struct io_zc_rx_buf *buf;
>>> +        struct page_pool_iov *ppiov;
>>> +
>>> +        ppiov = page_to_page_pool_iov(frag->bv_page);
>>> +        if (ppiov->pp->p.memory_provider != PP_MP_IOU_ZCRX ||
>>> +            ppiov->pp->mp_priv != ifq)
>>> +            return -EFAULT;
>>> +
>>> +        cqe = io_zc_get_rbuf_cqe(ifq);
>>> +        if (!cqe)
>>> +            return -ENOBUFS;
>>> +
>>> +        buf = io_iov_to_buf(ppiov);
>>> +        io_zc_rx_get_buf_uref(buf);
>>> +
>>> +        cqe->region = 0;
>>> +        cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
>>> +        cqe->len = len;
>>> +        cqe->sock = sock_idx;
>>> +        cqe->flags = 0;
>>> +    } else {
>>> +        return -EOPNOTSUPP;
>>> +    }
>>> +
>>> +    return len;
>>> +}
>>
>> I think this would read a lot better as:
>>
>>     if (unlikely(!page_is_page_pool_iov(frag->bv_page)))
>>         return -EOPNOTSUPP;
> 
> That's a bit of oracle coding, this branch is implemented in
> a later patch.

Oracle coding?

Each patch stands separately, there's no reason not to make this one as
clean as it can be. And an error case with the main bits inline is a lot
nicer imho than two separate indented parts. For the latter addition
instead of the -EOPNOTSUPP, would probably be nice to have it in a
separate function. Probably ditto for the page pool case here now, would
make the later patch simpler too.
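
Concretely, the split being suggested could look roughly like this
(sketch derived from the code in this patch; zc_rx_recv_pp_frag is a
made-up helper name):

static int zc_rx_recv_pp_frag(struct io_zc_rx_ifq *ifq,
			      struct page_pool_iov *ppiov,
			      int off, int len, unsigned sock_idx)
{
	struct io_uring_rbuf_cqe *cqe;
	struct io_zc_rx_buf *buf;

	if (ppiov->pp->p.memory_provider != PP_MP_IOU_ZCRX ||
	    ppiov->pp->mp_priv != ifq)
		return -EFAULT;

	cqe = io_zc_get_rbuf_cqe(ifq);
	if (!cqe)
		return -ENOBUFS;

	buf = io_iov_to_buf(ppiov);
	io_zc_rx_get_buf_uref(buf);

	cqe->region = 0;
	cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
	cqe->len = len;
	cqe->sock = sock_idx;
	cqe->flags = 0;
	return len;
}

static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
			   int off, int len, unsigned sock_idx)
{
	off += skb_frag_off(frag);

	if (unlikely(!page_is_page_pool_iov(frag->bv_page)))
		return -EOPNOTSUPP;

	return zc_rx_recv_pp_frag(ifq, page_to_page_pool_iov(frag->bv_page),
				  off, len, sock_idx);
}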
Pavel Begunkov Dec. 21, 2023, 6:59 p.m. UTC | #4
On 12/20/23 18:09, Jens Axboe wrote:
> On 12/20/23 10:04 AM, Pavel Begunkov wrote:
>> On 12/20/23 16:27, Jens Axboe wrote:
>>> On 12/19/23 2:03 PM, David Wei wrote:
>>>> diff --git a/io_uring/net.c b/io_uring/net.c
>>>> index 454ba301ae6b..7a2aadf6962c 100644
>>>> --- a/io_uring/net.c
>>>> +++ b/io_uring/net.c
>>>> @@ -637,7 +647,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>>        unsigned int cflags;
>>>>          cflags = io_put_kbuf(req, issue_flags);
>>>> -    if (msg->msg_inq && msg->msg_inq != -1)
>>>> +    if (msg && msg->msg_inq && msg->msg_inq != -1)
>>>>            cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>>>>          if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
>>>> @@ -652,7 +662,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>>                io_recv_prep_retry(req);
>>>>                /* Known not-empty or unknown state, retry */
>>>>                if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
>>>> -                msg->msg_inq == -1)
>>>> +                (msg && msg->msg_inq == -1))
>>>>                    return false;
>>>>                if (issue_flags & IO_URING_F_MULTISHOT)
>>>>                    *ret = IOU_ISSUE_SKIP_COMPLETE;
>>>
>>> These are a bit ugly, just pass in a dummy msg for this?
>>>
>>>> +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>>>> +{
>>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>>> +    struct socket *sock;
>>>> +    unsigned flags;
>>>> +    int ret, min_ret = 0;
>>>> +    bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>>>> +    struct io_zc_rx_ifq *ifq;
>>>
>>> Eg
>>>      struct msghdr dummy_msg;
>>>
>>>      dummy_msg.msg_inq = -1;
>>>
>>> which will eat some stack, but probably not really an issue.
>>>
>>>
>>>> +    if (issue_flags & IO_URING_F_UNLOCKED)
>>>> +        return -EAGAIN;
>>>
>>> This seems odd, why? If we're called with IO_URING_F_UNLOCKED set, then
>>
>> It's my addition, let me explain.
>>
>> io_recvzc() -> io_zc_rx_recv() -> ... -> zc_rx_recv_frag()
>>
>> This chain posts completions to a buffer completion queue, and
>> we don't want extra locking to share it with io-wq threads. In
>> some sense it's quite similar to the CQ locking, considering
>> we restrict zc to DEFER_TASKRUN. And doesn't change anything
>> anyway because multishot cannot post completions from io-wq
>> and are executed from the poll callback in task work.
>>
>>> it's from io-wq. And returning -EAGAIN there will not do anything to
>>
>> It will. It's supposed to just requeue for polling (it's not
>> IOPOLL to keep retrying -EAGAIN), just like multishots do.
> 
> It definitely needs a good comment, as it's highly non-obvious when
> reading the code!
> 
>> Double checking the code, it can actually terminate the request,
>> which doesn't make much difference for us because multishots
>> should normally never end up in io-wq anyway, but I guess we
>> can improve it a little bit.
> 
> Right, assumptions seems to be that -EAGAIN will lead to poll arm, which
> seems a bit fragile.

The main assumption is that io-wq will eventually leave the
request alone and push it somewhere else, either queuing for
polling or terminating, which is more than reasonable. I'd
add that it's rather insane for io-wq to spin indefinitely
on -EAGAIN, but that has long been fixed (for !IOPOLL).

As said, can be made a bit better, but it won't change anything
for real life execution, multishots would never end up there
after they start listening for poll events.

>> And it should also use IO_URING_F_IOWQ, forgot I split out
>> it from *F_UNLOCK.
> 
> Yep, that'd be clearer.

Not "clearer", but more correct. Even though it's not
a bug because deps between the flags.

>>>> +static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
>>>> +               int off, int len, unsigned sock_idx)
>>>> +{
>>>> +    off += skb_frag_off(frag);
>>>> +
>>>> +    if (likely(page_is_page_pool_iov(frag->bv_page))) {
>>>> +        struct io_uring_rbuf_cqe *cqe;
>>>> +        struct io_zc_rx_buf *buf;
>>>> +        struct page_pool_iov *ppiov;
>>>> +
>>>> +        ppiov = page_to_page_pool_iov(frag->bv_page);
>>>> +        if (ppiov->pp->p.memory_provider != PP_MP_IOU_ZCRX ||
>>>> +            ppiov->pp->mp_priv != ifq)
>>>> +            return -EFAULT;
>>>> +
>>>> +        cqe = io_zc_get_rbuf_cqe(ifq);
>>>> +        if (!cqe)
>>>> +            return -ENOBUFS;
>>>> +
>>>> +        buf = io_iov_to_buf(ppiov);
>>>> +        io_zc_rx_get_buf_uref(buf);
>>>> +
>>>> +        cqe->region = 0;
>>>> +        cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
>>>> +        cqe->len = len;
>>>> +        cqe->sock = sock_idx;
>>>> +        cqe->flags = 0;
>>>> +    } else {
>>>> +        return -EOPNOTSUPP;
>>>> +    }
>>>> +
>>>> +    return len;
>>>> +}
>>>
>>> I think this would read a lot better as:
>>>
>>>      if (unlikely(!page_is_page_pool_iov(frag->bv_page)))
>>>          return -EOPNOTSUPP;
>>
>> That's a bit of oracle coding, this branch is implemented in
>> a later patch.
> 
> Oracle coding?

I.e. knowing how later patches (should) look like.

> Each patch stands separately, there's no reason not to make this one as

They are not standalone; you cannot sanely develop anything without
thinking about how and where it's used, otherwise you'd get a set of
functions full of sleeping calls that are later used in irq context, or
code that just doesn't fit into the desired framework. By extension,
code is often written while trying to look a step ahead. For example,
the first patches don't push everything into io_uring.c just to
wholesale move it into zc_rx.c later for exceeding some size threshold.

> clean as it can be. And an error case with the main bits inline is a lot

I agree that it should be clean above all, but it _is_ clean
and readable; all else is stylistic nitpicking. And maybe it's
just my opinion, but I also personally appreciate when a patch is
easy to review, which includes not restructuring everything written
before with every patch, and which also helps with backporting and
other development aspects.

> nicer imho than two separate indented parts. For the latter addition
> instead of the -EOPNOTSUPP, would probably be nice to have it in a
> separate function. Probably ditto for the page pool case here now, would
> make the later patch simpler too.

If we'd need it in the future, we'll change it then, patches
stand separately, at least it's IMHO not needed in the current
series.
Jens Axboe Dec. 21, 2023, 9:32 p.m. UTC | #5
On 12/21/23 11:59 AM, Pavel Begunkov wrote:
> On 12/20/23 18:09, Jens Axboe wrote:
>> On 12/20/23 10:04 AM, Pavel Begunkov wrote:
>>> On 12/20/23 16:27, Jens Axboe wrote:
>>>> On 12/19/23 2:03 PM, David Wei wrote:
>>>>> diff --git a/io_uring/net.c b/io_uring/net.c
>>>>> index 454ba301ae6b..7a2aadf6962c 100644
>>>>> --- a/io_uring/net.c
>>>>> +++ b/io_uring/net.c
>>>>> @@ -637,7 +647,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>>>        unsigned int cflags;
>>>>>          cflags = io_put_kbuf(req, issue_flags);
>>>>> -    if (msg->msg_inq && msg->msg_inq != -1)
>>>>> +    if (msg && msg->msg_inq && msg->msg_inq != -1)
>>>>>            cflags |= IORING_CQE_F_SOCK_NONEMPTY;
>>>>>          if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
>>>>> @@ -652,7 +662,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
>>>>>                io_recv_prep_retry(req);
>>>>>                /* Known not-empty or unknown state, retry */
>>>>>                if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
>>>>> -                msg->msg_inq == -1)
>>>>> +                (msg && msg->msg_inq == -1))
>>>>>                    return false;
>>>>>                if (issue_flags & IO_URING_F_MULTISHOT)
>>>>>                    *ret = IOU_ISSUE_SKIP_COMPLETE;
>>>>
>>>> These are a bit ugly, just pass in a dummy msg for this?
>>>>
>>>>> +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>>>>> +{
>>>>> +    struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>>>>> +    struct socket *sock;
>>>>> +    unsigned flags;
>>>>> +    int ret, min_ret = 0;
>>>>> +    bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>>>>> +    struct io_zc_rx_ifq *ifq;
>>>>
>>>> Eg
>>>>      struct msghdr dummy_msg;
>>>>
>>>>      dummy_msg.msg_inq = -1;
>>>>
>>>> which will eat some stack, but probably not really an issue.
>>>>
>>>>
>>>>> +    if (issue_flags & IO_URING_F_UNLOCKED)
>>>>> +        return -EAGAIN;
>>>>
>>>> This seems odd, why? If we're called with IO_URING_F_UNLOCKED set, then
>>>
>>> It's my addition, let me explain.
>>>
>>> io_recvzc() -> io_zc_rx_recv() -> ... -> zc_rx_recv_frag()
>>>
>>> This chain posts completions to a buffer completion queue, and
>>> we don't want extra locking to share it with io-wq threads. In
>>> some sense it's quite similar to the CQ locking, considering
>>> we restrict zc to DEFER_TASKRUN. And doesn't change anything
>>> anyway because multishot cannot post completions from io-wq
>>> and are executed from the poll callback in task work.
>>>
>>>> it's from io-wq. And returning -EAGAIN there will not do anything to
>>>
>>> It will. It's supposed to just requeue for polling (it's not
>>> IOPOLL to keep retrying -EAGAIN), just like multishots do.
>>
>> It definitely needs a good comment, as it's highly non-obvious when
>> reading the code!
>>
>>> Double checking the code, it can actually terminate the request,
>>> which doesn't make much difference for us because multishots
>>> should normally never end up in io-wq anyway, but I guess we
>>> can improve it a liitle bit.
>>
>> Right, assumptions seems to be that -EAGAIN will lead to poll arm, which
>> seems a bit fragile.
> 
> The main assumption is that io-wq will eventually leave the
> request alone and push it somewhere else, either queuing for
> polling or terminating, which is more than reasonable. I'd

But surely you don't want it terminated from here? It seems like a very
odd choice. As it stands, if you end up doing more than one loop, then
it won't arm poll and it'll get failed.

> add that it's rather insane for io-wq indefinitely spinning
> on -EAGAIN, but it has long been fixed (for !IOPOLL).

There's no other choice for polling, and it doesn't do it for
non-polling. The current logic makes sense - if you do a blocking
attempt and you get -EAGAIN, that's really the final result and you
cannot sanely retry for !IOPOLL in that case. Before we did poll arm for
io-wq, even the first -EAGAIN would've terminated it. Relying on -EAGAIN
from a blocking attempt to do anything but fail the request with -EAGAIN
res is pretty fragile and odd, I think that needs sorting out.

> As said, can be made a bit better, but it won't change anything
> for real life execution, multishots would never end up there
> after they start listening for poll events.

Right, probably won't ever be a thing for !multishot. As mentioned in my
original reply, it really just needs a comment explaining exactly what
it's doing and why it's fine.

>>> And it should also use IO_URING_F_IOWQ, forgot I split out
>>> it from *F_UNLOCK.
>>
>> Yep, that'd be clearer.
> 
> Not "clearer", but more correct. Even though it's not
> a bug because deps between the flags.

Both clearer and more correct, I would say.

>>>>> +static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
>>>>> +               int off, int len, unsigned sock_idx)
>>>>> +{
>>>>> +    off += skb_frag_off(frag);
>>>>> +
>>>>> +    if (likely(page_is_page_pool_iov(frag->bv_page))) {
>>>>> +        struct io_uring_rbuf_cqe *cqe;
>>>>> +        struct io_zc_rx_buf *buf;
>>>>> +        struct page_pool_iov *ppiov;
>>>>> +
>>>>> +        ppiov = page_to_page_pool_iov(frag->bv_page);
>>>>> +        if (ppiov->pp->p.memory_provider != PP_MP_IOU_ZCRX ||
>>>>> +            ppiov->pp->mp_priv != ifq)
>>>>> +            return -EFAULT;
>>>>> +
>>>>> +        cqe = io_zc_get_rbuf_cqe(ifq);
>>>>> +        if (!cqe)
>>>>> +            return -ENOBUFS;
>>>>> +
>>>>> +        buf = io_iov_to_buf(ppiov);
>>>>> +        io_zc_rx_get_buf_uref(buf);
>>>>> +
>>>>> +        cqe->region = 0;
>>>>> +        cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
>>>>> +        cqe->len = len;
>>>>> +        cqe->sock = sock_idx;
>>>>> +        cqe->flags = 0;
>>>>> +    } else {
>>>>> +        return -EOPNOTSUPP;
>>>>> +    }
>>>>> +
>>>>> +    return len;
>>>>> +}
>>>>
>>>> I think this would read a lot better as:
>>>>
>>>>      if (unlikely(!page_is_page_pool_iov(frag->bv_page)))
>>>>          return -EOPNOTSUPP;
>>>
>>> That's a bit of oracle coding, this branch is implemented in
>>> a later patch.
>>
>> Oracle coding?
> 
> I.e. knowing how later patches (should) look like.
> 
>> Each patch stands separately, there's no reason not to make this one as
> 
> They are not standalone, you cannot sanely develop anything not
> thinking how and where it's used, otherwise you'd get a set of
> functions full of sleeping but later used in irq context or just
> unfittable into a desired framework. By extent, code often is
> written while trying to look a step ahead. For example, first
> patches don't push everything into io_uring.c just to wholesale
> move it into zc_rx.c because of exceeding some size threshold.

Yes, this is how most patch series are - they will compile separately,
but obviously won't necessarily make sense or be functional until you
get to the end. But since you very much do have future knowledge in
these patches, there's no excuse for not making them interact with each
other better. Each patch should not pretend it doesn't know what comes
next in a series, if you can make a followup patch simpler with a tweak
to a previous patch, that is definitely a good idea.

And here, even the end result would be better imho without having

if (a) {
	big blob of stuff
} else {
	other blob of stuff
}

when it could just be

if (a)
	return big_blog_of_stuff();

return other_blog_of_stuff();

instead.

>> clean as it can be. And an error case with the main bits inline is a lot
> 
> I agree that it should be clean among all, but it _is_ clean
> and readable, all else is stylistic nit picking. And maybe it's
> just my opinion, but I also personally appreciate when a patch is
> easy to review, which includes not restructuring all written before
> with every patch, which also helps with back porting and other
> developing aspects.

But that's basically my point, it even makes followup patches simpler to
read as well. Is it stylistic? Certainly, I just prefer having the above
rather than two big indentations. But it also makes the followup patch
simpler and it's basically a one-liner change at that point, and a
bigger hunk of added code that's the new function that handles the new
case.

>> nicer imho than two separate indented parts. For the latter addition
>> instead of the -EOPNOTSUPP, would probably be nice to have it in a
>> separate function. Probably ditto for the page pool case here now, would
>> make the later patch simpler too.
> 
> If we'd need it in the future, we'll change it then, patches
> stand separately, at least it's IMHO not needed in the current
> series.

It's still an RFC series, please do change it for v4.
Pavel Begunkov Dec. 30, 2023, 9:15 p.m. UTC | #6
On 12/21/23 21:32, Jens Axboe wrote:
> On 12/21/23 11:59 AM, Pavel Begunkov wrote:
>> On 12/20/23 18:09, Jens Axboe wrote:
>>> On 12/20/23 10:04 AM, Pavel Begunkov wrote:
>>>> On 12/20/23 16:27, Jens Axboe wrote:
>>>>> On 12/19/23 2:03 PM, David Wei wrote:
>>>>>> diff --git a/io_uring/net.c b/io_uring/net.c
>>>>>> index 454ba301ae6b..7a2aadf6962c 100644
>>>>>> --- a/io_uring/net.c
>>>>>> +++ b/io_uring/net.c
[...]
>>>>>> +    if (issue_flags & IO_URING_F_UNLOCKED)
>>>>>> +        return -EAGAIN;
>>>>>
>>>>> This seems odd, why? If we're called with IO_URING_F_UNLOCKED set, then
>>>>
>>>> It's my addition, let me explain.
>>>>
>>>> io_recvzc() -> io_zc_rx_recv() -> ... -> zc_rx_recv_frag()
>>>>
>>>> This chain posts completions to a buffer completion queue, and
>>>> we don't want extra locking to share it with io-wq threads. In
>>>> some sense it's quite similar to the CQ locking, considering
>>>> we restrict zc to DEFER_TASKRUN. And doesn't change anything
>>>> anyway because multishot cannot post completions from io-wq
>>>> and are executed from the poll callback in task work.
>>>>
>>>>> it's from io-wq. And returning -EAGAIN there will not do anything to
>>>>
>>>> It will. It's supposed to just requeue for polling (it's not
>>>> IOPOLL to keep retrying -EAGAIN), just like multishots do.
>>>
>>> It definitely needs a good comment, as it's highly non-obvious when
>>> reading the code!
>>>
>>>> Double checking the code, it can actually terminate the request,
>>>> which doesn't make much difference for us because multishots
>>>> should normally never end up in io-wq anyway, but I guess we
>>>> can improve it a little bit.
>>>
>>> Right, assumptions seems to be that -EAGAIN will lead to poll arm, which
>>> seems a bit fragile.
>>
>> The main assumption is that io-wq will eventually leave the
>> request alone and push it somewhere else, either queuing for
>> polling or terminating, which is more than reasonable. I'd
> 
> But surely you don't want it terminated from here? It seems like a very
> odd choice. As it stands, if you end up doing more than one loop, then
> it won't arm poll and it'll get failed.
>> add that it's rather insane for io-wq indefinitely spinning
>> on -EAGAIN, but it has long been fixed (for !IOPOLL).
> 
> There's no other choice for polling, and it doesn't do it for

zc rx is !IOPOLL, that's what I care about.

> non-polling. The current logic makes sense - if you do a blocking
> attempt and you get -EAGAIN, that's really the final result and you
> cannot sanely retry for !IOPOLL in that case. Before we did poll arm for
> io-wq, even the first -EAGAIN would've terminated it. Relying on -EAGAIN
> from a blocking attempt to do anything but fail the request with -EAGAIN
> res is pretty fragile and odd, I think that needs sorting out.
> 
>> As said, can be made a bit better, but it won't change anything
>> for real life execution, multishots would never end up there
>> after they start listening for poll events.
> 
> Right, probably won't ever be a thing for !multishot. As mentioned in my
> original reply, it really just needs a comment explaining exactly what
> it's doing and why it's fine.
> 
>>>> And it should also use IO_URING_F_IOWQ, forgot I split out
>>>> it from *F_UNLOCK.
>>>
>>> Yep, that'd be clearer.
>>
>> Not "clearer", but more correct. Even though it's not
>> a bug because deps between the flags.
> 
> Both clearer and more correct, I would say.
> 
[...]
>>>
>>> Oracle coding?
>>
>> I.e. knowing how later patches (should) look like.
>>
>>> Each patch stands separately, there's no reason not to make this one as
>>
>> They are not standalone, you cannot sanely develop anything not
>> thinking how and where it's used, otherwise you'd get a set of
>> functions full of sleeping but later used in irq context or just
>> unfittable into a desired framework. By extent, code often is
>> written while trying to look a step ahead. For example, first
>> patches don't push everything into io_uring.c just to wholesale
>> move it into zc_rx.c because of exceeding some size threshold.
> 
> Yes, this is how most patch series are - they will compile separately,
> but obviously won't necessarily make sense or be functional until you
> get to the end. But since you very much do have future knowledge in
> these patches, there's no excuse for not making them interact with each
> other better. Each patch should not pretend it doesn't know what comes

Which is exactly the reason why it is how it is.

> next in a series, if you can make a followup patch simpler with a tweak
> to a previous patch, that is definitely a good idea.
> 
> And here, even the end result would be better imho without having
> 
> if (a) {
> 	big blob of stuff
> } else {
> 	other blob of stuff
> }
> 
> when it could just be
> 
> if (a)
> 	return big_blog_of_stuff();
> 
> return other_blog_of_stuff();
> 
> instead.

That sounds like good general advice, but the "blobs" are
neither big nor expected to grow enough to require splitting; I
can't say it makes it any cleaner or simpler.

>>> clean as it can be. And an error case with the main bits inline is a lot
>>
>> I agree that it should be clean among all, but it _is_ clean
>> and readable, all else is stylistic nit picking. And maybe it's
>> just my opinion, but I also personally appreciate when a patch is
>> easy to review, which includes not restructuring all written before
>> with every patch, which also helps with back porting and other
>> developing aspects.
> 
> But that's basically my point, it even makes followup patches simpler to
> read as well. Is it stylistic? Certainly, I just prefer having the above
> rather than two big indentations. But it also makes the followup patch
> simpler
> and it's basically a one-liner change at that point, and a
> bigger hunk of added code that's the new function that handles the new
> case.
> 
>>> nicer imho than two separate indented parts. For the latter addition
>>> instead of the -EOPNOTSUPP, would probably be nice to have it in a
>>> separate function. Probably ditto for the page pool case here now, would
>>> make the later patch simpler too.
>>
>> If we'd need it in the future, we'll change it then, patches
>> stand separately, at least it's IMHO not needed in the current
>> series.
> 
> It's still an RFC series, please do change it for v4.

It's not my patch, but I don't view it as moving the patches in
any positive direction.

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f4ba58bce3bd..f57f394744fe 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -253,6 +253,7 @@  enum io_uring_op {
 	IORING_OP_FUTEX_WAIT,
 	IORING_OP_FUTEX_WAKE,
 	IORING_OP_FUTEX_WAITV,
+	IORING_OP_RECV_ZC,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/net.c b/io_uring/net.c
index 454ba301ae6b..7a2aadf6962c 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -71,6 +71,16 @@  struct io_sr_msg {
 	struct io_kiocb 		*notif;
 };
 
+struct io_recvzc {
+	struct file			*file;
+	unsigned			len;
+	unsigned			done_io;
+	unsigned			msg_flags;
+	u16				flags;
+
+	u32				datalen;
+};
+
 static inline bool io_check_multishot(struct io_kiocb *req,
 				      unsigned int issue_flags)
 {
@@ -637,7 +647,7 @@  static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 	unsigned int cflags;
 
 	cflags = io_put_kbuf(req, issue_flags);
-	if (msg->msg_inq && msg->msg_inq != -1)
+	if (msg && msg->msg_inq && msg->msg_inq != -1)
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
 
 	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
@@ -652,7 +662,7 @@  static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 			io_recv_prep_retry(req);
 			/* Known not-empty or unknown state, retry */
 			if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
-			    msg->msg_inq == -1)
+			    (msg && msg->msg_inq == -1))
 				return false;
 			if (issue_flags & IO_URING_F_MULTISHOT)
 				*ret = IOU_ISSUE_SKIP_COMPLETE;
@@ -956,9 +966,8 @@  int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	return ret;
 }
 
-static __maybe_unused
-struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
-					struct socket *sock)
+static struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
+					      struct socket *sock)
 {
 	unsigned token = READ_ONCE(sock->zc_rx_idx);
 	unsigned ifq_idx = token >> IO_ZC_IFQ_IDX_OFFSET;
@@ -975,6 +984,106 @@  struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
 	return ifq;
 }
 
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	u64 recvzc_cmd;
+
+	recvzc_cmd = READ_ONCE(sqe->addr3);
+	zc->datalen = recvzc_cmd >> 32;
+	if (recvzc_cmd & 0xffff)
+		return -EINVAL;
+	if (!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (unlikely(sqe->file_index || sqe->addr2))
+		return -EINVAL;
+
+	zc->len = READ_ONCE(sqe->len);
+	zc->flags = READ_ONCE(sqe->ioprio);
+	if (zc->flags & ~(RECVMSG_FLAGS))
+		return -EINVAL;
+	zc->msg_flags = READ_ONCE(sqe->msg_flags);
+	if (zc->msg_flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+	if (zc->msg_flags & MSG_ERRQUEUE)
+		req->flags |= REQ_F_CLEAR_POLLIN;
+	if (zc->flags & IORING_RECV_MULTISHOT) {
+		if (zc->msg_flags & MSG_WAITALL)
+			return -EINVAL;
+		if (req->opcode == IORING_OP_RECV && zc->len)
+			return -EINVAL;
+		req->flags |= REQ_F_APOLL_MULTISHOT;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		zc->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+	zc->done_io = 0;
+	return 0;
+}
+
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	struct socket *sock;
+	unsigned flags;
+	int ret, min_ret = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	struct io_zc_rx_ifq *ifq;
+
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		return -EAGAIN;
+
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
+		return -EAGAIN;
+
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+	ifq = io_zc_verify_sock(req, sock);
+	if (!ifq)
+		return -EINVAL;
+
+retry_multishot:
+	flags = zc->msg_flags;
+	if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+	if (flags & MSG_WAITALL)
+		min_ret = zc->len;
+
+	ret = io_zc_rx_recv(ifq, sock, zc->datalen, flags);
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && force_nonblock) {
+			if (issue_flags & IO_URING_F_MULTISHOT)
+				return IOU_ISSUE_SKIP_COMPLETE;
+			return -EAGAIN;
+		}
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			zc->len -= ret;
+			zc->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		req_set_fail(req);
+	} else if ((flags & MSG_WAITALL) && (flags & (MSG_TRUNC | MSG_CTRUNC))) {
+		req_set_fail(req);
+	}
+
+	if (ret > 0)
+		ret += zc->done_io;
+	else if (zc->done_io)
+		ret = zc->done_io;
+
+	if (!io_recv_finish(req, &ret, 0, ret <= 0, issue_flags))
+		goto retry_multishot;
+
+	return ret;
+}
+
 void io_send_zc_cleanup(struct io_kiocb *req)
 {
 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 799db44283c7..a90231566d09 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -35,6 +35,7 @@ 
 #include "rw.h"
 #include "waitid.h"
 #include "futex.h"
+#include "zc_rx.h"
 
 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
 {
@@ -467,6 +468,18 @@  const struct io_issue_def io_issue_defs[] = {
 		.issue			= io_futexv_wait,
 #else
 		.prep			= io_eopnotsupp_prep,
+#endif
+	},
+	[IORING_OP_RECV_ZC] = {
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollin			= 1,
+		.ioprio			= 1,
+#if defined(CONFIG_NET)
+		.prep			= io_recvzc_prep,
+		.issue			= io_recvzc,
+#else
+		.prep			= io_eopnotsupp_prep,
 #endif
 	},
 };
@@ -704,6 +717,9 @@  const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_FUTEX_WAITV] = {
 		.name			= "FUTEX_WAITV",
 	},
+	[IORING_OP_RECV_ZC] = {
+		.name			= "RECV_ZC",
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index ff1dac24ac40..acb70ca23150 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -6,6 +6,7 @@ 
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
 #include <linux/nospec.h>
+#include <net/tcp.h>
 #include <trace/events/page_pool.h>
 
 #include <uapi/linux/io_uring.h>
@@ -15,8 +16,20 @@ 
 #include "zc_rx.h"
 #include "rsrc.h"
 
+struct io_zc_rx_args {
+	struct io_zc_rx_ifq	*ifq;
+	struct socket		*sock;
+};
+
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 
+static inline u32 io_zc_rx_cqring_entries(struct io_zc_rx_ifq *ifq)
+{
+	struct io_rbuf_ring *ring = ifq->ring;
+
+	return ifq->cached_cq_tail - READ_ONCE(ring->cq.head);
+}
+
 static inline struct device *netdev2dev(struct net_device *dev)
 {
 	return dev->dev.parent;
@@ -399,7 +412,7 @@  static inline unsigned io_buf_pgid(struct io_zc_rx_pool *pool,
 	return buf - pool->bufs;
 }
 
-static __maybe_unused void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
+static void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
 {
 	refcount_add(IO_ZC_RX_UREF, &buf->ppiov.refcount);
 }
@@ -590,5 +603,230 @@  const struct pp_memory_provider_ops io_uring_pp_zc_ops = {
 };
 EXPORT_SYMBOL(io_uring_pp_zc_ops);
 
+static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *ifq)
+{
+	struct io_uring_rbuf_cqe *cqe;
+	unsigned int cq_idx, queued, free, entries;
+	unsigned int mask = ifq->cq_entries - 1;
+
+	cq_idx = ifq->cached_cq_tail & mask;
+	smp_rmb();
+	queued = min(io_zc_rx_cqring_entries(ifq), ifq->cq_entries);
+	free = ifq->cq_entries - queued;
+	entries = min(free, ifq->cq_entries - cq_idx);
+	if (!entries)
+		return NULL;
+
+	cqe = &ifq->cqes[cq_idx];
+	ifq->cached_cq_tail++;
+	return cqe;
+}
+
+static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
+			   int off, int len, unsigned sock_idx)
+{
+	off += skb_frag_off(frag);
+
+	if (likely(page_is_page_pool_iov(frag->bv_page))) {
+		struct io_uring_rbuf_cqe *cqe;
+		struct io_zc_rx_buf *buf;
+		struct page_pool_iov *ppiov;
+
+		ppiov = page_to_page_pool_iov(frag->bv_page);
+		if (ppiov->pp->p.memory_provider != PP_MP_IOU_ZCRX ||
+		    ppiov->pp->mp_priv != ifq)
+			return -EFAULT;
+
+		cqe = io_zc_get_rbuf_cqe(ifq);
+		if (!cqe)
+			return -ENOBUFS;
+
+		buf = io_iov_to_buf(ppiov);
+		io_zc_rx_get_buf_uref(buf);
+
+		cqe->region = 0;
+		cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
+		cqe->len = len;
+		cqe->sock = sock_idx;
+		cqe->flags = 0;
+	} else {
+		return -EOPNOTSUPP;
+	}
+
+	return len;
+}
+
+static int
+zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+	       unsigned int offset, size_t len)
+{
+	struct io_zc_rx_args *args = desc->arg.data;
+	struct io_zc_rx_ifq *ifq = args->ifq;
+	struct socket *sock = args->sock;
+	unsigned sock_idx = sock->zc_rx_idx & IO_ZC_IFQ_IDX_MASK;
+	struct sk_buff *frag_iter;
+	unsigned start, start_off;
+	int i, copy, end, off;
+	int ret = 0;
+
+	start = skb_headlen(skb);
+	start_off = offset;
+
+	if (offset < start)
+		return -EOPNOTSUPP;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag;
+
+		WARN_ON(start > offset + len);
+
+		frag = &skb_shinfo(skb)->frags[i];
+		end = start + skb_frag_size(frag);
+
+		if (offset < end) {
+			copy = end - offset;
+			if (copy > len)
+				copy = len;
+
+			off = offset - start;
+			ret = zc_rx_recv_frag(ifq, frag, off, copy, sock_idx);
+			if (ret < 0)
+				goto out;
+
+			offset += ret;
+			len -= ret;
+			if (len == 0 || ret != copy)
+				goto out;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if (offset < end) {
+			copy = end - offset;
+			if (copy > len)
+				copy = len;
+
+			off = offset - start;
+			ret = zc_rx_recv_skb(desc, frag_iter, off, copy);
+			if (ret < 0)
+				goto out;
+
+			offset += ret;
+			len -= ret;
+			if (len == 0 || ret != copy)
+				goto out;
+		}
+		start = end;
+	}
+
+out:
+	smp_store_release(&ifq->ring->cq.tail, ifq->cached_cq_tail);
+	if (offset == start_off)
+		return ret;
+	return offset - start_off;
+}
+
+static int io_zc_rx_tcp_read(struct io_zc_rx_ifq *ifq, struct sock *sk)
+{
+	struct io_zc_rx_args args = {
+		.ifq = ifq,
+		.sock = sk->sk_socket,
+	};
+	read_descriptor_t rd_desc = {
+		.count = 1,
+		.arg.data = &args,
+	};
+
+	return tcp_read_sock(sk, &rd_desc, zc_rx_recv_skb);
+}
+
+static int io_zc_rx_tcp_recvmsg(struct io_zc_rx_ifq *ifq, struct sock *sk,
+				unsigned int recv_limit,
+				int flags, int *addr_len)
+{
+	size_t used;
+	long timeo;
+	int ret;
+
+	ret = used = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	while (recv_limit) {
+		ret = io_zc_rx_tcp_read(ifq, sk);
+		if (ret < 0)
+			break;
+		if (!ret) {
+			if (used)
+				break;
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				ret = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				ret = -EAGAIN;
+				break;
+			}
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				break;
+			sk_wait_data(sk, &timeo, NULL);
+			if (signal_pending(current)) {
+				ret = sock_intr_errno(timeo);
+				break;
+			}
+			continue;
+		}
+		recv_limit -= ret;
+		used += ret;
+
+		if (!timeo)
+			break;
+		release_sock(sk);
+		lock_sock(sk);
+
+		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current))
+			break;
+	}
+	release_sock(sk);
+	/* TODO: handle timestamping */
+	return used ? used : ret;
+}
+
+int io_zc_rx_recv(struct io_zc_rx_ifq *ifq, struct socket *sock,
+		  unsigned int limit, unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	const struct proto *prot;
+	int addr_len = 0;
+	int ret;
+
+	if (flags & MSG_ERRQUEUE)
+		return -EOPNOTSUPP;
+
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot->recvmsg != tcp_recvmsg)
+		return -EPROTONOSUPPORT;
+
+	sock_rps_record_flow(sk);
+
+	ret = io_zc_rx_tcp_recvmsg(ifq, sk, limit, flags, &addr_len);
+
+	return ret;
+}
 
 #endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 00d864700c67..3e8f07e4b252 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -72,4 +72,9 @@  static inline int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
 }
 #endif
 
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_zc_rx_recv(struct io_zc_rx_ifq *ifq, struct socket *sock,
+		  unsigned int limit, unsigned int flags);
+
 #endif