
[RFC,v3,07/20] io_uring: add interface queue

Message ID 20231219210357.4029713-8-dw@davidwei.uk (mailing list archive)
State New
Series Zero copy Rx using io_uring

Commit Message

David Wei Dec. 19, 2023, 9:03 p.m. UTC
From: David Wei <davidhwei@meta.com>

This patch introduces a new object in io_uring called an interface queue
(ifq) which contains:

* A pool region allocated by userspace and registered w/ io_uring where
  Rx data is written to.
* A net device and one specific Rx queue in it that will be configured
  for ZC Rx.
* A pair of shared ringbuffers w/ userspace, dubbed registered buf
  (rbuf) rings. Each entry contains a pool region id and an offset + len
  within that region. The kernel writes entries into the completion ring
  to tell userspace where RX data is relative to the start of a region.
  Userspace writes entries into the refill ring to tell the kernel when
  it is done with the data.

For now, each io_uring instance has a single ifq, and each ifq has a
single pool region associated with one Rx queue.

Add a new opcode to io_uring_register that sets up an ifq. Size and
offsets of shared ringbuffers are returned to userspace for it to mmap.
The implementation will be added in a later patch.
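
A rough userspace sketch of the intended flow (illustrative only: the mmap
offset constant below is a placeholder, and the kernel-side copy-back of
mmap_sz/rq_off/cq_off only lands later in the series):

#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>		/* with this patch applied */

#define IORING_OFF_RBUF_RING	0x20000000ULL	/* hypothetical mmap offset */

static int setup_zc_rx_ifq(int ring_fd, __u32 if_idx, __u32 rxq_id)
{
	struct io_uring_zc_rx_ifq_reg reg;
	void *rbuf;

	memset(&reg, 0, sizeof(reg));
	reg.if_idx = if_idx;		/* netdev ifindex */
	reg.if_rxq_id = rxq_id;		/* hw rx queue to use for ZC Rx */
	reg.region_id = 0;		/* single pool region for now */
	reg.rq_entries = 256;
	reg.cq_entries = 256;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_ZC_RX_IFQ, &reg, 1))
		return -1;

	/* map both rbuf rings in one go using the returned size and offsets */
	rbuf = mmap(NULL, reg.mmap_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED, ring_fd, IORING_OFF_RBUF_RING);
	if (rbuf == MAP_FAILED)
		return -1;

	/*
	 * Refill rqes live at rbuf + reg.rq_off.rqes, completion cqes at
	 * rbuf + reg.cq_off.cqes; the kernel produces cqes, userspace
	 * consumes them and returns buffers via the refill ring.
	 */
	return 0;
}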

Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/linux/io_uring_types.h |   8 +++
 include/uapi/linux/io_uring.h  |  51 +++++++++++++++
 io_uring/Makefile              |   2 +-
 io_uring/io_uring.c            |  13 ++++
 io_uring/zc_rx.c               | 116 +++++++++++++++++++++++++++++++++
 io_uring/zc_rx.h               |  37 +++++++++++
 6 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 io_uring/zc_rx.c
 create mode 100644 io_uring/zc_rx.h

Comments

Jens Axboe Dec. 20, 2023, 4:13 p.m. UTC | #1
On 12/19/23 2:03 PM, David Wei wrote:
> @@ -750,6 +753,54 @@ enum {
>  	SOCKET_URING_OP_SETSOCKOPT,
>  };
>  
> +struct io_uring_rbuf_rqe {
> +	__u32	off;
> +	__u32	len;
> +	__u16	region;
> +	__u8	__pad[6];
> +};
> +
> +struct io_uring_rbuf_cqe {
> +	__u32	off;
> +	__u32	len;
> +	__u16	region;
> +	__u8	sock;
> +	__u8	flags;
> +	__u8	__pad[2];
> +};

Looks like this leaves a gap? Should be __pad[4] or probably just __u32
__pad; For all of these, definitely worth thinking about if we'll ever
need more than the slight padding. Might not hurt to always leave 8
bytes extra, outside of the required padding.
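
For reference, accounting the layout by hand (not actual pahole output), the
cqe as posted ends up with an implicit 2-byte tail hole:

struct io_uring_rbuf_cqe {
	__u32	off;		/* offset  0, size 4 */
	__u32	len;		/* offset  4, size 4 */
	__u16	region;		/* offset  8, size 2 */
	__u8	sock;		/* offset 10, size 1 */
	__u8	flags;		/* offset 11, size 1 */
	__u8	__pad[2];	/* offset 12, size 2 */
	/* 2 bytes of compiler tail padding: sizeof() == 16, align 4 */
};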

> +struct io_rbuf_rqring_offsets {
> +	__u32	head;
> +	__u32	tail;
> +	__u32	rqes;
> +	__u8	__pad[4];
> +};

Ditto here, __u32 __pad;

> +struct io_rbuf_cqring_offsets {
> +	__u32	head;
> +	__u32	tail;
> +	__u32	cqes;
> +	__u8	__pad[4];
> +};

And here.

> +
> +/*
> + * Argument for IORING_REGISTER_ZC_RX_IFQ
> + */
> +struct io_uring_zc_rx_ifq_reg {
> +	__u32	if_idx;
> +	/* hw rx descriptor ring id */
> +	__u32	if_rxq_id;
> +	__u32	region_id;
> +	__u32	rq_entries;
> +	__u32	cq_entries;
> +	__u32	flags;
> +	__u16	cpu;
> +
> +	__u32	mmap_sz;
> +	struct io_rbuf_rqring_offsets rq_off;
> +	struct io_rbuf_cqring_offsets cq_off;
> +};

You have rq_off starting at a 48-bit offset here, don't think this is
going to work as it's uapi. You'd need padding to align it to 64-bits.
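
Purely as an illustration of that suggestion (the added fields are
placeholders, not taken from any later revision), an explicitly padded
variant could look like:

struct io_uring_zc_rx_ifq_reg {
	__u32	if_idx;
	/* hw rx descriptor ring id */
	__u32	if_rxq_id;
	__u32	region_id;
	__u32	rq_entries;
	__u32	cq_entries;
	__u32	flags;
	__u16	cpu;
	__u16	__pad;		/* make the compiler's gap explicit */

	__u32	mmap_sz;	/* rq_off then starts at byte 32 */
	struct io_rbuf_rqring_offsets rq_off;
	struct io_rbuf_cqring_offsets cq_off;
	__u64	__resv;		/* spare room for future extensions */
};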

> diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
> new file mode 100644
> index 000000000000..5fc94cad5e3a
> --- /dev/null
> +++ b/io_uring/zc_rx.c
> +int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
> +			  struct io_uring_zc_rx_ifq_reg __user *arg)
> +{
> +	struct io_uring_zc_rx_ifq_reg reg;
> +	struct io_zc_rx_ifq *ifq;
> +	int ret;
> +
> +	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
> +		return -EINVAL;
> +	if (copy_from_user(&reg, arg, sizeof(reg)))
> +		return -EFAULT;
> +	if (ctx->ifq)
> +		return -EBUSY;
> +	if (reg.if_rxq_id == -1)
> +		return -EINVAL;
> +
> +	ifq = io_zc_rx_ifq_alloc(ctx);
> +	if (!ifq)
> +		return -ENOMEM;
> +
> +	/* TODO: initialise network interface */
> +
> +	ret = io_allocate_rbuf_ring(ifq, &reg);
> +	if (ret)
> +		goto err;
> +
> +	/* TODO: map zc region and initialise zc pool */
> +
> +	ifq->rq_entries = reg.rq_entries;
> +	ifq->cq_entries = reg.cq_entries;
> +	ifq->if_rxq_id = reg.if_rxq_id;
> +	ctx->ifq = ifq;

As these TODO's are removed in later patches, I think you should just
not include them to begin with. It reads more like notes to yourself,
doesn't really add anything to the series.

> +void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
> +{
> +	lockdep_assert_held(&ctx->uring_lock);
> +}

This is a bit odd?
Pavel Begunkov Dec. 20, 2023, 4:23 p.m. UTC | #2
On 12/20/23 16:13, Jens Axboe wrote:
> On 12/19/23 2:03 PM, David Wei wrote:
>> @@ -750,6 +753,54 @@ enum {
>>   	SOCKET_URING_OP_SETSOCKOPT,
>>   };
>>   
>> +struct io_uring_rbuf_rqe {
>> +	__u32	off;
>> +	__u32	len;
>> +	__u16	region;
>> +	__u8	__pad[6];
>> +};
>> +
>> +struct io_uring_rbuf_cqe {
>> +	__u32	off;
>> +	__u32	len;
>> +	__u16	region;
>> +	__u8	sock;
>> +	__u8	flags;
>> +	__u8	__pad[2];
>> +};
> 
> Looks like this leaves a gap? Should be __pad[4] or probably just __u32
> __pad; For all of these, definitely worth thinking about if we'll ever
> need more than the slight padding. Might not hurt to always leave 8
> bytes extra, outside of the required padding.

Good catch, and that all should be paholed to ensure all of them
are fitted nicely.

FWIW, the format will also be revisited, e.g. max 256 sockets per
ifq is too restrictive, and most probably moved from a separate queue
into the CQ.


>> +struct io_rbuf_rqring_offsets {
>> +	__u32	head;
>> +	__u32	tail;
>> +	__u32	rqes;
>> +	__u8	__pad[4];
>> +};
> 
> Ditto here, __u32 __pad;
> 
>> +struct io_rbuf_cqring_offsets {
>> +	__u32	head;
>> +	__u32	tail;
>> +	__u32	cqes;
>> +	__u8	__pad[4];
>> +};
> 
> And here.
> 
>> +
>> +/*
>> + * Argument for IORING_REGISTER_ZC_RX_IFQ
>> + */
>> +struct io_uring_zc_rx_ifq_reg {
>> +	__u32	if_idx;
>> +	/* hw rx descriptor ring id */
>> +	__u32	if_rxq_id;
>> +	__u32	region_id;
>> +	__u32	rq_entries;
>> +	__u32	cq_entries;
>> +	__u32	flags;
>> +	__u16	cpu;
>> +
>> +	__u32	mmap_sz;
>> +	struct io_rbuf_rqring_offsets rq_off;
>> +	struct io_rbuf_cqring_offsets cq_off;
>> +};
> 
> You have rq_off starting at a 48-bit offset here, don't think this is
> going to work as it's uapi. You'd need padding to align it to 64-bits.
> 
>> diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
>> new file mode 100644
>> index 000000000000..5fc94cad5e3a
>> --- /dev/null
>> +++ b/io_uring/zc_rx.c
>> +int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
>> +			  struct io_uring_zc_rx_ifq_reg __user *arg)
>> +{
>> +	struct io_uring_zc_rx_ifq_reg reg;
>> +	struct io_zc_rx_ifq *ifq;
>> +	int ret;
>> +
>> +	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
>> +		return -EINVAL;
>> +	if (copy_from_user(&reg, arg, sizeof(reg)))
>> +		return -EFAULT;
>> +	if (ctx->ifq)
>> +		return -EBUSY;
>> +	if (reg.if_rxq_id == -1)
>> +		return -EINVAL;
>> +
>> +	ifq = io_zc_rx_ifq_alloc(ctx);
>> +	if (!ifq)
>> +		return -ENOMEM;
>> +
>> +	/* TODO: initialise network interface */
>> +
>> +	ret = io_allocate_rbuf_ring(ifq, &reg);
>> +	if (ret)
>> +		goto err;
>> +
>> +	/* TODO: map zc region and initialise zc pool */
>> +
>> +	ifq->rq_entries = reg.rq_entries;
>> +	ifq->cq_entries = reg.cq_entries;
>> +	ifq->if_rxq_id = reg.if_rxq_id;
>> +	ctx->ifq = ifq;
> 
> As these TODO's are removed in later patches, I think you should just
> not include them to begin with. It reads more like notes to yourself,
> doesn't really add anything to the series.
> 
>> +void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
>> +{
>> +	lockdep_assert_held(&ctx->uring_lock);
>> +}
> 
> This is a bit odd?

Oh, this chunk actually leaked here from my rebases, which is not
a big deal as it provides the interface and a later patch implements
it, but might be better to move it there in the first place.
David Wei Dec. 21, 2023, 1:44 a.m. UTC | #3
On 2023-12-20 08:13, Jens Axboe wrote:
> On 12/19/23 2:03 PM, David Wei wrote:
>> @@ -750,6 +753,54 @@ enum {
>>  	SOCKET_URING_OP_SETSOCKOPT,
>>  };
>>  
>> +struct io_uring_rbuf_rqe {
>> +	__u32	off;
>> +	__u32	len;
>> +	__u16	region;
>> +	__u8	__pad[6];
>> +};
>> +
>> +struct io_uring_rbuf_cqe {
>> +	__u32	off;
>> +	__u32	len;
>> +	__u16	region;
>> +	__u8	sock;
>> +	__u8	flags;
>> +	__u8	__pad[2];
>> +};
> 
> Looks like this leaves a gap? Should be __pad[4] or probably just __u32
> __pad; For all of these, definitely worth thinking about if we'll ever
> need more than the slight padding. Might not hurt to always leave 8
> bytes extra, outside of the required padding.

Apologies, it's been a while since I last pahole'd these structs. We may
have added more fields later and reintroduced gaps.

> 
>> +struct io_rbuf_rqring_offsets {
>> +	__u32	head;
>> +	__u32	tail;
>> +	__u32	rqes;
>> +	__u8	__pad[4];
>> +};
> 
> Ditto here, __u32 __pad;
> 
>> +struct io_rbuf_cqring_offsets {
>> +	__u32	head;
>> +	__u32	tail;
>> +	__u32	cqes;
>> +	__u8	__pad[4];
>> +};
> 
> And here.
> 
>> +
>> +/*
>> + * Argument for IORING_REGISTER_ZC_RX_IFQ
>> + */
>> +struct io_uring_zc_rx_ifq_reg {
>> +	__u32	if_idx;
>> +	/* hw rx descriptor ring id */
>> +	__u32	if_rxq_id;
>> +	__u32	region_id;
>> +	__u32	rq_entries;
>> +	__u32	cq_entries;
>> +	__u32	flags;
>> +	__u16	cpu;
>> +
>> +	__u32	mmap_sz;
>> +	struct io_rbuf_rqring_offsets rq_off;
>> +	struct io_rbuf_cqring_offsets cq_off;
>> +};
> 
> You have rq_off starting at a 48-bit offset here, don't think this is
> going to work as it's uapi. You'd need padding to align it to 64-bits.

I will remove the io_rbuf_cqring in a future patchset which should
simplify things, but io_rbuf_rqring will stay. I'll make sure offsets
are 64-bit aligned.

> 
>> diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
>> new file mode 100644
>> index 000000000000..5fc94cad5e3a
>> --- /dev/null
>> +++ b/io_uring/zc_rx.c
>> +int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
>> +			  struct io_uring_zc_rx_ifq_reg __user *arg)
>> +{
>> +	struct io_uring_zc_rx_ifq_reg reg;
>> +	struct io_zc_rx_ifq *ifq;
>> +	int ret;
>> +
>> +	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
>> +		return -EINVAL;
>> +	if (copy_from_user(&reg, arg, sizeof(reg)))
>> +		return -EFAULT;
>> +	if (ctx->ifq)
>> +		return -EBUSY;
>> +	if (reg.if_rxq_id == -1)
>> +		return -EINVAL;
>> +
>> +	ifq = io_zc_rx_ifq_alloc(ctx);
>> +	if (!ifq)
>> +		return -ENOMEM;
>> +
>> +	/* TODO: initialise network interface */
>> +
>> +	ret = io_allocate_rbuf_ring(ifq, &reg);
>> +	if (ret)
>> +		goto err;
>> +
>> +	/* TODO: map zc region and initialise zc pool */
>> +
>> +	ifq->rq_entries = reg.rq_entries;
>> +	ifq->cq_entries = reg.cq_entries;
>> +	ifq->if_rxq_id = reg.if_rxq_id;
>> +	ctx->ifq = ifq;
> 
> As these TODO's are removed in later patches, I think you should just
> not include them to begin with. It reads more like notes to yourself,
> doesn't really add anything to the series.

Got it, will remove them.

> 
>> +void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
>> +{
>> +	lockdep_assert_held(&ctx->uring_lock);
>> +}
> 
> This is a bit odd?
>
Willem de Bruijn Dec. 21, 2023, 5:57 p.m. UTC | #4
David Wei wrote:
> From: David Wei <davidhwei@meta.com>
> 
> This patch introduces a new object in io_uring called an interface queue
> (ifq) which contains:
> 
> * A pool region allocated by userspace and registered w/ io_uring where
>   Rx data is written to.
> * A net device and one specific Rx queue in it that will be configured
>   for ZC Rx.
> * A pair of shared ringbuffers w/ userspace, dubbed registered buf
>   (rbuf) rings. Each entry contains a pool region id and an offset + len
>   within that region. The kernel writes entries into the completion ring
>   to tell userspace where RX data is relative to the start of a region.
>   Userspace writes entries into the refill ring to tell the kernel when
>   it is done with the data.
> 
> For now, each io_uring instance has a single ifq, and each ifq has a
> single pool region associated with one Rx queue.
> 
> Add a new opcode to io_uring_register that sets up an ifq. Size and
> offsets of shared ringbuffers are returned to userspace for it to mmap.
> The implementation will be added in a later patch.
> 
> Signed-off-by: David Wei <dw@davidwei.uk>

This is quite similar to AF_XDP, of course. Is it at all possible to
reuse all or some of that? If not, why not?

As a side effect, unification would also show a path of moving AF_XDP
from its custom allocator to the page_pool infra.

Related: what is the story wrt the process crashing while user memory
is posted to the NIC or present in the kernel stack.

SO_DEVMEM already demonstrates zerocopy into user buffers using udmabuf.
To a certain extent that and asynchronous I/O with io_uring are two
independent goals. SO_DEVMEM imposes limitations on the stack because
it might hold opaque device mem. That is too strong for this case.

But for this io_uring provider, is there anything io_uring specific about
it beyond being user memory? If not, maybe just call it a umem
provider, and anticipate it being usable for AF_XDP in the future too?

Besides delivery up to the intended socket, packets may also end up
in other code paths, such as packet sockets or forwarding. All of
this is simpler with userspace backed buffers than with device mem.
But good to call out explicitly how this is handled. MSG_ZEROCOPY
makes a deep packet copy in unexpected code paths, for instance. To
avoid indefinite latency to buffer reclaim.
Pavel Begunkov Dec. 30, 2023, 4:25 p.m. UTC | #5
On 12/21/23 17:57, Willem de Bruijn wrote:
> David Wei wrote:
>> From: David Wei <davidhwei@meta.com>
>>
>> This patch introduces a new object in io_uring called an interface queue
>> (ifq) which contains:
>>
>> * A pool region allocated by userspace and registered w/ io_uring where
>>    Rx data is written to.
>> * A net device and one specific Rx queue in it that will be configured
>>    for ZC Rx.
>> * A pair of shared ringbuffers w/ userspace, dubbed registered buf
>>    (rbuf) rings. Each entry contains a pool region id and an offset + len
>>    within that region. The kernel writes entries into the completion ring
>>    to tell userspace where RX data is relative to the start of a region.
>>    Userspace writes entries into the refill ring to tell the kernel when
>>    it is done with the data.
>>
>> For now, each io_uring instance has a single ifq, and each ifq has a
>> single pool region associated with one Rx queue.
>>
>> Add a new opcode to io_uring_register that sets up an ifq. Size and
>> offsets of shared ringbuffers are returned to userspace for it to mmap.
>> The implementation will be added in a later patch.
>>
>> Signed-off-by: David Wei <dw@davidwei.uk>
> 
> This is quite similar to AF_XDP, of course. Is it at all possible to
> reuse all or some of that? If not, why not?

Let me rather ask what you have in mind for reuse? I'm not too
intimately familiar with xdp, but I don't see what we can take.

Queue formats will be different, there won't be a separate CQ
for zc; they will all land in the main io_uring CQ in next revisions.
io_uring also supports multiple sockets per zc ifq and other quirks
reflected in the uapi.

Receive has to work with generic sockets and skbs if we want
to be able to reuse the protocol stack. Queue allocation and
mapping is similar, but that is one thing that should be bound to
the API (i.e. io_uring vs af xdp) together with locking and
synchronisation. Wakeups are different as well.

And IIUC AF_XDP still operates with raw packets quite early
in the stack, while io_uring completes from a syscall, which
would definitely make the synchronisation diverge a lot.

I don't see many opportunities here.

> As a side effect, unification would also show a path of moving AF_XDP
> from its custom allocator to the page_pool infra.

I assume it's about xsk_buff_alloc() and the likes of it. I'm lacking
knowledge here; I think it's much better to ask the XDP guys what they
think about moving to pp, whether it's needed, etc. And if so, it'd likely
be easier to base it on raw page pool providers api than the io_uring
provider implementation, probably having some common helpers if
things come to that.

> Related: what is the story wrt the process crashing while user memory
> is posted to the NIC or present in the kernel stack.

Buffers are pinned by io_uring. If the process crashes closing the
ring, io_uring will release the pp provider and wait for all buffers
to come back before unpinning pages and freeing the rest. I.e.
it's not going to unpin before pp's ->destroy is called.

> SO_DEVMEM already demonstrates zerocopy into user buffers using udmabuf.
> To a certain extent that and asynchronous I/O with io_uring are two
> independent goals. SO_DEVMEM imposes limitations on the stack because
> it might hold opaque device mem. That is too strong for this case.

Basing it onto ppiov simplifies refcounting a lot, with that we
don't need any dirty hacks nor adding any extra changes in the stack,
and I think it's aligned with the net stack goals. What I think
we can do on top is allowing ppiov's to optionally have pages
(via a callback ->get_page), and use it in those rare cases
when someone has to peek at the payload.

> But for this io_uring provider, is there anything io_uring specific about
> it beyond being user memory? If not, maybe just call it a umem
> provider, and anticipate it being usable for AF_XDP in the future too?

Queue formats with a set of features, synchronisation, mostly
answered above, but I also think it should be as easy to just have
a separate provider and reuse some code later if there is anything
to reuse.

> Besides delivery up to the intended socket, packets may also end up
> in other code paths, such as packet sockets or forwarding. All of
> this is simpler with userspace backed buffers than with device mem.
> But good to call out explicitly how this is handled. MSG_ZEROCOPY
> makes a deep packet copy in unexpected code paths, for instance. To
> avoid indefinite latency to buffer reclaim.

Yeah, that's concerning, I intend to add something for the sockets
we used, but there is nothing for truly unexpected paths. How does
devmem handle it?

It's probably not a huge worry for now, I expect killing the
task/sockets should resolve dependencies, but would be great to find
such scenarios. I'd appreciate any pointers if you have some in mind.
Willem de Bruijn Dec. 31, 2023, 10:25 p.m. UTC | #6
Pavel Begunkov wrote:
> On 12/21/23 17:57, Willem de Bruijn wrote:
> > David Wei wrote:
> >> From: David Wei <davidhwei@meta.com>
> >>
> >> This patch introduces a new object in io_uring called an interface queue
> >> (ifq) which contains:
> >>
> >> * A pool region allocated by userspace and registered w/ io_uring where
> >>    Rx data is written to.
> >> * A net device and one specific Rx queue in it that will be configured
> >>    for ZC Rx.
> >> * A pair of shared ringbuffers w/ userspace, dubbed registered buf
> >>    (rbuf) rings. Each entry contains a pool region id and an offset + len
> >>    within that region. The kernel writes entries into the completion ring
> >>    to tell userspace where RX data is relative to the start of a region.
> >>    Userspace writes entries into the refill ring to tell the kernel when
> >>    it is done with the data.
> >>
> >> For now, each io_uring instance has a single ifq, and each ifq has a
> >> single pool region associated with one Rx queue.
> >>
> >> Add a new opcode to io_uring_register that sets up an ifq. Size and
> >> offsets of shared ringbuffers are returned to userspace for it to mmap.
> >> The implementation will be added in a later patch.
> >>
> >> Signed-off-by: David Wei <dw@davidwei.uk>
> > 
> > This is quite similar to AF_XDP, of course. Is it at all possible to
> > reuse all or some of that? If not, why not?
> 
> Let me rather ask what you have in mind for reuse? I'm not too
> intimately familiar with xdp, but I don't see what we can take.

At a high level all points in this commit message:

	* A pool region allocated by userspace and registered w/ io_uring where
	  Rx data is written to.
	* A net device and one specific Rx queue in it that will be configured
	  for ZC Rx.
	* A pair of shared ringbuffers w/ userspace, dubbed registered buf
	  (rbuf) rings. Each entry contains a pool region id and an offset + len
	  within that region. The kernel writes entries into the completion ring
	  to tell userspace where RX data is relative to the start of a region.
	  Userspace writes entries into the refill ring to tell the kernel when
	  it is done with the data.

	For now, each io_uring instance has a single ifq, and each ifq has a
	single pool region associated with one Rx queue.

AF_XDP allows shared pools, but otherwise this sounds like the same
feature set.

> Queue formats will be different

I'd like to make sure that this is for a reason. Not just divergence
because we did not consider reusing existing user/kernel queue formats.

> there won't be a separate CQ
> for zc; they will all land in the main io_uring CQ in next revisions.

Okay, that's different.

> io_uring also supports multiple sockets per zc ifq and other quirks
> reflected in the uapi.
> 
> Receive has to work with generic sockets and skbs if we want
> to be able to reuse the protocol stack. Queue allocation and
> mapping is similar, but that is one thing that should be bound to
> the API (i.e. io_uring vs af xdp) together with locking and
> synchronisation. Wakeups are different as well.
> 
> And IIUC AF_XDP still operates with raw packets quite early
> in the stack, while io_uring completes from a syscall, which
> would definitely make the synchronisation diverge a lot.

The difference is in frame payload, not in the queue structure:
a fixed frame buffer pool plus sets of post + completion queues that
store a relative offset and length into that pool.
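
For comparison, AF_XDP's descriptor (from include/uapi/linux/if_xdp.h) next
to the rqe proposed in this patch; AF_XDP's fill and completion rings carry
bare __u64 addresses rather than structured entries:

struct xdp_desc {			/* AF_XDP rx/tx ring entry */
	__u64	addr;
	__u32	len;
	__u32	options;
};

struct io_uring_rbuf_rqe {		/* this series' refill ring entry */
	__u32	off;
	__u32	len;
	__u16	region;
	__u8	__pad[6];
};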

I don't intend to ask for the impossible, to be extra clear: If there
are reasons the structures need to be different, so be it. And no
intention to complicate development. Anything not ABI can be
refactored later, too, if overlap becomes clear. But for ABI it's
worth asking now whether these queue formats really are different for
a concrete reason.

> I don't see many opportunities here.
> 
> > As a side effect, unification would also show a path of moving AF_XDP
> > from its custom allocator to the page_pool infra.
> 
> I assume it's about xsk_buff_alloc() and the likes of it. I'm lacking
> knowledge here; I think it's much better to ask the XDP guys what they
> think about moving to pp, whether it's needed, etc. And if so, it'd likely
> be easier to base it on raw page pool providers api than the io_uring
> provider implementation, probably having some common helpers if
> things come to that.

Fair enough, on giving it some more thought and reviewing a recent
use case of the AF_XDP allocation APIs including xsk_buff_alloc.

> 
> > Related: what is the story wrt the process crashing while user memory
> > is posted to the NIC or present in the kernel stack.
> 
> Buffers are pinned by io_uring. If the process crashes closing the
> ring, io_uring will release the pp provider and wait for all buffers
> to come back before unpinning pages and freeing the rest. I.e.
> it's not going to unpin before pp's ->destroy is called.

Great. That's how all page pools work iirc. There is some potential
concern with unbound delay until all buffers are recycled. But that
is not unique to the io_uring provider.

> > SO_DEVMEM already demonstrates zerocopy into user buffers using udmabuf.
> > To a certain extent that and asynchronous I/O with io_uring are two
> > independent goals. SO_DEVMEM imposes limitations on the stack because
> > it might hold opaque device mem. That is too strong for this case.
> 
> Basing it onto ppiov simplifies refcounting a lot, with that we
> don't need any dirty hacks nor adding any extra changes in the stack,
> and I think it's aligned with the net stack goals.

Great to hear.

> What I think
> we can do on top is allowing ppiov's to optionally have pages
> (via a callback ->get_page), and use it in those rare cases
> when someone has to peek at the payload.
> 
> > But for this io_uring provider, is there anything io_uring specific about
> > it beyond being user memory? If not, maybe just call it a umem
> > provider, and anticipate it being usable for AF_XDP in the future too?
> 
> Queue formats with a set of features, synchronisation, mostly
> answered above, but I also think it should be as easy to just have
> a separate provider and reuse some code later if there is anything
> to reuse.
> 
> > Besides delivery up to the intended socket, packets may also end up
> > in other code paths, such as packet sockets or forwarding. All of
> > this is simpler with userspace backed buffers than with device mem.
> > But good to call out explicitly how this is handled. MSG_ZEROCOPY
> > makes a deep packet copy in unexpected code paths, for instance. To
> > avoid indefinite latency to buffer reclaim.
> 
> Yeah, that's concerning, I intend to add something for the sockets
> we used, but there is nothing for truly unexpected paths. How does
> devmem handle it?

MSG_ZEROCOPY handles this by copying to regular kernel memory using
skb_orphan_frags_rx whenever a tx packet could get looped onto an rx
queue and thus held indefinitely. This is not allowed for MSG_ZEROCOPY
as it causes a potentially unbound latency before data can be reused
by the application. Called from __netif_receive_skb_core,
dev_queue_xmit_nit and a few others.
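
Paraphrasing the pattern at those call sites (the helper name here is made
up; only skb_orphan_frags_rx() is the real interface):

static inline int deliver_to_rx_path(struct sk_buff *skb,
				     struct packet_type *pt_prev,
				     struct net_device *orig_dev)
{
	/*
	 * An skb carrying user-pinned MSG_ZEROCOPY frags is about to be
	 * handed to an rx consumer (packet socket, forwarding, ...).  Copy
	 * the frags into kernel memory first so the user's buffers are not
	 * held indefinitely; drop the skb if the copy fails.
	 */
	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
		return -ENOMEM;

	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}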

SO_DEVMEM does allow data to enter packet sockets, but instruments
each point that might reference memory to not do this. For instance:

	@@ -2156,7 +2156,7 @@  static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
			}
		}
	 
	-	snaplen = skb->len;
	+	snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);

https://patchwork.kernel.org/project/netdevbpf/patch/20231218024024.3516870-11-almasrymina@google.com/

Either approach could be extended to cover io_uring packets.

Multicast is perhaps an interesting other receive case. I have not
given that much thought.

> It's probably not a huge worry for now, I expect killing the
> task/sockets should resolve dependencies, but would be great to find
> such scenarios. I'd appreciate any pointers if you have some in mind.
> 
> -- 
> Pavel Begunkov

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index bebab36abce8..e87053b200f2 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -38,6 +38,8 @@  enum io_uring_cmd_flags {
 	IO_URING_F_COMPAT		= (1 << 12),
 };
 
+struct io_zc_rx_ifq;
+
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
@@ -182,6 +184,10 @@  struct io_rings {
 	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
+struct io_rbuf_ring {
+	struct io_uring		rq, cq;
+};
+
 struct io_restriction {
 	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
@@ -383,6 +389,8 @@  struct io_ring_ctx {
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
 
+	struct io_zc_rx_ifq		*ifq;
+
 	/* protected by ->uring_lock */
 	struct list_head		rsrc_ref_list;
 	struct io_alloc_cache		rsrc_node_cache;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f1c16f817742..024a6f79323b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -558,6 +558,9 @@  enum {
 	/* register a range of fixed file slots for automatic slot allocation */
 	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,
 
+	/* register a network interface queue for zerocopy */
+	IORING_REGISTER_ZC_RX_IFQ		= 26,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -750,6 +753,54 @@  enum {
 	SOCKET_URING_OP_SETSOCKOPT,
 };
 
+struct io_uring_rbuf_rqe {
+	__u32	off;
+	__u32	len;
+	__u16	region;
+	__u8	__pad[6];
+};
+
+struct io_uring_rbuf_cqe {
+	__u32	off;
+	__u32	len;
+	__u16	region;
+	__u8	sock;
+	__u8	flags;
+	__u8	__pad[2];
+};
+
+struct io_rbuf_rqring_offsets {
+	__u32	head;
+	__u32	tail;
+	__u32	rqes;
+	__u8	__pad[4];
+};
+
+struct io_rbuf_cqring_offsets {
+	__u32	head;
+	__u32	tail;
+	__u32	cqes;
+	__u8	__pad[4];
+};
+
+/*
+ * Argument for IORING_REGISTER_ZC_RX_IFQ
+ */
+struct io_uring_zc_rx_ifq_reg {
+	__u32	if_idx;
+	/* hw rx descriptor ring id */
+	__u32	if_rxq_id;
+	__u32	region_id;
+	__u32	rq_entries;
+	__u32	cq_entries;
+	__u32	flags;
+	__u16	cpu;
+
+	__u32	mmap_sz;
+	struct io_rbuf_rqring_offsets rq_off;
+	struct io_rbuf_cqring_offsets cq_off;
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/Makefile b/io_uring/Makefile
index e5be47e4fc3b..6c4b4ed37a1f 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -8,6 +8,6 @@  obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					statx.o net.o msg_ring.o timeout.o \
 					sqpoll.o fdinfo.o tctx.o poll.o \
 					cancel.o kbuf.o rsrc.o rw.o opdef.o \
-					notif.o waitid.o
+					notif.o waitid.o zc_rx.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FUTEX)		+= futex.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1d254f2c997d..7fff01d57e9e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -95,6 +95,7 @@ 
 #include "notif.h"
 #include "waitid.h"
 #include "futex.h"
+#include "zc_rx.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -2919,6 +2920,7 @@  static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		return;
 
 	mutex_lock(&ctx->uring_lock);
+	io_unregister_zc_rx_ifqs(ctx);
 	if (ctx->buf_data)
 		__io_sqe_buffers_unregister(ctx);
 	if (ctx->file_data)
@@ -3109,6 +3111,11 @@  static __cold void io_ring_exit_work(struct work_struct *work)
 			io_cqring_overflow_kill(ctx);
 			mutex_unlock(&ctx->uring_lock);
 		}
+		if (ctx->ifq) {
+			mutex_lock(&ctx->uring_lock);
+			io_shutdown_zc_rx_ifqs(ctx);
+			mutex_unlock(&ctx->uring_lock);
+		}
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
 			io_move_task_work_from_local(ctx);
@@ -4609,6 +4616,12 @@  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_file_alloc_range(ctx, arg);
 		break;
+	case IORING_REGISTER_ZC_RX_IFQ:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_zc_rx_ifq(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
new file mode 100644
index 000000000000..5fc94cad5e3a
--- /dev/null
+++ b/io_uring/zc_rx.c
@@ -0,0 +1,116 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#if defined(CONFIG_PAGE_POOL)
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "kbuf.h"
+#include "zc_rx.h"
+
+static int io_allocate_rbuf_ring(struct io_zc_rx_ifq *ifq,
+				 struct io_uring_zc_rx_ifq_reg *reg)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	size_t off, size, rq_size, cq_size;
+	void *ptr;
+
+	off = sizeof(struct io_rbuf_ring);
+	rq_size = reg->rq_entries * sizeof(struct io_uring_rbuf_rqe);
+	cq_size = reg->cq_entries * sizeof(struct io_uring_rbuf_cqe);
+	size = off + rq_size + cq_size;
+	ptr = (void *) __get_free_pages(gfp, get_order(size));
+	if (!ptr)
+		return -ENOMEM;
+	ifq->ring = (struct io_rbuf_ring *)ptr;
+	ifq->rqes = (struct io_uring_rbuf_rqe *)((char *)ptr + off);
+	ifq->cqes = (struct io_uring_rbuf_cqe *)((char *)ifq->rqes + rq_size);
+	return 0;
+}
+
+static void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq)
+{
+	if (ifq->ring)
+		folio_put(virt_to_folio(ifq->ring));
+}
+
+static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx)
+{
+	struct io_zc_rx_ifq *ifq;
+
+	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
+	if (!ifq)
+		return NULL;
+
+	ifq->if_rxq_id = -1;
+	ifq->ctx = ctx;
+	return ifq;
+}
+
+static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq)
+{
+	io_free_rbuf_ring(ifq);
+	kfree(ifq);
+}
+
+int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg)
+{
+	struct io_uring_zc_rx_ifq_reg reg;
+	struct io_zc_rx_ifq *ifq;
+	int ret;
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (ctx->ifq)
+		return -EBUSY;
+	if (reg.if_rxq_id == -1)
+		return -EINVAL;
+
+	ifq = io_zc_rx_ifq_alloc(ctx);
+	if (!ifq)
+		return -ENOMEM;
+
+	/* TODO: initialise network interface */
+
+	ret = io_allocate_rbuf_ring(ifq, &reg);
+	if (ret)
+		goto err;
+
+	/* TODO: map zc region and initialise zc pool */
+
+	ifq->rq_entries = reg.rq_entries;
+	ifq->cq_entries = reg.cq_entries;
+	ifq->if_rxq_id = reg.if_rxq_id;
+	ctx->ifq = ifq;
+
+	return 0;
+err:
+	io_zc_rx_ifq_free(ifq);
+	return ret;
+}
+
+void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+	struct io_zc_rx_ifq *ifq = ctx->ifq;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (!ifq)
+		return;
+
+	ctx->ifq = NULL;
+	io_zc_rx_ifq_free(ifq);
+}
+
+void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+	lockdep_assert_held(&ctx->uring_lock);
+}
+
+#endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
new file mode 100644
index 000000000000..aab57c1a4c5d
--- /dev/null
+++ b/io_uring/zc_rx.h
@@ -0,0 +1,37 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_ZC_RX_H
+#define IOU_ZC_RX_H
+
+struct io_zc_rx_ifq {
+	struct io_ring_ctx		*ctx;
+	struct net_device		*dev;
+	struct io_rbuf_ring		*ring;
+	struct io_uring_rbuf_rqe 	*rqes;
+	struct io_uring_rbuf_cqe 	*cqes;
+	u32				rq_entries;
+	u32				cq_entries;
+
+	/* hw rx descriptor ring id */
+	u32				if_rxq_id;
+};
+
+#if defined(CONFIG_PAGE_POOL)
+int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg);
+void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx);
+void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx);
+#else
+static inline int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+static inline void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+#endif
+
+#endif