Message ID: 20241204172204.4180482-10-dw@davidwei.uk (mailing list archive)
State: New
Series: io_uring zero copy rx
On Wed, Dec 04, 2024 at 09:21:48AM -0800, David Wei wrote:
> From: David Wei <davidhwei@meta.com>
>
> Add a new object called an interface queue (ifq) that represents a net
> rx queue that has been configured for zero copy. Each ifq is registered
> using a new registration opcode IORING_REGISTER_ZCRX_IFQ.
>
> The refill queue is allocated by the kernel and mapped by userspace
> using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main
> SQ/CQ. It is used by userspace to return buffers that it is done with,
> which will then be re-used by the netdev again.
>
> The main CQ ring is used to notify userspace of received data by using
> the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each
> entry contains the offset + len to the data.
>
> For now, each io_uring instance only has a single ifq.
>
> Signed-off-by: David Wei <dw@davidwei.uk>

...

> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c

...

> +int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
> +			 struct io_uring_zcrx_ifq_reg __user *arg)
> +{
> +	struct io_uring_zcrx_ifq_reg reg;
> +	struct io_uring_region_desc rd;
> +	struct io_zcrx_ifq *ifq;
> +	size_t ring_sz, rqes_sz;
> +	int ret;

...

> +	ring_sz = sizeof(struct io_uring);
> +	rqes_sz = sizeof(struct io_uring_zcrx_rqe) * ifq->rq_entries;

Hi David,

A minor nit from my side: rqes_sz is set but otherwise unused in this
function. Perhaps it can be removed?

Flagged by W=1 builds.

...
On 2024-12-06 08:05, Simon Horman wrote:
> On Wed, Dec 04, 2024 at 09:21:48AM -0800, David Wei wrote:
>> From: David Wei <davidhwei@meta.com>

...

>> +	ring_sz = sizeof(struct io_uring);
>> +	rqes_sz = sizeof(struct io_uring_zcrx_rqe) * ifq->rq_entries;
>
> Hi David,
>
> A minor nit from my side: rqes_sz is set but otherwise unused in this
> function. Perhaps it can be removed?
>
> Flagged by W=1 builds.

Hi Simon, thanks for flagging this, I'll remove it in the next version.
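To make the registration flow above concrete, here is a hypothetical userspace sketch of driving IORING_REGISTER_ZCRX_IFQ, written against only the uapi in this patch. This is not code from the thread: the raw syscall wrapper, the chosen sizes, and the placeholder for area_ptr are all assumptions (struct io_uring_zcrx_area_reg is only defined later in the series).

/*
 * Hypothetical sketch, not from the series: registering a zcrx ifq using
 * the uapi structs added by this patch. Values and the syscall wrapper
 * are assumptions.
 */
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int sys_io_uring_register(int fd, unsigned op, void *arg, unsigned nr_args)
{
	return (int)syscall(__NR_io_uring_register, fd, op, arg, nr_args);
}

/*
 * ring_fd must come from io_uring_setup() with IORING_SETUP_DEFER_TASKRUN
 * and IORING_SETUP_CQE32 set, or registration fails with -EINVAL.
 */
static int register_zcrx(int ring_fd, __u32 if_idx, __u32 if_rxq)
{
	struct io_uring_region_desc rd;
	struct io_uring_zcrx_ifq_reg reg;

	memset(&rd, 0, sizeof(rd));
	rd.size = 64 * 1024;		/* must cover the ring header plus rqes */

	memset(&reg, 0, sizeof(reg));
	reg.if_idx = if_idx;		/* netdev ifindex */
	reg.if_rxq = if_rxq;		/* hw rx queue to bind */
	reg.rq_entries = 256;		/* clamped and rounded by the kernel */
	reg.region_ptr = (uintptr_t)&rd;
	/*
	 * area_ptr is only checked for NULL by this patch; the struct it
	 * points at (io_uring_zcrx_area_reg) arrives later in the series,
	 * so this placeholder stands in for a real area registration.
	 */
	reg.area_ptr = (uintptr_t)&rd;

	return sys_io_uring_register(ring_fd, IORING_REGISTER_ZCRX_IFQ, &reg, 1);
}

On success the kernel writes the ring offsets back into reg.offsets and fills rd with the details userspace needs to mmap() the refill ring.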
On Wed, 4 Dec 2024 09:21:48 -0800 David Wei wrote:
> + depends on INET
Interesting, why INET? Just curious, in theory there shouldn't
be anything IP related in ZC.
On 12/10/24 03:49, Jakub Kicinski wrote:
> On Wed, 4 Dec 2024 09:21:48 -0800 David Wei wrote:
>> + depends on INET
>
> Interesting, why INET? Just curious, in theory there shouldn't
> be anything IP related in ZC.

Because of direct calls to tcp_read_sock(). With more protocols it
should be turned into a callback (or shared with splice).
On Tue, 10 Dec 2024 04:03:32 +0000 Pavel Begunkov wrote:
> On 12/10/24 03:49, Jakub Kicinski wrote:
>> On Wed, 4 Dec 2024 09:21:48 -0800 David Wei wrote:
>>> + depends on INET
>>
>> Interesting, why INET? Just curious, in theory there shouldn't
>> be anything IP related in ZC.
>
> Because of direct calls to tcp_read_sock(). With more protocols it
> should be turned into a callback (or shared with splice).

Ah, I guess that comes in later patches :S Makes sense.
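For context on the dependency Pavel is referring to: the receive path added later in the series feeds socket data through TCP's in-kernel read loop rather than anything IP-generic. The declaration below is the existing mainline API, shown for reference only.

/*
 * Existing mainline API from include/net/tcp.h: later patches in this
 * series drive it directly, which is why CONFIG_IO_URING_ZCRX has to
 * depend on INET for now. recv_actor is invoked for each chunk of
 * in-order payload.
 */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor);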
diff --git a/Kconfig b/Kconfig
index 745bc773f567..529ea7694ba9 100644
--- a/Kconfig
+++ b/Kconfig
@@ -30,3 +30,5 @@ source "lib/Kconfig"
 source "lib/Kconfig.debug"
 
 source "Documentation/Kconfig"
+
+source "io_uring/KConfig"
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 593c10a02144..fecd53544a93 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -40,6 +40,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_TASK_DEAD		= (1 << 13),
 };
 
+struct io_zcrx_ifq;
+
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
@@ -377,6 +379,8 @@ struct io_ring_ctx {
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
 
+	struct io_zcrx_ifq		*ifq;
+
 	u32				pers_next;
 	struct xarray			personalities;
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4418d0192959..552377a1e496 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -622,7 +622,8 @@ enum io_uring_register_op {
 	/* send MSG_RING without having a ring */
 	IORING_REGISTER_SEND_MSG_RING		= 31,
 
-	/* 32 reserved for zc rx */
+	/* register a netdev hw rx queue for zerocopy */
+	IORING_REGISTER_ZCRX_IFQ		= 32,
 
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
@@ -953,6 +954,46 @@ enum io_uring_socket_op {
 	SOCKET_URING_OP_SETSOCKOPT,
 };
 
+/* Zero copy receive refill queue entry */
+struct io_uring_zcrx_rqe {
+	__u64	off;
+	__u32	len;
+	__u32	__pad;
+};
+
+struct io_uring_zcrx_cqe {
+	__u64	off;
+	__u64	__pad;
+};
+
+/* The bit from which area id is encoded into offsets */
+#define IORING_ZCRX_AREA_SHIFT	48
+#define IORING_ZCRX_AREA_MASK	(~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
+
+struct io_uring_zcrx_offsets {
+	__u32	head;
+	__u32	tail;
+	__u32	rqes;
+	__u32	__resv2;
+	__u64	__resv[2];
+};
+
+/*
+ * Argument for IORING_REGISTER_ZCRX_IFQ
+ */
+struct io_uring_zcrx_ifq_reg {
+	__u32	if_idx;
+	__u32	if_rxq;
+	__u32	rq_entries;
+	__u32	flags;
+
+	__u64	area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
+	__u64	region_ptr; /* struct io_uring_region_desc * */
+
+	struct io_uring_zcrx_offsets offsets;
+	__u64	__resv[4];
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/KConfig b/io_uring/KConfig
new file mode 100644
index 000000000000..9e2a4beba1ef
--- /dev/null
+++ b/io_uring/KConfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# io_uring configuration
+#
+
+config IO_URING_ZCRX
+	def_bool y
+	depends on PAGE_POOL
+	depends on INET
+	depends on NET_RX_BUSY_POLL
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 53167bef37d7..a95b0b8229c9 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
 					epoll.o statx.o timeout.o fdinfo.o \
 					cancel.o waitid.o register.o \
 					truncate.o memmap.o
+obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FUTEX)		+= futex.o
 obj-$(CONFIG_NET_RX_BUSY_POLL)	+= napi.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 801293399883..a69d6afe62f6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -97,6 +97,7 @@
 #include "uring_cmd.h"
 #include "msg_ring.h"
 #include "memmap.h"
+#include "zcrx.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	mutex_lock(&ctx->uring_lock);
 	io_sqe_buffers_unregister(ctx);
 	io_sqe_files_unregister(ctx);
+	io_unregister_zcrx_ifqs(ctx);
 	io_cqring_overflow_kill(ctx);
 	io_eventfd_unregister(ctx);
 	io_alloc_cache_free(&ctx->apoll_cache, kfree);
@@ -2865,6 +2867,11 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 		io_cqring_overflow_kill(ctx);
 		mutex_unlock(&ctx->uring_lock);
 	}
+	if (ctx->ifq) {
+		mutex_lock(&ctx->uring_lock);
+		io_shutdown_zcrx_ifqs(ctx);
+		mutex_unlock(&ctx->uring_lock);
+	}
 
 	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
 		io_move_task_work_from_local(ctx);
diff --git a/io_uring/register.c b/io_uring/register.c
index 1a60f4916649..8c68465b4f4c 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -30,6 +30,7 @@
 #include "eventfd.h"
 #include "msg_ring.h"
 #include "memmap.h"
+#include "zcrx.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -803,6 +804,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clone_buffers(ctx, arg);
 		break;
+	case IORING_REGISTER_ZCRX_IFQ:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_zcrx_ifq(ctx, arg);
+		break;
 	case IORING_REGISTER_RESIZE_RINGS:
 		ret = -EINVAL;
 		if (!arg || nr_args != 1)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
new file mode 100644
index 000000000000..3e5644718f54
--- /dev/null
+++ b/io_uring/zcrx.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "kbuf.h"
+#include "memmap.h"
+#include "zcrx.h"
+
+#define IO_RQ_MAX_ENTRIES		32768
+
+static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
+				 struct io_uring_zcrx_ifq_reg *reg,
+				 struct io_uring_region_desc *rd)
+{
+	size_t off, size;
+	void *ptr;
+	int ret;
+
+	off = sizeof(struct io_uring);
+	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
+	if (size > rd->size)
+		return -EINVAL;
+
+	ret = io_create_region(ifq->ctx, &ifq->region, rd);
+	if (ret < 0)
+		return ret;
+
+	ptr = io_region_get_ptr(&ifq->region);
+	ifq->rq_ring = (struct io_uring *)ptr;
+	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
+	return 0;
+}
+
+static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
+{
+	io_free_region(ifq->ctx, &ifq->region);
+	ifq->rq_ring = NULL;
+	ifq->rqes = NULL;
+}
+
+static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
+{
+	struct io_zcrx_ifq *ifq;
+
+	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
+	if (!ifq)
+		return NULL;
+
+	ifq->if_rxq = -1;
+	ifq->ctx = ctx;
+	return ifq;
+}
+
+static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
+{
+	io_free_rbuf_ring(ifq);
+	kfree(ifq);
+}
+
+int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+			 struct io_uring_zcrx_ifq_reg __user *arg)
+{
+	struct io_uring_zcrx_ifq_reg reg;
+	struct io_uring_region_desc rd;
+	struct io_zcrx_ifq *ifq;
+	size_t ring_sz, rqes_sz;
+	int ret;
+
+	/*
+	 * 1. Interface queue allocation.
+	 * 2. It can observe data destined for sockets of other tasks.
+	 */
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* mandatory io_uring features for zc rx */
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
+	      ctx->flags & IORING_SETUP_CQE32))
+		return -EINVAL;
+	if (ctx->ifq)
+		return -EBUSY;
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+		return -EFAULT;
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+		return -EINVAL;
+	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
+		return -EINVAL;
+	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
+		if (!(ctx->flags & IORING_SETUP_CLAMP))
+			return -EINVAL;
+		reg.rq_entries = IO_RQ_MAX_ENTRIES;
+	}
+	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
+
+	if (!reg.area_ptr)
+		return -EFAULT;
+
+	ifq = io_zcrx_ifq_alloc(ctx);
+	if (!ifq)
+		return -ENOMEM;
+
+	ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
+	if (ret)
+		goto err;
+
+	ifq->rq_entries = reg.rq_entries;
+	ifq->if_rxq = reg.if_rxq;
+
+	ring_sz = sizeof(struct io_uring);
+	rqes_sz = sizeof(struct io_uring_zcrx_rqe) * ifq->rq_entries;
+	reg.offsets.rqes = ring_sz;
+	reg.offsets.head = offsetof(struct io_uring, head);
+	reg.offsets.tail = offsetof(struct io_uring, tail);
+
+	if (copy_to_user(arg, &reg, sizeof(reg)) ||
+	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	ctx->ifq = ifq;
+	return 0;
+err:
+	io_zcrx_ifq_free(ifq);
+	return ret;
+}
+
+void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+	struct io_zcrx_ifq *ifq = ctx->ifq;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (!ifq)
+		return;
+
+	ctx->ifq = NULL;
+	io_zcrx_ifq_free(ifq);
+}
+
+void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+	lockdep_assert_held(&ctx->uring_lock);
+}
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
new file mode 100644
index 000000000000..178c515fea04
--- /dev/null
+++ b/io_uring/zcrx.h
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_ZC_RX_H
+#define IOU_ZC_RX_H
+
+#include <linux/io_uring_types.h>
+
+struct io_zcrx_ifq {
+	struct io_ring_ctx		*ctx;
+	struct net_device		*dev;
+	struct io_uring			*rq_ring;
+	struct io_uring_zcrx_rqe	*rqes;
+	u32				rq_entries;
+
+	u32				if_rxq;
+
+	struct io_mapped_region		region;
+};
+
+#if defined(CONFIG_IO_URING_ZCRX)
+int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+			 struct io_uring_zcrx_ifq_reg __user *arg);
+void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
+void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
+#else
+static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+				       struct io_uring_zcrx_ifq_reg __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+#endif
+
+#endif
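Stepping back from the thread: the uapi above implies a simple single-producer protocol for the refill ring on the userspace side. The sketch below is hypothetical and not part of the patch — struct zcrx_rq, the mapping setup, the atomics idiom, and the assumption that rqe->off can carry back the same area-encoded offset as cqe->off are all mine; only the rqe/cqe layouts and the head/tail/rqes offsets come from the kernel.

/*
 * Hypothetical userspace view of the refill queue, assuming the ring
 * header and rqes array were mapped using the offsets the kernel wrote
 * back into reg.offsets at registration time.
 */
#include <stdatomic.h>
#include <linux/io_uring.h>

struct zcrx_rq {
	_Atomic __u32 *khead;		/* ring base + reg.offsets.head */
	_Atomic __u32 *ktail;		/* ring base + reg.offsets.tail */
	struct io_uring_zcrx_rqe *rqes;	/* ring base + reg.offsets.rqes */
	__u32 mask;			/* rq_entries - 1 (power of two) */
};

/*
 * Return one completed buffer to the kernel. cqe->off keeps the area id
 * in the IORING_ZCRX_AREA_MASK bits, so it is handed back unchanged.
 * Returns -1 if the refill ring is full.
 */
static int zcrx_recycle(struct zcrx_rq *rq,
			const struct io_uring_zcrx_cqe *cqe, __u32 len)
{
	__u32 tail = atomic_load_explicit(rq->ktail, memory_order_relaxed);
	__u32 head = atomic_load_explicit(rq->khead, memory_order_acquire);
	struct io_uring_zcrx_rqe *rqe;

	if (tail - head > rq->mask)
		return -1;		/* no space in the refill ring */

	rqe = &rq->rqes[tail & rq->mask];
	rqe->off = cqe->off;
	rqe->len = len;
	/* publish the entry before the kernel can observe the new tail */
	atomic_store_explicit(rq->ktail, tail + 1, memory_order_release);
	return 0;
}

The kernel side consumes entries at head; this patch only allocates the ring, with the actual buffer recycling wired up later in the series.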