Message ID | 20230826011954.1801099-5-dw@davidwei.uk (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Zero copy network RX using io_uring | expand |
On 8/25/23 6:19 PM, David Wei wrote: > diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c > index 6c57c9b06e05..8cc66731af5b 100644 > --- a/io_uring/zc_rx.c > +++ b/io_uring/zc_rx.c > @@ -10,6 +11,35 @@ > #include "kbuf.h" > #include "zc_rx.h" > > +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); > + > +static int __io_queue_mgmt(struct net_device *dev, struct io_zc_rx_ifq *ifq, > + u16 queue_id) > +{ > + struct netdev_bpf cmd; > + bpf_op_t ndo_bpf; > + > + ndo_bpf = dev->netdev_ops->ndo_bpf; This is not bpf related, so it seems wrong to be overloading this ndo. > + if (!ndo_bpf) > + return -EINVAL; > + > + cmd.command = XDP_SETUP_ZC_RX; > + cmd.zc_rx.ifq = ifq; > + cmd.zc_rx.queue_id = queue_id; > + > + return ndo_bpf(dev, &cmd); > +} > + > +static int io_open_zc_rxq(struct io_zc_rx_ifq *ifq) > +{ > + return __io_queue_mgmt(ifq->dev, ifq, ifq->if_rxq_id); > +} > + > +static int io_close_zc_rxq(struct io_zc_rx_ifq *ifq) > +{ > + return __io_queue_mgmt(ifq->dev, NULL, ifq->if_rxq_id); > +} > + > static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx) > { > struct io_zc_rx_ifq *ifq; > @@ -19,12 +49,17 @@ static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx) > return NULL; > > ifq->ctx = ctx; > + ifq->if_rxq_id = -1; > > return ifq; > } > > static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq) > { > + if (ifq->if_rxq_id != -1) > + io_close_zc_rxq(ifq); > + if (ifq->dev) > + dev_put(ifq->dev); > io_free_rbuf_ring(ifq); > kfree(ifq); > } > @@ -41,17 +76,22 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, > return -EFAULT; > if (ctx->ifq) > return -EBUSY; > + if (reg.if_rxq_id == -1) > + return -EINVAL; > > ifq = io_zc_rx_ifq_alloc(ctx); > if (!ifq) > return -ENOMEM; > > - /* TODO: initialise network interface */ > - > ret = io_allocate_rbuf_ring(ifq, ®); > if (ret) > goto err; > > + ret = -ENODEV; > + ifq->dev = dev_get_by_index(&init_net, reg.if_idx); What's the plan for other namespaces? Ideally the device bind comes from a socket and that gives you the namespace. > + if (!ifq->dev) > + goto err; > + > /* TODO: map zc region and initialise zc pool */ > > ifq->rq_entries = reg.rq_entries;
On 25/08/2023 19:26, David Ahern wrote: > On 8/25/23 6:19 PM, David Wei wrote: >> diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c >> index 6c57c9b06e05..8cc66731af5b 100644 >> --- a/io_uring/zc_rx.c >> +++ b/io_uring/zc_rx.c >> @@ -10,6 +11,35 @@ >> #include "kbuf.h" >> #include "zc_rx.h" >> >> +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); >> + >> +static int __io_queue_mgmt(struct net_device *dev, struct io_zc_rx_ifq *ifq, >> + u16 queue_id) >> +{ >> + struct netdev_bpf cmd; >> + bpf_op_t ndo_bpf; >> + >> + ndo_bpf = dev->netdev_ops->ndo_bpf; > > This is not bpf related, so it seems wrong to be overloading this ndo. Agreed. I believe the original author (Jonathan) was inspired by XDP_SETUP_XSK_POOL. I would also prefer a better way of setting up an RX queue for ZC. Mina's proposal I think uses netlink for this. Do you have any other suggestions? We want to keep resource registration inline, so we need to be able to call it from within io_uring. > > >> + if (!ndo_bpf) >> + return -EINVAL; >> + >> + cmd.command = XDP_SETUP_ZC_RX; >> + cmd.zc_rx.ifq = ifq; >> + cmd.zc_rx.queue_id = queue_id; >> + >> + return ndo_bpf(dev, &cmd); >> +} >> + >> +static int io_open_zc_rxq(struct io_zc_rx_ifq *ifq) >> +{ >> + return __io_queue_mgmt(ifq->dev, ifq, ifq->if_rxq_id); >> +} >> + >> +static int io_close_zc_rxq(struct io_zc_rx_ifq *ifq) >> +{ >> + return __io_queue_mgmt(ifq->dev, NULL, ifq->if_rxq_id); >> +} >> + >> static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx) >> { >> struct io_zc_rx_ifq *ifq; >> @@ -19,12 +49,17 @@ static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx) >> return NULL; >> >> ifq->ctx = ctx; >> + ifq->if_rxq_id = -1; >> >> return ifq; >> } >> >> static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq) >> { >> + if (ifq->if_rxq_id != -1) >> + io_close_zc_rxq(ifq); >> + if (ifq->dev) >> + dev_put(ifq->dev); >> io_free_rbuf_ring(ifq); >> kfree(ifq); >> } >> @@ -41,17 +76,22 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, >> return -EFAULT; >> if (ctx->ifq) >> return -EBUSY; >> + if (reg.if_rxq_id == -1) >> + return -EINVAL; >> >> ifq = io_zc_rx_ifq_alloc(ctx); >> if (!ifq) >> return -ENOMEM; >> >> - /* TODO: initialise network interface */ >> - >> ret = io_allocate_rbuf_ring(ifq, ®); >> if (ret) >> goto err; >> >> + ret = -ENODEV; >> + ifq->dev = dev_get_by_index(&init_net, reg.if_idx); > > What's the plan for other namespaces? Ideally the device bind comes from > a socket and that gives you the namespace. Sorry, I did not consider namespaces yet. I'll look into how namespaces work and then get back to you. > >> + if (!ifq->dev) >> + goto err; >> + >> /* TODO: map zc region and initialise zc pool */ >> >> ifq->rq_entries = reg.rq_entries; > >
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0b6c5508b1ca..f2ec0a454307 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3098,6 +3098,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); + io_unregister_zc_rx_ifq(ctx); if (ctx->rings) io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c index 6c57c9b06e05..8cc66731af5b 100644 --- a/io_uring/zc_rx.c +++ b/io_uring/zc_rx.c @@ -3,6 +3,7 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/io_uring.h> +#include <linux/netdevice.h> #include <uapi/linux/io_uring.h> @@ -10,6 +11,35 @@ #include "kbuf.h" #include "zc_rx.h" +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); + +static int __io_queue_mgmt(struct net_device *dev, struct io_zc_rx_ifq *ifq, + u16 queue_id) +{ + struct netdev_bpf cmd; + bpf_op_t ndo_bpf; + + ndo_bpf = dev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return -EINVAL; + + cmd.command = XDP_SETUP_ZC_RX; + cmd.zc_rx.ifq = ifq; + cmd.zc_rx.queue_id = queue_id; + + return ndo_bpf(dev, &cmd); +} + +static int io_open_zc_rxq(struct io_zc_rx_ifq *ifq) +{ + return __io_queue_mgmt(ifq->dev, ifq, ifq->if_rxq_id); +} + +static int io_close_zc_rxq(struct io_zc_rx_ifq *ifq) +{ + return __io_queue_mgmt(ifq->dev, NULL, ifq->if_rxq_id); +} + static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx) { struct io_zc_rx_ifq *ifq; @@ -19,12 +49,17 @@ static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx) return NULL; ifq->ctx = ctx; + ifq->if_rxq_id = -1; return ifq; } static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq) { + if (ifq->if_rxq_id != -1) + io_close_zc_rxq(ifq); + if (ifq->dev) + dev_put(ifq->dev); io_free_rbuf_ring(ifq); kfree(ifq); } @@ -41,17 +76,22 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, return -EFAULT; if (ctx->ifq) return -EBUSY; + if (reg.if_rxq_id == -1) + return -EINVAL; ifq = io_zc_rx_ifq_alloc(ctx); if (!ifq) return -ENOMEM; - /* TODO: initialise network interface */ - ret = io_allocate_rbuf_ring(ifq, ®); if (ret) goto err; + ret = -ENODEV; + ifq->dev = dev_get_by_index(&init_net, reg.if_idx); + if (!ifq->dev) + goto err; + /* TODO: map zc region and initialise zc pool */ ifq->rq_entries = reg.rq_entries; @@ -59,6 +99,10 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, ifq->if_rxq_id = reg.if_rxq_id; ctx->ifq = ifq; + ret = io_open_zc_rxq(ifq); + if (ret) + goto err; + ring_sz = sizeof(struct io_rbuf_ring); rqes_sz = sizeof(struct io_uring_rbuf_rqe) * ifq->rq_entries; cqes_sz = sizeof(struct io_uring_rbuf_cqe) * ifq->cq_entries; @@ -80,3 +124,17 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, io_zc_rx_ifq_free(ifq); return ret; } + +int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx) +{ + struct io_zc_rx_ifq *ifq; + + ifq = ctx->ifq; + if (!ifq) + return -EINVAL; + + ctx->ifq = NULL; + io_zc_rx_ifq_free(ifq); + + return 0; +} diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h index 4363734f3d98..340ececa9f9c 100644 --- a/io_uring/zc_rx.h +++ b/io_uring/zc_rx.h @@ -17,5 +17,6 @@ struct io_zc_rx_ifq { int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, struct io_uring_zc_rx_ifq_reg __user *arg); +int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx); #endif