Message ID: 20250212185859.3509616-5-dw@davidwei.uk (mailing list archive)
State: New
Series: io_uring zero copy rx
On Wed, Feb 12, 2025 at 10:59 AM David Wei <dw@davidwei.uk> wrote:
>
> From: Pavel Begunkov <asml.silence@gmail.com>
>
> Implement a page pool memory provider for io_uring to receive in a
> zero copy fashion. For that, the provider allocates user pages wrapped
> around into struct net_iovs, which are stored in a previously registered
> struct net_iov_area.
>
> Unlike the traditional receive, which frees pages and returns them back
> to the page pool right after data was copied to the user, e.g. inside
> recv(2), we extend the lifetime until the user space confirms that it's
> done processing the data. That's done by taking a net_iov reference.
> When the user is done with the buffer, it must return it back to the
> kernel by posting an entry into the refill ring, which is usually polled
> off the io_uring memory provider callback in the page pool's netmem
> allocation path.
>
> There is also a separate set of per net_iov "user" references accounting
> whether a buffer is currently given to the user (including possible
> fragmentation).
>
> Reviewed-by: Jens Axboe <axboe@kernel.dk>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> Signed-off-by: David Wei <dw@davidwei.uk>
> ---
>  io_uring/zcrx.c | 272 ++++++++++++++++++++++++++++++++++++++++++++++++
>  io_uring/zcrx.h |   3 +
>  2 files changed, 275 insertions(+)
>
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index 435cd634f91c..9d5c0479a285 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -2,10 +2,16 @@
>  #include <linux/kernel.h>
>  #include <linux/errno.h>
>  #include <linux/mm.h>
> +#include <linux/nospec.h>
>  #include <linux/io_uring.h>
>  #include <linux/netdevice.h>
>  #include <linux/rtnetlink.h>
>
> +#include <net/page_pool/helpers.h>
> +#include <net/page_pool/memory_provider.h>
> +#include <net/netdev_rx_queue.h>
> +#include <net/netlink.h>
> +
>  #include <uapi/linux/io_uring.h>
>
>  #include "io_uring.h"
> @@ -16,6 +22,33 @@
>
>  #define IO_RQ_MAX_ENTRIES	32768
>
> +__maybe_unused
> +static const struct memory_provider_ops io_uring_pp_zc_ops;
> +
> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
> +{
> +	struct net_iov_area *owner = net_iov_owner(niov);
> +
> +	return container_of(owner, struct io_zcrx_area, nia);
> +}
> +
> +static inline atomic_t *io_get_user_counter(struct net_iov *niov)
> +{
> +	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
> +
> +	return &area->user_refs[net_iov_idx(niov)];
> +}
> +
> +static bool io_zcrx_put_niov_uref(struct net_iov *niov)
> +{
> +	atomic_t *uref = io_get_user_counter(niov);
> +
> +	if (unlikely(!atomic_read(uref)))
> +		return false;
> +	atomic_dec(uref);
> +	return true;
> +}
> +
>  static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
>  				 struct io_uring_zcrx_ifq_reg *reg,
>  				 struct io_uring_region_desc *rd)
> @@ -51,6 +84,7 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
>  {
>  	kvfree(area->freelist);
>  	kvfree(area->nia.niovs);
> +	kvfree(area->user_refs);
>  	if (area->pages) {
>  		unpin_user_pages(area->pages, area->nia.num_niovs);
>  		kvfree(area->pages);
> @@ -106,6 +140,19 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
>  	for (i = 0; i < nr_pages; i++)
>  		area->freelist[i] = i;
>
> +	area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]),
> +					 GFP_KERNEL | __GFP_ZERO);
> +	if (!area->user_refs)
> +		goto err;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		struct net_iov *niov = &area->nia.niovs[i];
> +
> +		niov->owner = &area->nia;
> +		area->freelist[i] = i;
> +		atomic_set(&area->user_refs[i], 0);
> +	}
> +
>  	area->free_count = nr_pages;
>  	area->ifq = ifq;
>  	/* we're only supporting one area per ifq for now */
> @@ -131,6 +178,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
>  	ifq->if_rxq = -1;
>  	ifq->ctx = ctx;
>  	spin_lock_init(&ifq->lock);
> +	spin_lock_init(&ifq->rq_lock);
>  	return ifq;
>  }
>
> @@ -251,12 +299,236 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
>
>  	if (!ifq)
>  		return;
> +	if (WARN_ON_ONCE(ifq->area &&
> +			 ifq->area->free_count != ifq->area->nia.num_niovs))
>
>  	ctx->ifq = NULL;
>  	io_zcrx_ifq_free(ifq);
>  }
>
> +static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
> +{
> +	unsigned niov_idx;
> +
> +	lockdep_assert_held(&area->freelist_lock);
> +
> +	niov_idx = area->freelist[--area->free_count];
> +	return &area->nia.niovs[niov_idx];
> +}
> +
> +static void io_zcrx_return_niov_freelist(struct net_iov *niov)
> +{
> +	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
> +
> +	spin_lock_bh(&area->freelist_lock);
> +	area->freelist[area->free_count++] = net_iov_idx(niov);
> +	spin_unlock_bh(&area->freelist_lock);
> +}
> +
> +static void io_zcrx_return_niov(struct net_iov *niov)
> +{
> +	netmem_ref netmem = net_iov_to_netmem(niov);
> +
> +	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
> +}
> +
> +static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
> +{
> +	struct io_zcrx_area *area = ifq->area;
> +	int i;
> +
> +	if (!area)
> +		return;
> +
> +	/* Reclaim back all buffers given to the user space. */
> +	for (i = 0; i < area->nia.num_niovs; i++) {
> +		struct net_iov *niov = &area->nia.niovs[i];
> +		int nr;
> +
> +		if (!atomic_read(io_get_user_counter(niov)))
> +			continue;
> +		nr = atomic_xchg(io_get_user_counter(niov), 0);
> +		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
> +			io_zcrx_return_niov(niov);

I assume nr can be > 1? If it's always 1, then page_pool_put_netmem()
does the page_pool_unref_netmem() + page_pool_put_unrefed_netmem() a
bit more succinctly.

> +	}
> +}
> +
>  void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
>  {
>  	lockdep_assert_held(&ctx->uring_lock);
> +
> +	if (ctx->ifq)
> +		io_zcrx_scrub(ctx->ifq);
> +}
> +
> +static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
> +{
> +	u32 entries;
> +
> +	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
> +	return min(entries, ifq->rq_entries);
>  }
> +
> +static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
> +						 unsigned mask)
> +{
> +	unsigned int idx = ifq->cached_rq_head++ & mask;
> +
> +	return &ifq->rqes[idx];
> +}
> +
> +static void io_zcrx_ring_refill(struct page_pool *pp,
> +				struct io_zcrx_ifq *ifq)
> +{
> +	unsigned int mask = ifq->rq_entries - 1;
> +	unsigned int entries;
> +	netmem_ref netmem;
> +
> +	spin_lock_bh(&ifq->rq_lock);
> +
> +	entries = io_zcrx_rqring_entries(ifq);
> +	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
> +	if (unlikely(!entries)) {
> +		spin_unlock_bh(&ifq->rq_lock);
> +		return;
> +	}
> +
> +	do {
> +		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
> +		struct io_zcrx_area *area;
> +		struct net_iov *niov;
> +		unsigned niov_idx, area_idx;
> +
> +		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
> +		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;
> +
> +		if (unlikely(rqe->__pad || area_idx))
> +			continue;

nit: I believe a lot of the unlikely in the file are redundant. AFAIU
the compiler always treats the condition inside the if as unlikely by
default if there is no else statement.
> +		area = ifq->area;
> +
> +		if (unlikely(niov_idx >= area->nia.num_niovs))
> +			continue;
> +		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
> +
> +		niov = &area->nia.niovs[niov_idx];
> +		if (!io_zcrx_put_niov_uref(niov))
> +			continue;
> +
> +		netmem = net_iov_to_netmem(niov);
> +		if (page_pool_unref_netmem(netmem, 1) != 0)
> +			continue;
> +
> +		if (unlikely(niov->pp != pp)) {
> +			io_zcrx_return_niov(niov);
> +			continue;
> +		}
> +
> +		net_mp_netmem_place_in_cache(pp, netmem);
> +	} while (--entries);
> +
> +	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
> +	spin_unlock_bh(&ifq->rq_lock);
> +}
> +
> +static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
> +{
> +	struct io_zcrx_area *area = ifq->area;
> +
> +	spin_lock_bh(&area->freelist_lock);
> +	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
> +		struct net_iov *niov = __io_zcrx_get_free_niov(area);
> +		netmem_ref netmem = net_iov_to_netmem(niov);
> +
> +		net_mp_niov_set_page_pool(pp, niov);
> +		net_mp_netmem_place_in_cache(pp, netmem);
> +	}
> +	spin_unlock_bh(&area->freelist_lock);
> +}
> +
> +static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
> +{
> +	struct io_zcrx_ifq *ifq = pp->mp_priv;
> +
> +	/* pp should already be ensuring that */
> +	if (unlikely(pp->alloc.count))
> +		goto out_return;
> +

As the comment notes, this is a very defensive check that can be
removed. The pp should never invoke alloc_netmems if it has items in
the cache.

> +	io_zcrx_ring_refill(pp, ifq);
> +	if (likely(pp->alloc.count))
> +		goto out_return;
> +
> +	io_zcrx_refill_slow(pp, ifq);
> +	if (!pp->alloc.count)
> +		return 0;
> +out_return:
> +	return pp->alloc.cache[--pp->alloc.count];
> +}
> +
> +static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
> +{
> +	struct net_iov *niov;
> +
> +	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
> +		return false;
> +

Also a very defensive check that can be removed. There should be no
way for the pp to release a netmem to the provider that didn't come
from this provider. netmem should be guaranteed to be a net_iov, and
also an io_uring net_iov (not dma-buf one), and specifically be a
net_iov from this particular memory provider.

> +	niov = netmem_to_net_iov(netmem);
> +	net_mp_niov_clear_page_pool(niov);
> +	io_zcrx_return_niov_freelist(niov);
> +	return false;
> +}
> +
> +static int io_pp_zc_init(struct page_pool *pp)
> +{
> +	struct io_zcrx_ifq *ifq = pp->mp_priv;
> +
> +	if (WARN_ON_ONCE(!ifq))
> +		return -EINVAL;
> +	if (pp->dma_map)
> +		return -EOPNOTSUPP;

This condition should be flipped actually. pp->dma_map should be true,
otherwise the provider isn't supported.

From the netmem.rst docs we require that netmem page_pools are
configured with PP_FLAG_DMA_MAP.

And we actually check that pp->dma_map == true before invoking
mp_ops->init(). This code shouldn't be working unless I missed
something.

Also arguably this check is defensive. The pp should confirm that
pp->dma_map is true before invoking any memory provider, you should
assume it is true here (and the devmem provider doesn't check it
IIUC).
> + if (pp->p.order != 0) > + return -EOPNOTSUPP; > + > + percpu_ref_get(&ifq->ctx->refs); > + return 0; > +} > + > +static void io_pp_zc_destroy(struct page_pool *pp) > +{ > + struct io_zcrx_ifq *ifq = pp->mp_priv; > + > + percpu_ref_put(&ifq->ctx->refs); > +} > + > +static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, > + struct netdev_rx_queue *rxq) > +{ > + struct nlattr *nest; > + int type; > + > + type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING; > + nest = nla_nest_start(rsp, type); > + if (!nest) > + return -EMSGSIZE; > + nla_nest_end(rsp, nest); > + > + return 0; > +} > + > +static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) > +{ > + struct pp_memory_provider_params *p = &rxq->mp_params; > + struct io_zcrx_ifq *ifq = mp_priv; > + > + io_zcrx_drop_netdev(ifq); > + p->mp_ops = NULL; > + p->mp_priv = NULL; > +} > + > +static const struct memory_provider_ops io_uring_pp_zc_ops = { > + .alloc_netmems = io_pp_zc_alloc_netmems, > + .release_netmem = io_pp_zc_release_netmem, > + .init = io_pp_zc_init, > + .destroy = io_pp_zc_destroy, > + .nl_fill = io_pp_nl_fill, > + .uninstall = io_pp_uninstall, > +}; > diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h > index 595bca0001d2..6c808240ac91 100644 > --- a/io_uring/zcrx.h > +++ b/io_uring/zcrx.h > @@ -9,6 +9,7 @@ > struct io_zcrx_area { > struct net_iov_area nia; > struct io_zcrx_ifq *ifq; > + atomic_t *user_refs; > > u16 area_id; > struct page **pages; > @@ -26,6 +27,8 @@ struct io_zcrx_ifq { > struct io_uring *rq_ring; > struct io_uring_zcrx_rqe *rqes; > u32 rq_entries; > + u32 cached_rq_head; > + spinlock_t rq_lock; > > u32 if_rxq; > struct device *dev; > -- > 2.43.5 >
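
[Editor's note] The commit message describes the userspace half of the refill
contract only in prose: pack the area index into the high bits of the rqe's
off field and the byte offset within the area into the low bits, then publish
the new tail. Below is a minimal userspace sketch of that step, assuming the
series' uapi (struct io_uring_zcrx_rqe, IORING_ZCRX_AREA_SHIFT) and
hypothetical names for the mmap()ed ring pointers; the release store pairs
with the smp_load_acquire() in io_zcrx_rqring_entries() above.

	#include <stdatomic.h>
	#include <stdint.h>
	#include <linux/io_uring.h>	/* io_uring_zcrx_rqe, IORING_ZCRX_AREA_SHIFT */

	/* Hypothetical userspace view of the mmap()ed refill ring; the
	 * kernel counterpart is the ifq->rq_ring / ifq->rqes pair above. */
	struct zcrx_refill_ring {
		_Atomic uint32_t *khead;	/* advanced by io_zcrx_ring_refill() */
		_Atomic uint32_t *ktail;	/* advanced by userspace, below */
		struct io_uring_zcrx_rqe *rqes;
		uint32_t ring_entries;		/* power of two, == ifq->rq_entries */
	};

	/* Return one buffer: area_off is the byte offset of the buffer
	 * inside the registered area (area index 0, the only one for now). */
	static int zcrx_refill_buf(struct zcrx_refill_ring *ring, uint64_t area_off)
	{
		uint32_t tail = atomic_load_explicit(ring->ktail, memory_order_relaxed);
		uint32_t head = atomic_load_explicit(ring->khead, memory_order_acquire);
		struct io_uring_zcrx_rqe *rqe;

		if (tail - head == ring->ring_entries)
			return -1;		/* ring full, retry later */

		rqe = &ring->rqes[tail & (ring->ring_entries - 1)];
		/* Only fields io_zcrx_ring_refill() reads: off and __pad. */
		rqe->off = ((uint64_t)0 << IORING_ZCRX_AREA_SHIFT) | area_off;
		rqe->__pad = 0;			/* non-zero entries are skipped */

		/* Pairs with smp_load_acquire(&ifq->rq_ring->tail) in the kernel. */
		atomic_store_explicit(ring->ktail, tail + 1, memory_order_release);
		return 0;
	}

Note that io_zcrx_ring_refill() silently skips entries with a non-zero __pad
or area index, so zeroing __pad is part of the contract.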
On 2/13/25 20:57, Mina Almasry wrote:
...
>> +static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
>> +{
>> +	struct io_zcrx_area *area = ifq->area;
>> +	int i;
>> +
>> +	if (!area)
>> +		return;
>> +
>> +	/* Reclaim back all buffers given to the user space. */
>> +	for (i = 0; i < area->nia.num_niovs; i++) {
>> +		struct net_iov *niov = &area->nia.niovs[i];
>> +		int nr;
>> +
>> +		if (!atomic_read(io_get_user_counter(niov)))
>> +			continue;
>> +		nr = atomic_xchg(io_get_user_counter(niov), 0);
>> +		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
>> +			io_zcrx_return_niov(niov);
>
> I assume nr can be > 1?

Right

> If it's always 1, then page_pool_put_netmem()
> does the page_pool_unref_netmem() + page_pool_put_unrefed_netmem() a
> bit more succinctly.

...
>> +	entries = io_zcrx_rqring_entries(ifq);
>> +	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
>> +	if (unlikely(!entries)) {
>> +		spin_unlock_bh(&ifq->rq_lock);
>> +		return;
>> +	}
>> +
>> +	do {
>> +		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
>> +		struct io_zcrx_area *area;
>> +		struct net_iov *niov;
>> +		unsigned niov_idx, area_idx;
>> +
>> +		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
>> +		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;
>> +
>> +		if (unlikely(rqe->__pad || area_idx))
>> +			continue;
>
> nit: I believe a lot of the unlikely in the file are redundant. AFAIU
> the compiler always treats the condition inside the if as unlikely by
> default if there is no else statement.

That'd be too presumptuous of the compiler. Sections can be reshuffled,
but even without that, the code generation often looks different. The
annotation is in the right place.

...
>> +static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
>> +{
>> +	struct io_zcrx_ifq *ifq = pp->mp_priv;
>> +
>> +	/* pp should already be ensuring that */
>> +	if (unlikely(pp->alloc.count))
>> +		goto out_return;
>> +
>
> As the comment notes, this is a very defensive check that can be
> removed. The pp should never invoke alloc_netmems if it has items in
> the cache.

Maybe I'll kill it in the future, but it might be a good idea to
leave it be as even page_pool.c itself doesn't trust it too much,
see __page_pool_alloc_pages_slow().

>> +	io_zcrx_ring_refill(pp, ifq);
>> +	if (likely(pp->alloc.count))
>> +		goto out_return;
>> +
>> +	io_zcrx_refill_slow(pp, ifq);
>> +	if (!pp->alloc.count)
>> +		return 0;
>> +out_return:
>> +	return pp->alloc.cache[--pp->alloc.count];
>> +}
>> +
>> +static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
>> +{
>> +	struct net_iov *niov;
>> +
>> +	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
>> +		return false;
>> +
>
> Also a very defensive check that can be removed. There should be no
> way for the pp to release a netmem to the provider that didn't come

Agree, but it's a warning and I don't care about performance
of this chunk to that extent. Maybe we'll remove it later.

> from this provider. netmem should be guaranteed to be a net_iov, and

Not like it matters for now, but I wouldn't say it should be
net_iov, those callbacks were initially proposed for huge pages.

> also an io_uring net_iov (not dma-buf one), and specifically be a
> net_iov from this particular memory provider.
>
>> +	niov = netmem_to_net_iov(netmem);
>> +	net_mp_niov_clear_page_pool(niov);
>> +	io_zcrx_return_niov_freelist(niov);
>> +	return false;
>> +}
>> +
>> +static int io_pp_zc_init(struct page_pool *pp)
>> +{
>> +	struct io_zcrx_ifq *ifq = pp->mp_priv;
>> +
>> +	if (WARN_ON_ONCE(!ifq))
>> +		return -EINVAL;
>> +	if (pp->dma_map)
>> +		return -EOPNOTSUPP;
>
> This condition should be flipped actually. pp->dma_map should be true,
> otherwise the provider isn't supported.

It's not implemented in this patch, which is why it's rejected.
You can think of it as an unconditional failure, even though
io_pp_zc_init is not reachable just yet.

> From the netmem.rst docs we require that netmem page_pools are
> configured with PP_FLAG_DMA_MAP.
>
> And we actually check that pp->dma_map == true before invoking
> mp_ops->init(). This code shouldn't be working unless I missed
> something.
>
> Also arguably this check is defensive. The pp should confirm that

Sure, and I have nothing against defensive checks in cold paths

> pp->dma_map is true before invoking any memory provider, you should
> assume it is true here (and the devmem provider doesn't check it
> IIUC).
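
[Editor's note] For readers following the unlikely() exchange above: the
kernel's annotations are thin wrappers around the compiler builtin, and
Pavel's point is that without the hint the compiler makes no promise about
which arm becomes the straight-line fall-through. A minimal illustration
(macro definitions as in include/linux/compiler.h; process() is a made-up
example):

	/* The kernel's branch-prediction hints, from include/linux/compiler.h. */
	#define likely(x)	__builtin_expect(!!(x), 1)
	#define unlikely(x)	__builtin_expect(!!(x), 0)

	/* With the hint, the compiler keeps the common case as the hot
	 * fall-through path and may move the rare branch into a cold,
	 * out-of-line section; without it, nothing forces that layout,
	 * even for a short body-less "if (err) return". */
	int process(int err)
	{
		if (unlikely(err))
			return -1;	/* cold path */
		return 0;		/* hot fall-through */
	}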
On Thu, Feb 13, 2025 at 2:36 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 2/13/25 20:57, Mina Almasry wrote:
...
> >> +static int io_pp_zc_init(struct page_pool *pp)
> >> +{
> >> +	struct io_zcrx_ifq *ifq = pp->mp_priv;
> >> +
> >> +	if (WARN_ON_ONCE(!ifq))
> >> +		return -EINVAL;
> >> +	if (pp->dma_map)
> >> +		return -EOPNOTSUPP;
> >
> > This condition should be flipped actually. pp->dma_map should be true,
> > otherwise the provider isn't supported.
>
> It's not implemented in this patch, which is why it's rejected.
> You can think of it as an unconditional failure, even though
> io_pp_zc_init is not reachable just yet.
>

Ah, I see in the follow up patch you flip the condition, that's fine then.

I usually see defensive checks get rejected but I don't see that
blocking this series, so FWIW:

Reviewed-by: Mina Almasry <almasrymina@google.com>
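
[Editor's note] The flipped condition Mina refers to would make a DMA-mapping
page pool a requirement rather than grounds for rejection. A sketch of that
assumed final shape (the actual follow-up hunk is not part of this thread):

	/* Assumed shape of the flipped check: the provider requires a
	 * PP_FLAG_DMA_MAP pool instead of rejecting one. Sketch only;
	 * see the follow-up patch in the series for the real hunk. */
	static int io_pp_zc_init(struct page_pool *pp)
	{
		struct io_zcrx_ifq *ifq = pp->mp_priv;

		if (WARN_ON_ONCE(!ifq))
			return -EINVAL;
		if (!pp->dma_map)
			return -EOPNOTSUPP;
		if (pp->p.order != 0)
			return -EOPNOTSUPP;

		percpu_ref_get(&ifq->ctx->refs);
		return 0;
	}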
On 2/13/25 22:46, Mina Almasry wrote:
> On Thu, Feb 13, 2025 at 2:36 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
...
>>>> +	if (pp->dma_map)
>>>> +		return -EOPNOTSUPP;
>>>
>>> This condition should be flipped actually. pp->dma_map should be true,
>>> otherwise the provider isn't supported.
>>
>> It's not implemented in this patch, which is why it's rejected.
>> You can think of it as an unconditional failure, even though
>> io_pp_zc_init is not reachable just yet.
>
> Ah, I see in the follow up patch you flip the condition, that's fine then.
>
> I usually see defensive checks get rejected but I don't see that

Yeah, sounds like that's the rule in net/.

> blocking this series, so FWIW:
>
> Reviewed-by: Mina Almasry <almasrymina@google.com>

Thanks for the review!
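
[Editor's note] One loose end from earlier in the thread: since nr can be
greater than 1, the scrub path must unref by nr explicitly. Had the user
count been capped at one, the unref-then-return pair could collapse into the
helper Mina mentioned. A hypothetical variant, assuming page_pool_put_netmem()
mirrors page_pool_put_page()'s (pool, netmem, dma_sync_size, allow_direct)
signature:

	/* Hypothetical single-ref variant of io_zcrx_scrub(), valid only if
	 * the user refcount were always 0 or 1 -- the thread confirms it is
	 * not. page_pool_put_netmem() folds page_pool_unref_netmem(netmem, 1)
	 * plus the put of the last reference into one call. */
	static void io_zcrx_scrub_single_ref(struct io_zcrx_ifq *ifq)
	{
		struct io_zcrx_area *area = ifq->area;
		int i;

		if (!area)
			return;

		for (i = 0; i < area->nia.num_niovs; i++) {
			struct net_iov *niov = &area->nia.niovs[i];

			if (!atomic_xchg(io_get_user_counter(niov), 0))
				continue;
			page_pool_put_netmem(niov->pp, net_iov_to_netmem(niov),
					     -1, false);
		}
	}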