diff mbox series

[v1,11/15] io_uring/zcrx: implement zerocopy receive pp memory provider

Message ID 20241007221603.1703699-12-dw@davidwei.uk (mailing list archive)
State New
Headers show
Series io_uring zero copy rx | expand

Commit Message

David Wei Oct. 7, 2024, 10:15 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

Implement a page pool memory provider for io_uring to receieve in a
zero copy fashion. For that, the provider allocates user pages wrapped
around into struct net_iovs, that are stored in a previously registered
struct net_iov_area.

Unlike with traditional receives, for which pages from a page pool can
be deallocated right after the user receives data, e.g. via recv(2),
we extend the lifetime by recycling buffers only after the user space
acknowledges that it's done processing the data via the refill queue.
Before handing buffers to the user, we mark them by bumping the refcount
by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
returned back. When the corresponding io_uring instance and/or page pool
are destroyed, we'll force back all buffers that are currently in the
user space in ->io_pp_zc_scrub by clearing the bias.

Refcounting and lifetime:

Initially, all buffers are considered unallocated and stored in
->freelist, at which point they are not yet directly exposed to the core
page pool code and not accounted to page pool's pages_state_hold_cnt.
The ->alloc_netmems callback will allocate them by placing into the
page pool's cache, setting the refcount to 1 as usual and adjusting
pages_state_hold_cnt.

Then, either the buffer is dropped and returns back to the page pool
into the ->freelist via io_pp_zc_release_netmem, in which case the page
pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
likely the buffer will go through the network/protocol stacks and end up
in the corresponding socket's receive queue. From there the user can get
it via an new io_uring request implemented in following patches. As
mentioned above, before giving a buffer to the user we bump the refcount
by IO_ZC_RX_UREF.

Once the user is done with the buffer processing, it must return it back
via the refill queue, from where our ->alloc_netmems implementation can
grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
there are no more users left. As we place such buffers right back into
the page pools fast cache and they didn't go through the normal pp
release path, they are still considered "allocated" and no pp hold_cnt
is required. For the same reason we dma sync buffers for the device
in io_zc_add_pp_cache().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/linux/io_uring/net.h |   5 +
 io_uring/zcrx.c              | 229 +++++++++++++++++++++++++++++++++++
 io_uring/zcrx.h              |   6 +
 3 files changed, 240 insertions(+)

Comments

Jens Axboe Oct. 9, 2024, 6:10 p.m. UTC | #1
Looks good to me.
Mina Almasry Oct. 9, 2024, 10:01 p.m. UTC | #2
On Mon, Oct 7, 2024 at 3:16 PM David Wei <dw@davidwei.uk> wrote:
>
> From: Pavel Begunkov <asml.silence@gmail.com>
>
> Implement a page pool memory provider for io_uring to receieve in a
> zero copy fashion. For that, the provider allocates user pages wrapped
> around into struct net_iovs, that are stored in a previously registered
> struct net_iov_area.
>
> Unlike with traditional receives, for which pages from a page pool can
> be deallocated right after the user receives data, e.g. via recv(2),
> we extend the lifetime by recycling buffers only after the user space
> acknowledges that it's done processing the data via the refill queue.
> Before handing buffers to the user, we mark them by bumping the refcount
> by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
> returned back. When the corresponding io_uring instance and/or page pool
> are destroyed, we'll force back all buffers that are currently in the
> user space in ->io_pp_zc_scrub by clearing the bias.
>

This is an interesting design choice. In my experience the page_pool
works the opposite way, i.e. all the netmems in it are kept alive
until the user is done with them. Deviating from that requires custom
behavior (->scrub), which may be fine, but why do this? Isn't it
better for uapi perspective to keep the memory alive until the user is
done with it?

> Refcounting and lifetime:
>
> Initially, all buffers are considered unallocated and stored in
> ->freelist, at which point they are not yet directly exposed to the core
> page pool code and not accounted to page pool's pages_state_hold_cnt.
> The ->alloc_netmems callback will allocate them by placing into the
> page pool's cache, setting the refcount to 1 as usual and adjusting
> pages_state_hold_cnt.
>
> Then, either the buffer is dropped and returns back to the page pool
> into the ->freelist via io_pp_zc_release_netmem, in which case the page
> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
> likely the buffer will go through the network/protocol stacks and end up
> in the corresponding socket's receive queue. From there the user can get
> it via an new io_uring request implemented in following patches. As
> mentioned above, before giving a buffer to the user we bump the refcount
> by IO_ZC_RX_UREF.
>
> Once the user is done with the buffer processing, it must return it back
> via the refill queue, from where our ->alloc_netmems implementation can
> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
> there are no more users left. As we place such buffers right back into
> the page pools fast cache and they didn't go through the normal pp
> release path, they are still considered "allocated" and no pp hold_cnt
> is required.

Why is this needed? In general the provider is to allocate free memory
and logic as to where the memory should go (to fast cache, to normal
pp release path, etc) should remain in provider agnostic code paths in
the page_pool. Not maintainable IMO in the long run to have individual
pp providers customizing non-provider specific code or touching pp
private structs.

> For the same reason we dma sync buffers for the device
> in io_zc_add_pp_cache().
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> Signed-off-by: David Wei <dw@davidwei.uk>
> ---
>  include/linux/io_uring/net.h |   5 +
>  io_uring/zcrx.c              | 229 +++++++++++++++++++++++++++++++++++
>  io_uring/zcrx.h              |   6 +
>  3 files changed, 240 insertions(+)
>
> diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
> index b58f39fed4d5..610b35b451fd 100644
> --- a/include/linux/io_uring/net.h
> +++ b/include/linux/io_uring/net.h
> @@ -5,6 +5,11 @@
>  struct io_uring_cmd;
>
>  #if defined(CONFIG_IO_URING)
> +
> +#if defined(CONFIG_PAGE_POOL)
> +extern const struct memory_provider_ops io_uring_pp_zc_ops;
> +#endif
> +
>  int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
>
>  #else
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index 8382129402ac..6cd3dee8b90a 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -2,7 +2,11 @@
>  #include <linux/kernel.h>
>  #include <linux/errno.h>
>  #include <linux/mm.h>
> +#include <linux/nospec.h>
> +#include <linux/netdevice.h>
>  #include <linux/io_uring.h>
> +#include <net/page_pool/helpers.h>
> +#include <trace/events/page_pool.h>
>
>  #include <uapi/linux/io_uring.h>
>
> @@ -16,6 +20,13 @@
>
>  #if defined(CONFIG_PAGE_POOL) && defined(CONFIG_INET)
>
> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
> +{
> +       struct net_iov_area *owner = net_iov_owner(niov);
> +
> +       return container_of(owner, struct io_zcrx_area, nia);

Similar to other comment in the other patch, why are we sure this
doesn't return garbage (i.e. it's accidentally called on a dmabuf
net_iov?)

> +}
> +
>  static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
>                                  struct io_uring_zcrx_ifq_reg *reg)
>  {
> @@ -101,6 +112,9 @@ static int io_zcrx_create_area(struct io_ring_ctx *ctx,
>                 goto err;
>
>         for (i = 0; i < nr_pages; i++) {
> +               struct net_iov *niov = &area->nia.niovs[i];
> +
> +               niov->owner = &area->nia;
>                 area->freelist[i] = i;
>         }
>
> @@ -233,4 +247,219 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
>         lockdep_assert_held(&ctx->uring_lock);
>  }
>
> +static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
> +{
> +       return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
> +}
> +
> +static bool io_zcrx_put_niov_uref(struct net_iov *niov)
> +{
> +       if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
> +               return false;
> +
> +       return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
> +}
> +
> +static inline void io_zc_add_pp_cache(struct page_pool *pp,
> +                                     struct net_iov *niov)
> +{
> +       netmem_ref netmem = net_iov_to_netmem(niov);
> +
> +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
> +       if (pp->dma_sync && dma_dev_need_sync(pp->p.dev)) {

IIRC we force that dma_sync == true for memory providers, unless you
changed that and I missed it.

> +               dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
> +
> +               dma_sync_single_range_for_device(pp->p.dev, dma_addr,
> +                                                pp->p.offset, pp->p.max_len,
> +                                                pp->p.dma_dir);
> +       }
> +#endif
> +
> +       page_pool_fragment_netmem(netmem, 1);
> +       pp->alloc.cache[pp->alloc.count++] = netmem;

IMO touching pp internals in a provider should not be acceptable.

pp->alloc.cache is a data structure private to the page_pool and
should not be touched at all by any specific memory provider. Not
maintainable in the long run tbh for individual pp providers to mess
with pp private structs and we hunt for bugs that are reproducible
with 1 pp provider or another, or have to deal with the mental strain
of provider specific handling in what is supposed to be generic
page_pool paths.

IMO the provider must implement the 4 'ops' (alloc, free, init,
destroy) and must not touch pp privates while doing so. If you need to
change how pp recycling works then it needs to be done in a provider
agnostic way.

I think both the dmabuf provider and Jakub's huge page provider both
implemented the ops while never touching pp internals. I wonder if we
can follow this lead.

--
Thanks,
Mina
Pavel Begunkov Oct. 9, 2024, 10:58 p.m. UTC | #3
On 10/9/24 23:01, Mina Almasry wrote:
> On Mon, Oct 7, 2024 at 3:16 PM David Wei <dw@davidwei.uk> wrote:
>>
>> From: Pavel Begunkov <asml.silence@gmail.com>
>>
>> Implement a page pool memory provider for io_uring to receieve in a
>> zero copy fashion. For that, the provider allocates user pages wrapped
>> around into struct net_iovs, that are stored in a previously registered
>> struct net_iov_area.
>>
>> Unlike with traditional receives, for which pages from a page pool can
>> be deallocated right after the user receives data, e.g. via recv(2),
>> we extend the lifetime by recycling buffers only after the user space
>> acknowledges that it's done processing the data via the refill queue.
>> Before handing buffers to the user, we mark them by bumping the refcount
>> by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
>> returned back. When the corresponding io_uring instance and/or page pool
>> are destroyed, we'll force back all buffers that are currently in the
>> user space in ->io_pp_zc_scrub by clearing the bias.
>>
> 
> This is an interesting design choice. In my experience the page_pool
> works the opposite way, i.e. all the netmems in it are kept alive
> until the user is done with them. Deviating from that requires custom
> behavior (->scrub), which may be fine, but why do this? Isn't it
> better for uapi perspective to keep the memory alive until the user is
> done with it?

It's hardly interesting, it's _exactly_ the same thing devmem TCP
does by attaching the lifetime of buffers to a socket's xarray,
which requires custom behaviour. Maybe I wasn't clear on one thing
though, it's accounting from the page pool's perspective. Those are
user pages, likely still mapped into the user space, in which case
they're not going to be destroyed.

>> Refcounting and lifetime:
>>
>> Initially, all buffers are considered unallocated and stored in
>> ->freelist, at which point they are not yet directly exposed to the core
>> page pool code and not accounted to page pool's pages_state_hold_cnt.
>> The ->alloc_netmems callback will allocate them by placing into the
>> page pool's cache, setting the refcount to 1 as usual and adjusting
>> pages_state_hold_cnt.
>>
>> Then, either the buffer is dropped and returns back to the page pool
>> into the ->freelist via io_pp_zc_release_netmem, in which case the page
>> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
>> likely the buffer will go through the network/protocol stacks and end up
>> in the corresponding socket's receive queue. From there the user can get
>> it via an new io_uring request implemented in following patches. As
>> mentioned above, before giving a buffer to the user we bump the refcount
>> by IO_ZC_RX_UREF.
>>
>> Once the user is done with the buffer processing, it must return it back
>> via the refill queue, from where our ->alloc_netmems implementation can
>> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
>> there are no more users left. As we place such buffers right back into
>> the page pools fast cache and they didn't go through the normal pp
>> release path, they are still considered "allocated" and no pp hold_cnt
>> is required.
> 
> Why is this needed? In general the provider is to allocate free memory

I don't get it, what "this"? If it's refill queue, that's because
I don't like actively returning buffers back via syscall / setsockopt
and trying to transfer them into the napi context (i.e.
napi_pp_put_page) hoping it works / cached well.

If "this" is IO_ZC_RX_UREF, it's because we need to track when a
buffer is given to the userspace, and I don't think some kind of
map / xarray in the hot path is the best for performance solution.

> and logic as to where the memory should go (to fast cache, to normal
> pp release path, etc) should remain in provider agnostic code paths in
> the page_pool. Not maintainable IMO in the long run to have individual

Please do elaborate what exactly is not maintainable here

> pp providers customizing non-provider specific code or touching pp
> private structs.

...
>> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
>> index 8382129402ac..6cd3dee8b90a 100644
>> --- a/io_uring/zcrx.c
>> +++ b/io_uring/zcrx.c
>> @@ -2,7 +2,11 @@
...
>> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
>> +{
>> +       struct net_iov_area *owner = net_iov_owner(niov);
>> +
>> +       return container_of(owner, struct io_zcrx_area, nia);
> 
> Similar to other comment in the other patch, why are we sure this
> doesn't return garbage (i.e. it's accidentally called on a dmabuf
> net_iov?)

There couldn't be any net_iov at this point not belonging to
the current io_uring instance / etc. Same with devmem TCP,
devmem callbacks can't be called for some random net_iov, the
only place you need to explicitly check is where it comes
from generic path to a devmem aware path like that patched
chunk in tcp.c

>> +static inline void io_zc_add_pp_cache(struct page_pool *pp,
>> +                                     struct net_iov *niov)
>> +{
>> +       netmem_ref netmem = net_iov_to_netmem(niov);
>> +
>> +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
>> +       if (pp->dma_sync && dma_dev_need_sync(pp->p.dev)) {
> 
> IIRC we force that dma_sync == true for memory providers, unless you
> changed that and I missed it.

I'll take a look, might remove it.

>> +               dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
>> +
>> +               dma_sync_single_range_for_device(pp->p.dev, dma_addr,
>> +                                                pp->p.offset, pp->p.max_len,
>> +                                                pp->p.dma_dir);
>> +       }
>> +#endif
>> +
>> +       page_pool_fragment_netmem(netmem, 1);
>> +       pp->alloc.cache[pp->alloc.count++] = netmem;
> 
> IMO touching pp internals in a provider should not be acceptable.

Ok, I can add a page pool helper for that.

> pp->alloc.cache is a data structure private to the page_pool and
> should not be touched at all by any specific memory provider. Not
> maintainable in the long run tbh for individual pp providers to mess
> with pp private structs and we hunt for bugs that are reproducible
> with 1 pp provider or another, or have to deal with the mental strain
> of provider specific handling in what is supposed to be generic
> page_pool paths.

I get what you're trying to say about not touching internals,
I agree with that, but I can't share the sentiment about debugging.
It's a pretty specific api, users running io_uring almost always
write directly to io_uring and we solve it. If happens it's not
the case, please do redirect the issue.
  
> IMO the provider must implement the 4 'ops' (alloc, free, init,

Doing 1 buffer per callback wouldn't be scalable at speeds
we're looking at.
Mina Almasry Oct. 10, 2024, 6:19 p.m. UTC | #4
On Wed, Oct 9, 2024 at 3:57 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 10/9/24 23:01, Mina Almasry wrote:
> > On Mon, Oct 7, 2024 at 3:16 PM David Wei <dw@davidwei.uk> wrote:
> >>
> >> From: Pavel Begunkov <asml.silence@gmail.com>
> >>
> >> Implement a page pool memory provider for io_uring to receieve in a
> >> zero copy fashion. For that, the provider allocates user pages wrapped
> >> around into struct net_iovs, that are stored in a previously registered
> >> struct net_iov_area.
> >>
> >> Unlike with traditional receives, for which pages from a page pool can
> >> be deallocated right after the user receives data, e.g. via recv(2),
> >> we extend the lifetime by recycling buffers only after the user space
> >> acknowledges that it's done processing the data via the refill queue.
> >> Before handing buffers to the user, we mark them by bumping the refcount
> >> by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
> >> returned back. When the corresponding io_uring instance and/or page pool
> >> are destroyed, we'll force back all buffers that are currently in the
> >> user space in ->io_pp_zc_scrub by clearing the bias.
> >>
> >
> > This is an interesting design choice. In my experience the page_pool
> > works the opposite way, i.e. all the netmems in it are kept alive
> > until the user is done with them. Deviating from that requires custom
> > behavior (->scrub), which may be fine, but why do this? Isn't it
> > better for uapi perspective to keep the memory alive until the user is
> > done with it?
>
> It's hardly interesting, it's _exactly_ the same thing devmem TCP
> does by attaching the lifetime of buffers to a socket's xarray,
> which requires custom behaviour. Maybe I wasn't clear on one thing
> though, it's accounting from the page pool's perspective. Those are
> user pages, likely still mapped into the user space, in which case
> they're not going to be destroyed.
>

I think we miscommunicated. Both devmem TCP and io_uring seem to bump
the refcount of memory while the user is using it, yes. But devmem TCP
doesn't scrub the memory when the page_pool dies. io_uring seems to
want to scrub the memory when the page_pool dies. I'm wondering about
this difference. Seems it's better from a uapi prespective to keep the
memory alive until the user returns it or crash. Otherwise you could
have 1 thread reading user memory and 1 thread destroying the
page_pool and the memory will be pulled from under the read, right?

> >> Refcounting and lifetime:
> >>
> >> Initially, all buffers are considered unallocated and stored in
> >> ->freelist, at which point they are not yet directly exposed to the core
> >> page pool code and not accounted to page pool's pages_state_hold_cnt.
> >> The ->alloc_netmems callback will allocate them by placing into the
> >> page pool's cache, setting the refcount to 1 as usual and adjusting
> >> pages_state_hold_cnt.
> >>
> >> Then, either the buffer is dropped and returns back to the page pool
> >> into the ->freelist via io_pp_zc_release_netmem, in which case the page
> >> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
> >> likely the buffer will go through the network/protocol stacks and end up
> >> in the corresponding socket's receive queue. From there the user can get
> >> it via an new io_uring request implemented in following patches. As
> >> mentioned above, before giving a buffer to the user we bump the refcount
> >> by IO_ZC_RX_UREF.
> >>
> >> Once the user is done with the buffer processing, it must return it back
> >> via the refill queue, from where our ->alloc_netmems implementation can
> >> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
> >> there are no more users left. As we place such buffers right back into
> >> the page pools fast cache and they didn't go through the normal pp
> >> release path, they are still considered "allocated" and no pp hold_cnt
> >> is required.
> >
> > Why is this needed? In general the provider is to allocate free memory
>
> I don't get it, what "this"? If it's refill queue, that's because
> I don't like actively returning buffers back via syscall / setsockopt
> and trying to transfer them into the napi context (i.e.
> napi_pp_put_page) hoping it works / cached well.
>
> If "this" is IO_ZC_RX_UREF, it's because we need to track when a
> buffer is given to the userspace, and I don't think some kind of
> map / xarray in the hot path is the best for performance solution.
>

Sorry I wasn't clear. By 'this' I'm referring to:

"from where our ->alloc_netmems implementation can grab it, check
references, put IO_ZC_RX_UREF, and recycle the buffer if there are no
more users left"

This is the part that I'm not able to stomach at the moment. Maybe if
I look deeper it would make more sense, but my first feelings is that
it's really not acceptable.

alloc_netmems (and more generically page_pool_alloc_netmem), just
allocates a netmem and gives it to the page_pool code to decide
whether to put it in the cache, in the ptr ring, or directly to the
user, etc.

The provider should not be overstepping or overriding the page_pool
logic to recycle pages or deliver them to the user. alloc_netmem
should just just alloc the netmem and hand it to the page_pool to
decide what to do with it.

> > and logic as to where the memory should go (to fast cache, to normal
> > pp release path, etc) should remain in provider agnostic code paths in
> > the page_pool. Not maintainable IMO in the long run to have individual
>
> Please do elaborate what exactly is not maintainable here
>

In the future we will have N memory providers. It's not maintainable
IMO for each of them to touch pp->alloc.cache and other internals in M
special ways and for us to have to handle N * M edge cases in the
page_pool code because each provider is overstepping on our internals.

The provider should just provide memory. The page_pool should decide
to fill its alloc.cache & ptr ring & give memory to the pp caller as
it sees fit.

> > pp providers customizing non-provider specific code or touching pp
> > private structs.
>
> ...
> >> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> >> index 8382129402ac..6cd3dee8b90a 100644
> >> --- a/io_uring/zcrx.c
> >> +++ b/io_uring/zcrx.c
> >> @@ -2,7 +2,11 @@
> ...
> >> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
> >> +{
> >> +       struct net_iov_area *owner = net_iov_owner(niov);
> >> +
> >> +       return container_of(owner, struct io_zcrx_area, nia);
> >
> > Similar to other comment in the other patch, why are we sure this
> > doesn't return garbage (i.e. it's accidentally called on a dmabuf
> > net_iov?)
>
> There couldn't be any net_iov at this point not belonging to
> the current io_uring instance / etc. Same with devmem TCP,
> devmem callbacks can't be called for some random net_iov, the
> only place you need to explicitly check is where it comes
> from generic path to a devmem aware path like that patched
> chunk in tcp.c
>
> >> +static inline void io_zc_add_pp_cache(struct page_pool *pp,
> >> +                                     struct net_iov *niov)
> >> +{
> >> +       netmem_ref netmem = net_iov_to_netmem(niov);
> >> +
> >> +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
> >> +       if (pp->dma_sync && dma_dev_need_sync(pp->p.dev)) {
> >
> > IIRC we force that dma_sync == true for memory providers, unless you
> > changed that and I missed it.
>
> I'll take a look, might remove it.
>
> >> +               dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
> >> +
> >> +               dma_sync_single_range_for_device(pp->p.dev, dma_addr,
> >> +                                                pp->p.offset, pp->p.max_len,
> >> +                                                pp->p.dma_dir);
> >> +       }
> >> +#endif
> >> +
> >> +       page_pool_fragment_netmem(netmem, 1);
> >> +       pp->alloc.cache[pp->alloc.count++] = netmem;
> >
> > IMO touching pp internals in a provider should not be acceptable.
>
> Ok, I can add a page pool helper for that.
>

To be clear, adding a helper will not resolve the issue I'm seeing.
IMO nothing in the alloc_netmem or any helpers its calling should
touch pp->alloc.cache. alloc_netmem should just allocate the memory
and let the non-provider pp code decide what to do with the memory.

> > pp->alloc.cache is a data structure private to the page_pool and
> > should not be touched at all by any specific memory provider. Not
> > maintainable in the long run tbh for individual pp providers to mess
> > with pp private structs and we hunt for bugs that are reproducible
> > with 1 pp provider or another, or have to deal with the mental strain
> > of provider specific handling in what is supposed to be generic
> > page_pool paths.
>
> I get what you're trying to say about not touching internals,
> I agree with that, but I can't share the sentiment about debugging.
> It's a pretty specific api, users running io_uring almost always
> write directly to io_uring and we solve it. If happens it's not
> the case, please do redirect the issue.
>
> > IMO the provider must implement the 4 'ops' (alloc, free, init,
>
> Doing 1 buffer per callback wouldn't be scalable at speeds
> we're looking at.
>

I doubt this is true or at least there needs to be more info here. The
page_pool_alloc_netmem() pretty much allocates 1 buffer per callback
for all its current users (regular memory & dmabuf), and that's good
enough to drive 200gbps NICs. What is special about io_uring use case
that this is not good enough?

The reason it is good enough in my experience is that
page_pool_alloc_netmem() is a slow path. netmems are allocated from
that function and heavily recycled by the page_pool afterwards.
Pavel Begunkov Oct. 10, 2024, 8:26 p.m. UTC | #5
On 10/10/24 19:19, Mina Almasry wrote:
> On Wed, Oct 9, 2024 at 3:57 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 10/9/24 23:01, Mina Almasry wrote:
>>> On Mon, Oct 7, 2024 at 3:16 PM David Wei <dw@davidwei.uk> wrote:
>>>>
>>>> From: Pavel Begunkov <asml.silence@gmail.com>
>>>>
>>>> Implement a page pool memory provider for io_uring to receieve in a
>>>> zero copy fashion. For that, the provider allocates user pages wrapped
>>>> around into struct net_iovs, that are stored in a previously registered
>>>> struct net_iov_area.
>>>>
>>>> Unlike with traditional receives, for which pages from a page pool can
>>>> be deallocated right after the user receives data, e.g. via recv(2),
>>>> we extend the lifetime by recycling buffers only after the user space
>>>> acknowledges that it's done processing the data via the refill queue.
>>>> Before handing buffers to the user, we mark them by bumping the refcount
>>>> by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
>>>> returned back. When the corresponding io_uring instance and/or page pool
>>>> are destroyed, we'll force back all buffers that are currently in the
>>>> user space in ->io_pp_zc_scrub by clearing the bias.
>>>>
>>>
>>> This is an interesting design choice. In my experience the page_pool
>>> works the opposite way, i.e. all the netmems in it are kept alive
>>> until the user is done with them. Deviating from that requires custom
>>> behavior (->scrub), which may be fine, but why do this? Isn't it
>>> better for uapi perspective to keep the memory alive until the user is
>>> done with it?
>>
>> It's hardly interesting, it's _exactly_ the same thing devmem TCP
>> does by attaching the lifetime of buffers to a socket's xarray,
>> which requires custom behaviour. Maybe I wasn't clear on one thing
>> though, it's accounting from the page pool's perspective. Those are
>> user pages, likely still mapped into the user space, in which case
>> they're not going to be destroyed.
>>
> 
> I think we miscommunicated. Both devmem TCP and io_uring seem to bump
> the refcount of memory while the user is using it, yes. But devmem TCP
> doesn't scrub the memory when the page_pool dies. io_uring seems to
> want to scrub the memory when the page_pool dies. I'm wondering about
> this difference. Seems it's better from a uapi prespective to keep the
> memory alive until the user returns it or crash. Otherwise you could

The (user) memory is not going to be pulled under the user,
it's user pages in the user's mm and pinned by it. The difference
is that the page pool will be able to die.

> have 1 thread reading user memory and 1 thread destroying the
> page_pool and the memory will be pulled from under the read, right?

If an io_uring is shared b/w users and one of them is destroying
the instance while it's still running, it's a severe userspace bug,
even then the memory will still be alive as per above.

>>>> Refcounting and lifetime:
>>>>
>>>> Initially, all buffers are considered unallocated and stored in
>>>> ->freelist, at which point they are not yet directly exposed to the core
>>>> page pool code and not accounted to page pool's pages_state_hold_cnt.
>>>> The ->alloc_netmems callback will allocate them by placing into the
>>>> page pool's cache, setting the refcount to 1 as usual and adjusting
>>>> pages_state_hold_cnt.
>>>>
>>>> Then, either the buffer is dropped and returns back to the page pool
>>>> into the ->freelist via io_pp_zc_release_netmem, in which case the page
>>>> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
>>>> likely the buffer will go through the network/protocol stacks and end up
>>>> in the corresponding socket's receive queue. From there the user can get
>>>> it via an new io_uring request implemented in following patches. As
>>>> mentioned above, before giving a buffer to the user we bump the refcount
>>>> by IO_ZC_RX_UREF.
>>>>
>>>> Once the user is done with the buffer processing, it must return it back
>>>> via the refill queue, from where our ->alloc_netmems implementation can
>>>> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
>>>> there are no more users left. As we place such buffers right back into
>>>> the page pools fast cache and they didn't go through the normal pp
>>>> release path, they are still considered "allocated" and no pp hold_cnt
>>>> is required.
>>>
>>> Why is this needed? In general the provider is to allocate free memory
>>
>> I don't get it, what "this"? If it's refill queue, that's because
>> I don't like actively returning buffers back via syscall / setsockopt
>> and trying to transfer them into the napi context (i.e.
>> napi_pp_put_page) hoping it works / cached well.
>>
>> If "this" is IO_ZC_RX_UREF, it's because we need to track when a
>> buffer is given to the userspace, and I don't think some kind of
>> map / xarray in the hot path is the best for performance solution.
>>
> 
> Sorry I wasn't clear. By 'this' I'm referring to:
> 
> "from where our ->alloc_netmems implementation can grab it, check
> references, put IO_ZC_RX_UREF, and recycle the buffer if there are no
> more users left"
> 
> This is the part that I'm not able to stomach at the moment. Maybe if
> I look deeper it would make more sense, but my first feelings is that
> it's really not acceptable.
> 
> alloc_netmems (and more generically page_pool_alloc_netmem), just
> allocates a netmem and gives it to the page_pool code to decide

That how it works because that's how devmem needs it and you
tailored it, not the other way around. It could've pretty well
been a callback that fills the cache as an intermediate, from
where page pool can grab netmems and return back to the user,
and it would've been a pretty clean interface as well.

> whether to put it in the cache, in the ptr ring, or directly to the
> user, etc.

And that's the semantics you've just imbued into it.

> The provider should not be overstepping or overriding the page_pool
> logic to recycle pages or deliver them to the user. alloc_netmem

I'm baffled, where does it overrides page pool's logic? It provides the
memory to the page pool, nothing more, nothing less, it doesn't decide if
it's handed to the user or go to ptr ring, the page pool is free to do
whatever is needed. Yes, it's handed by means of returning in the cache
because of performance considerations. The provider API can look
differently, e.g. passing a temp array like in the oversimplified snippet
below, but even then I don't think that's good enough.

page_pool_alloc_netmem() {
	netmem_t arr[64];
	nr = pool->mp_ops->alloc_netmems(arr, 64);

	// page pool does the page pool stuff.
	for (i = 0; i < nr; i++)
		pp->cache[idx] = arr[i];
	return pp->cache;
}

> should just just alloc the netmem and hand it to the page_pool to
> decide what to do with it.
> 
>>> and logic as to where the memory should go (to fast cache, to normal
>>> pp release path, etc) should remain in provider agnostic code paths in
>>> the page_pool. Not maintainable IMO in the long run to have individual
>>
>> Please do elaborate what exactly is not maintainable here
>>
> 
> In the future we will have N memory providers. It's not maintainable
> IMO for each of them to touch pp->alloc.cache and other internals in M
> special ways and for us to have to handle N * M edge cases in the
> page_pool code because each provider is overstepping on our internals.

It sounds like anything that strays from the devmem TCP way is wrong,
please let me know if so, because if that's the case there can only be
devmem TCP, maybe just renamed for niceness. The patch set uses the
abstractions, in a performant way, without adding overhead to everyone
else, and to the best of my taste in a clean way.

> The provider should just provide memory. The page_pool should decide
> to fill its alloc.cache & ptr ring & give memory to the pp caller as
> it sees fit.
> 
>>> pp providers customizing non-provider specific code or touching pp
>>> private structs.
>>
>> ...
>>>> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
>>>> index 8382129402ac..6cd3dee8b90a 100644
>>>> --- a/io_uring/zcrx.c
>>>> +++ b/io_uring/zcrx.c
>>>> @@ -2,7 +2,11 @@
>> ...
>>>> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
>>>> +{
>>>> +       struct net_iov_area *owner = net_iov_owner(niov);
>>>> +
>>>> +       return container_of(owner, struct io_zcrx_area, nia);
>>>
>>> Similar to other comment in the other patch, why are we sure this
>>> doesn't return garbage (i.e. it's accidentally called on a dmabuf
>>> net_iov?)
>>
>> There couldn't be any net_iov at this point not belonging to
>> the current io_uring instance / etc. Same with devmem TCP,
>> devmem callbacks can't be called for some random net_iov, the
>> only place you need to explicitly check is where it comes
>> from generic path to a devmem aware path like that patched
>> chunk in tcp.c
>>
>>>> +static inline void io_zc_add_pp_cache(struct page_pool *pp,
>>>> +                                     struct net_iov *niov)
>>>> +{
>>>> +       netmem_ref netmem = net_iov_to_netmem(niov);
>>>> +
>>>> +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
>>>> +       if (pp->dma_sync && dma_dev_need_sync(pp->p.dev)) {
>>>
>>> IIRC we force that dma_sync == true for memory providers, unless you
>>> changed that and I missed it.
>>
>> I'll take a look, might remove it.
>>
>>>> +               dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
>>>> +
>>>> +               dma_sync_single_range_for_device(pp->p.dev, dma_addr,
>>>> +                                                pp->p.offset, pp->p.max_len,
>>>> +                                                pp->p.dma_dir);
>>>> +       }
>>>> +#endif
>>>> +
>>>> +       page_pool_fragment_netmem(netmem, 1);
>>>> +       pp->alloc.cache[pp->alloc.count++] = netmem;
>>>
>>> IMO touching pp internals in a provider should not be acceptable.
>>
>> Ok, I can add a page pool helper for that.
>>
> 
> To be clear, adding a helper will not resolve the issue I'm seeing.
> IMO nothing in the alloc_netmem or any helpers its calling should
> touch pp->alloc.cache. alloc_netmem should just allocate the memory
> and let the non-provider pp code decide what to do with the memory.

Then we have opposite opinions, and I can't agree with what
you're proposing. If I'm adding an interface, I'm trying to make
it lasting and not be thrown away in a year. One indirect call
per page in the hot hot path is just insanity. Just remember what
you've been told about one single "if" in the hot path.

>>> pp->alloc.cache is a data structure private to the page_pool and
>>> should not be touched at all by any specific memory provider. Not
>>> maintainable in the long run tbh for individual pp providers to mess
>>> with pp private structs and we hunt for bugs that are reproducible
>>> with 1 pp provider or another, or have to deal with the mental strain
>>> of provider specific handling in what is supposed to be generic
>>> page_pool paths.
>>
>> I get what you're trying to say about not touching internals,
>> I agree with that, but I can't share the sentiment about debugging.
>> It's a pretty specific api, users running io_uring almost always
>> write directly to io_uring and we solve it. If happens it's not
>> the case, please do redirect the issue.
>>
>>> IMO the provider must implement the 4 'ops' (alloc, free, init,
>>
>> Doing 1 buffer per callback wouldn't be scalable at speeds
>> we're looking at.
>>
> 
> I doubt this is true or at least there needs to be more info here. The

If you don't believe me, then, please, go ahead and do your testing,
or look through patches addressing it across the stack like [1],
but you'll be able to find many more. I don't have any recent numbers
on indirect calls, but I did a fair share of testing before for
different kinds of overhead, it has always been expensive, can easily
be 1-2% per fast block request, which could be much worse if it's per
page.

[1] https://lore.kernel.org/netdev/cover.1543836966.git.pabeni@redhat.com/


> page_pool_alloc_netmem() pretty much allocates 1 buffer per callback
> for all its current users (regular memory & dmabuf), and that's good
> enough to drive 200gbps NICs. What is special about io_uring use case
> that this is not good enough?
> 
> The reason it is good enough in my experience is that
> page_pool_alloc_netmem() is a slow path. netmems are allocated from
> that function and heavily recycled by the page_pool afterwards.

That's because how you return buffers back to the page pool, with
io_uring it is a hot path, even though ammortised exactly because
it doesn't just return one buffer at a time.
Mina Almasry Oct. 10, 2024, 8:53 p.m. UTC | #6
On Thu, Oct 10, 2024 at 1:26 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 10/10/24 19:19, Mina Almasry wrote:
> > On Wed, Oct 9, 2024 at 3:57 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>
> >> On 10/9/24 23:01, Mina Almasry wrote:
> >>> On Mon, Oct 7, 2024 at 3:16 PM David Wei <dw@davidwei.uk> wrote:
> >>>>
> >>>> From: Pavel Begunkov <asml.silence@gmail.com>
> >>>>
> >>>> Implement a page pool memory provider for io_uring to receieve in a
> >>>> zero copy fashion. For that, the provider allocates user pages wrapped
> >>>> around into struct net_iovs, that are stored in a previously registered
> >>>> struct net_iov_area.
> >>>>
> >>>> Unlike with traditional receives, for which pages from a page pool can
> >>>> be deallocated right after the user receives data, e.g. via recv(2),
> >>>> we extend the lifetime by recycling buffers only after the user space
> >>>> acknowledges that it's done processing the data via the refill queue.
> >>>> Before handing buffers to the user, we mark them by bumping the refcount
> >>>> by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
> >>>> returned back. When the corresponding io_uring instance and/or page pool
> >>>> are destroyed, we'll force back all buffers that are currently in the
> >>>> user space in ->io_pp_zc_scrub by clearing the bias.
> >>>>
> >>>
> >>> This is an interesting design choice. In my experience the page_pool
> >>> works the opposite way, i.e. all the netmems in it are kept alive
> >>> until the user is done with them. Deviating from that requires custom
> >>> behavior (->scrub), which may be fine, but why do this? Isn't it
> >>> better for uapi perspective to keep the memory alive until the user is
> >>> done with it?
> >>
> >> It's hardly interesting, it's _exactly_ the same thing devmem TCP
> >> does by attaching the lifetime of buffers to a socket's xarray,
> >> which requires custom behaviour. Maybe I wasn't clear on one thing
> >> though, it's accounting from the page pool's perspective. Those are
> >> user pages, likely still mapped into the user space, in which case
> >> they're not going to be destroyed.
> >>
> >
> > I think we miscommunicated. Both devmem TCP and io_uring seem to bump
> > the refcount of memory while the user is using it, yes. But devmem TCP
> > doesn't scrub the memory when the page_pool dies. io_uring seems to
> > want to scrub the memory when the page_pool dies. I'm wondering about
> > this difference. Seems it's better from a uapi prespective to keep the
> > memory alive until the user returns it or crash. Otherwise you could
>
> The (user) memory is not going to be pulled under the user,
> it's user pages in the user's mm and pinned by it. The difference
> is that the page pool will be able to die.
>
> > have 1 thread reading user memory and 1 thread destroying the
> > page_pool and the memory will be pulled from under the read, right?
>
> If an io_uring is shared b/w users and one of them is destroying
> the instance while it's still running, it's a severe userspace bug,
> even then the memory will still be alive as per above.
>
> >>>> Refcounting and lifetime:
> >>>>
> >>>> Initially, all buffers are considered unallocated and stored in
> >>>> ->freelist, at which point they are not yet directly exposed to the core
> >>>> page pool code and not accounted to page pool's pages_state_hold_cnt.
> >>>> The ->alloc_netmems callback will allocate them by placing into the
> >>>> page pool's cache, setting the refcount to 1 as usual and adjusting
> >>>> pages_state_hold_cnt.
> >>>>
> >>>> Then, either the buffer is dropped and returns back to the page pool
> >>>> into the ->freelist via io_pp_zc_release_netmem, in which case the page
> >>>> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
> >>>> likely the buffer will go through the network/protocol stacks and end up
> >>>> in the corresponding socket's receive queue. From there the user can get
> >>>> it via an new io_uring request implemented in following patches. As
> >>>> mentioned above, before giving a buffer to the user we bump the refcount
> >>>> by IO_ZC_RX_UREF.
> >>>>
> >>>> Once the user is done with the buffer processing, it must return it back
> >>>> via the refill queue, from where our ->alloc_netmems implementation can
> >>>> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
> >>>> there are no more users left. As we place such buffers right back into
> >>>> the page pools fast cache and they didn't go through the normal pp
> >>>> release path, they are still considered "allocated" and no pp hold_cnt
> >>>> is required.
> >>>
> >>> Why is this needed? In general the provider is to allocate free memory
> >>
> >> I don't get it, what "this"? If it's refill queue, that's because
> >> I don't like actively returning buffers back via syscall / setsockopt
> >> and trying to transfer them into the napi context (i.e.
> >> napi_pp_put_page) hoping it works / cached well.
> >>
> >> If "this" is IO_ZC_RX_UREF, it's because we need to track when a
> >> buffer is given to the userspace, and I don't think some kind of
> >> map / xarray in the hot path is the best for performance solution.
> >>
> >
> > Sorry I wasn't clear. By 'this' I'm referring to:
> >
> > "from where our ->alloc_netmems implementation can grab it, check
> > references, put IO_ZC_RX_UREF, and recycle the buffer if there are no
> > more users left"
> >
> > This is the part that I'm not able to stomach at the moment. Maybe if
> > I look deeper it would make more sense, but my first feelings is that
> > it's really not acceptable.
> >
> > alloc_netmems (and more generically page_pool_alloc_netmem), just
> > allocates a netmem and gives it to the page_pool code to decide
>
> That how it works because that's how devmem needs it and you
> tailored it, not the other way around. It could've pretty well
> been a callback that fills the cache as an intermediate, from
> where page pool can grab netmems and return back to the user,
> and it would've been a pretty clean interface as well.
>

It could have been, but that would be a much worse design IMO. The
whole point of memory proivders is that they provide memory to the
page_pool and the page_pool does its things (among which is recycling)
with that memory. In this patch you seem to have implemented a
provider which, if the page is returned by io_uring, then it's not
returned to the page_pool, it's returned directly to the provider. In
other code paths the memory will be returned to the page_pool.

I.e allocation is always:
provider -> pp -> driver

freeing from io_uring is:
io_uring -> provider -> pp

freeing from tcp stack or driver I'm guessing will be:
tcp stack/driver -> pp -> provider

I'm recommending that the model for memory providers must be in line
with what we do for pages, devmem TCP, and Jakub's out of tree huge
page provider (i.e. everything else using the page_pool). The model is
the streamlined:

allocation:
provider -> pp -> driver

freeing (always):
tcp stack/io_uring/driver/whatever else -> pp -> driver

Is special casing the freeing path for io_uring OK? Is letting the
io_uring provider do its own recycling OK? IMO, no. All providers must
follow an easy to follow general framework otherwise the special
casing for N providers will get out of hand, but that's just my
opinion. A maintainer will make the judgement call here.

> > whether to put it in the cache, in the ptr ring, or directly to the
> > user, etc.
>
> And that's the semantics you've just imbued into it.
>
> > The provider should not be overstepping or overriding the page_pool
> > logic to recycle pages or deliver them to the user. alloc_netmem
>
> I'm baffled, where does it overrides page pool's logic? It provides the
> memory to the page pool, nothing more, nothing less, it doesn't decide if
> it's handed to the user or go to ptr ring, the page pool is free to do
> whatever is needed. Yes, it's handed by means of returning in the cache
> because of performance considerations. The provider API can look
> differently, e.g. passing a temp array like in the oversimplified snippet
> below, but even then I don't think that's good enough.
>
> page_pool_alloc_netmem() {
>         netmem_t arr[64];
>         nr = pool->mp_ops->alloc_netmems(arr, 64);
>
>         // page pool does the page pool stuff.
>         for (i = 0; i < nr; i++)
>                 pp->cache[idx] = arr[i];
>         return pp->cache;
> }
>
> > should just just alloc the netmem and hand it to the page_pool to
> > decide what to do with it.
> >
> >>> and logic as to where the memory should go (to fast cache, to normal
> >>> pp release path, etc) should remain in provider agnostic code paths in
> >>> the page_pool. Not maintainable IMO in the long run to have individual
> >>
> >> Please do elaborate what exactly is not maintainable here
> >>
> >
> > In the future we will have N memory providers. It's not maintainable
> > IMO for each of them to touch pp->alloc.cache and other internals in M
> > special ways and for us to have to handle N * M edge cases in the
> > page_pool code because each provider is overstepping on our internals.
>
> It sounds like anything that strays from the devmem TCP way is wrong,
> please let me know if so, because if that's the case there can only be
> devmem TCP, maybe just renamed for niceness. The patch set uses the
> abstractions, in a performant way, without adding overhead to everyone
> else, and to the best of my taste in a clean way.
>
> > The provider should just provide memory. The page_pool should decide
> > to fill its alloc.cache & ptr ring & give memory to the pp caller as
> > it sees fit.
> >
> >>> pp providers customizing non-provider specific code or touching pp
> >>> private structs.
> >>
> >> ...
> >>>> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> >>>> index 8382129402ac..6cd3dee8b90a 100644
> >>>> --- a/io_uring/zcrx.c
> >>>> +++ b/io_uring/zcrx.c
> >>>> @@ -2,7 +2,11 @@
> >> ...
> >>>> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
> >>>> +{
> >>>> +       struct net_iov_area *owner = net_iov_owner(niov);
> >>>> +
> >>>> +       return container_of(owner, struct io_zcrx_area, nia);
> >>>
> >>> Similar to other comment in the other patch, why are we sure this
> >>> doesn't return garbage (i.e. it's accidentally called on a dmabuf
> >>> net_iov?)
> >>
> >> There couldn't be any net_iov at this point not belonging to
> >> the current io_uring instance / etc. Same with devmem TCP,
> >> devmem callbacks can't be called for some random net_iov, the
> >> only place you need to explicitly check is where it comes
> >> from generic path to a devmem aware path like that patched
> >> chunk in tcp.c
> >>
> >>>> +static inline void io_zc_add_pp_cache(struct page_pool *pp,
> >>>> +                                     struct net_iov *niov)
> >>>> +{
> >>>> +       netmem_ref netmem = net_iov_to_netmem(niov);
> >>>> +
> >>>> +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
> >>>> +       if (pp->dma_sync && dma_dev_need_sync(pp->p.dev)) {
> >>>
> >>> IIRC we force that dma_sync == true for memory providers, unless you
> >>> changed that and I missed it.
> >>
> >> I'll take a look, might remove it.
> >>
> >>>> +               dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
> >>>> +
> >>>> +               dma_sync_single_range_for_device(pp->p.dev, dma_addr,
> >>>> +                                                pp->p.offset, pp->p.max_len,
> >>>> +                                                pp->p.dma_dir);
> >>>> +       }
> >>>> +#endif
> >>>> +
> >>>> +       page_pool_fragment_netmem(netmem, 1);
> >>>> +       pp->alloc.cache[pp->alloc.count++] = netmem;
> >>>
> >>> IMO touching pp internals in a provider should not be acceptable.
> >>
> >> Ok, I can add a page pool helper for that.
> >>
> >
> > To be clear, adding a helper will not resolve the issue I'm seeing.
> > IMO nothing in the alloc_netmem or any helpers its calling should
> > touch pp->alloc.cache. alloc_netmem should just allocate the memory
> > and let the non-provider pp code decide what to do with the memory.
>
> Then we have opposite opinions, and I can't agree with what
> you're proposing. If I'm adding an interface, I'm trying to make
> it lasting and not be thrown away in a year. One indirect call
> per page in the hot hot path is just insanity. Just remember what
> you've been told about one single "if" in the hot path.
>
> >>> pp->alloc.cache is a data structure private to the page_pool and
> >>> should not be touched at all by any specific memory provider. Not
> >>> maintainable in the long run tbh for individual pp providers to mess
> >>> with pp private structs and we hunt for bugs that are reproducible
> >>> with 1 pp provider or another, or have to deal with the mental strain
> >>> of provider specific handling in what is supposed to be generic
> >>> page_pool paths.
> >>
> >> I get what you're trying to say about not touching internals,
> >> I agree with that, but I can't share the sentiment about debugging.
> >> It's a pretty specific api, users running io_uring almost always
> >> write directly to io_uring and we solve it. If happens it's not
> >> the case, please do redirect the issue.
> >>
> >>> IMO the provider must implement the 4 'ops' (alloc, free, init,
> >>
> >> Doing 1 buffer per callback wouldn't be scalable at speeds
> >> we're looking at.
> >>
> >
> > I doubt this is true or at least there needs to be more info here. The
>
> If you don't believe me, then, please, go ahead and do your testing,
> or look through patches addressing it across the stack like [1],
> but you'll be able to find many more. I don't have any recent numbers
> on indirect calls, but I did a fair share of testing before for
> different kinds of overhead, it has always been expensive, can easily
> be 1-2% per fast block request, which could be much worse if it's per
> page.
>
> [1] https://lore.kernel.org/netdev/cover.1543836966.git.pabeni@redhat.com/
>
>
> > page_pool_alloc_netmem() pretty much allocates 1 buffer per callback
> > for all its current users (regular memory & dmabuf), and that's good
> > enough to drive 200gbps NICs. What is special about io_uring use case
> > that this is not good enough?
> >
> > The reason it is good enough in my experience is that
> > page_pool_alloc_netmem() is a slow path. netmems are allocated from
> > that function and heavily recycled by the page_pool afterwards.
>
> That's because how you return buffers back to the page pool, with
> io_uring it is a hot path, even though ammortised exactly because
> it doesn't just return one buffer at a time.
>

Right, I guess I understand now. You need to implement your own
recycling in the provider because your model has bypassed the
page_pool recycling - which to me is 90% of the utility of the
page_pool. To make matters worse, the bypass is only there if the
netmems are returned from io_uring, and not bypassed when the netmems
are returned from driver/tcp stack. I'm guessing if you reused the
page_pool recycling in the io_uring return path then it would remove
the need for your provider to implement its own recycling for the
io_uring return case.

Is letting providers bypass and override the page_pool's recycling in
some code paths OK? IMO, no. A maintainer will make the judgement call
and speak authoritatively here and I will follow, but I do think it's
a (much) worse design.
Mina Almasry Oct. 10, 2024, 8:58 p.m. UTC | #7
On Thu, Oct 10, 2024 at 1:53 PM Mina Almasry <almasrymina@google.com> wrote:
>
> > >>>>
> > >>>> Once the user is done with the buffer processing, it must return it back
> > >>>> via the refill queue, from where our ->alloc_netmems implementation can
> > >>>> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
> > >>>> there are no more users left. As we place such buffers right back into
> > >>>> the page pools fast cache and they didn't go through the normal pp
> > >>>> release path, they are still considered "allocated" and no pp hold_cnt
> > >>>> is required.
> > >>>
> > >>> Why is this needed? In general the provider is to allocate free memory
> > >>
> > >> I don't get it, what "this"? If it's refill queue, that's because
> > >> I don't like actively returning buffers back via syscall / setsockopt
> > >> and trying to transfer them into the napi context (i.e.
> > >> napi_pp_put_page) hoping it works / cached well.
> > >>
> > >> If "this" is IO_ZC_RX_UREF, it's because we need to track when a
> > >> buffer is given to the userspace, and I don't think some kind of
> > >> map / xarray in the hot path is the best for performance solution.
> > >>
> > >
> > > Sorry I wasn't clear. By 'this' I'm referring to:
> > >
> > > "from where our ->alloc_netmems implementation can grab it, check
> > > references, put IO_ZC_RX_UREF, and recycle the buffer if there are no
> > > more users left"
> > >
> > > This is the part that I'm not able to stomach at the moment. Maybe if
> > > I look deeper it would make more sense, but my first feelings is that
> > > it's really not acceptable.
> > >
> > > alloc_netmems (and more generically page_pool_alloc_netmem), just
> > > allocates a netmem and gives it to the page_pool code to decide
> >
> > That how it works because that's how devmem needs it and you
> > tailored it, not the other way around. It could've pretty well
> > been a callback that fills the cache as an intermediate, from
> > where page pool can grab netmems and return back to the user,
> > and it would've been a pretty clean interface as well.
> >
>
> It could have been, but that would be a much worse design IMO. The
> whole point of memory proivders is that they provide memory to the
> page_pool and the page_pool does its things (among which is recycling)
> with that memory. In this patch you seem to have implemented a
> provider which, if the page is returned by io_uring, then it's not
> returned to the page_pool, it's returned directly to the provider. In
> other code paths the memory will be returned to the page_pool.
>
> I.e allocation is always:
> provider -> pp -> driver
>
> freeing from io_uring is:
> io_uring -> provider -> pp
>
> freeing from tcp stack or driver I'm guessing will be:
> tcp stack/driver -> pp -> provider
>
> I'm recommending that the model for memory providers must be in line
> with what we do for pages, devmem TCP, and Jakub's out of tree huge
> page provider (i.e. everything else using the page_pool). The model is
> the streamlined:
>
> allocation:
> provider -> pp -> driver
>
> freeing (always):
> tcp stack/io_uring/driver/whatever else -> pp -> driver
>

Should be:

> freeing (always):
> tcp stack/io_uring/driver/whatever else -> pp -> provider

I.e. the pp frees to the provider, not the driver.
Pavel Begunkov Oct. 10, 2024, 9:22 p.m. UTC | #8
On 10/10/24 21:53, Mina Almasry wrote:
> On Thu, Oct 10, 2024 at 1:26 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
...
>>>
>>> Sorry I wasn't clear. By 'this' I'm referring to:
>>>
>>> "from where our ->alloc_netmems implementation can grab it, check
>>> references, put IO_ZC_RX_UREF, and recycle the buffer if there are no
>>> more users left"
>>>
>>> This is the part that I'm not able to stomach at the moment. Maybe if
>>> I look deeper it would make more sense, but my first feelings is that
>>> it's really not acceptable.
>>>
>>> alloc_netmems (and more generically page_pool_alloc_netmem), just
>>> allocates a netmem and gives it to the page_pool code to decide
>>
>> That how it works because that's how devmem needs it and you
>> tailored it, not the other way around. It could've pretty well
>> been a callback that fills the cache as an intermediate, from
>> where page pool can grab netmems and return back to the user,
>> and it would've been a pretty clean interface as well.
>>
> 
> It could have been, but that would be a much worse design IMO. The
> whole point of memory proivders is that they provide memory to the
> page_pool and the page_pool does its things (among which is recycling)
> with that memory. In this patch you seem to have implemented a
> provider which, if the page is returned by io_uring, then it's not
> returned to the page_pool, it's returned directly to the provider. In
> other code paths the memory will be returned to the page_pool.
> 
> I.e allocation is always:
> provider -> pp -> driver
> 
> freeing from io_uring is:
> io_uring -> provider -> pp
> 
> freeing from tcp stack or driver I'm guessing will be:
> tcp stack/driver -> pp -> provider
> 
> I'm recommending that the model for memory providers must be in line
> with what we do for pages, devmem TCP, and Jakub's out of tree huge
> page provider (i.e. everything else using the page_pool). The model is
> the streamlined:

Let's not go into the normal pages, because 1) it can't work
any other way in general case, it has to cross the context from
whenever page is to the napi / page pool, and 2) because devmem
TCP and io_uring already deviate from the standard page pool,
by extending lifetime of buffers to user space and more.

And then that's exactly what I'm saying, you recommend it to be
aligned with devmem TCP. And let's not forget that you had to add
batching to that exact syscall return path because of
performance...

...
>>> I doubt this is true or at least there needs to be more info here. The
>>
>> If you don't believe me, then, please, go ahead and do your testing,
>> or look through patches addressing it across the stack like [1],
>> but you'll be able to find many more. I don't have any recent numbers
>> on indirect calls, but I did a fair share of testing before for
>> different kinds of overhead, it has always been expensive, can easily
>> be 1-2% per fast block request, which could be much worse if it's per
>> page.
>>
>> [1] https://lore.kernel.org/netdev/cover.1543836966.git.pabeni@redhat.com/
>>
>>
>>> page_pool_alloc_netmem() pretty much allocates 1 buffer per callback
>>> for all its current users (regular memory & dmabuf), and that's good
>>> enough to drive 200gbps NICs. What is special about io_uring use case
>>> that this is not good enough?
>>>
>>> The reason it is good enough in my experience is that
>>> page_pool_alloc_netmem() is a slow path. netmems are allocated from
>>> that function and heavily recycled by the page_pool afterwards.
>>
>> That's because how you return buffers back to the page pool, with
>> io_uring it is a hot path, even though ammortised exactly because
>> it doesn't just return one buffer at a time.
>>
> 
> Right, I guess I understand now. You need to implement your own
> recycling in the provider because your model has bypassed the
> page_pool recycling - which to me is 90% of the utility of the

So the utility of the page pool is a fast return path for the
standard page mode, i.e. napi_pp_put_page, which it is and is
important, I agree. But then even though we have a better IMO
approach for this "extended to userspace buffer life cycle"
scenario, it has to use that very same return path because...?

> page_pool. To make matters worse, the bypass is only there if the
> netmems are returned from io_uring, and not bypassed when the netmems
> are returned from driver/tcp stack. I'm guessing if you reused the
> page_pool recycling in the io_uring return path then it would remove
> the need for your provider to implement its own recycling for the
> io_uring return case.
> 
> Is letting providers bypass and override the page_pool's recycling in
> some code paths OK? IMO, no. A maintainer will make the judgement call

Mina, frankly, that's nonsense. If we extend the same logic,
devmem overrides page allocation rules with callbacks, devmem
overrides and violates page pool buffer lifetimes by extending
it to user space, devmem violates and overrides the page pool
object lifetime by binding buffers to sockets. And all of it
I'd rather name extends and enhances to fit in the devmem use
case.

> and speak authoritatively here and I will follow, but I do think it's
> a (much) worse design.

Sure, I have a completely opposite opinion, that's a much
better approach than returning through a syscall, but I will
agree with you that ultimately the maintainers will say if
that's acceptable for the networking or not.
Mina Almasry Oct. 11, 2024, 12:32 a.m. UTC | #9
On Thu, Oct 10, 2024 at 2:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> > page_pool. To make matters worse, the bypass is only there if the
> > netmems are returned from io_uring, and not bypassed when the netmems
> > are returned from driver/tcp stack. I'm guessing if you reused the
> > page_pool recycling in the io_uring return path then it would remove
> > the need for your provider to implement its own recycling for the
> > io_uring return case.
> >
> > Is letting providers bypass and override the page_pool's recycling in
> > some code paths OK? IMO, no. A maintainer will make the judgement call
>
> Mina, frankly, that's nonsense. If we extend the same logic,
> devmem overrides page allocation rules with callbacks, devmem
> overrides and violates page pool buffer lifetimes by extending
> it to user space, devmem violates and overrides the page pool
> object lifetime by binding buffers to sockets. And all of it
> I'd rather name extends and enhances to fit in the devmem use
> case.
>
> > and speak authoritatively here and I will follow, but I do think it's
> > a (much) worse design.
>
> Sure, I have a completely opposite opinion, that's a much
> better approach than returning through a syscall, but I will
> agree with you that ultimately the maintainers will say if
> that's acceptable for the networking or not.
>

Right, I'm not suggesting that you return the pages through a syscall.
That will add syscall overhead when it's better not to have that
especially in io_uring context. Devmem TCP needed a syscall because I
couldn't figure out a non-syscall way with sockets for the userspace
to tell the kernel that it's done with some netmems. You do not need
to follow that at all. Sorry if I made it seem like so.

However, I'm suggesting that when io_uring figures out that the
userspace is done with a netmem, that you feed that netmem back to the
pp, and utilize the pp's recycling, rather than adding your own
recycling in the provider.

From your commit message:

"we extend the lifetime by recycling buffers only after the user space
acknowledges that it's done processing the data via the refill queue"

It seems to me that you get some signal from the userspace that data
is ready to be reuse via that refill queue (whatever it is, very
sorry, I'm not that familiar with io_uring). My suggestion here is
when the userspace tells you that a netmem is ready for reuse (however
it does that), that you feed that page back to the pp via something
like napi_pp_put_page() or page_pool_put_page_bulk() if that makes
sense to you. FWIW I'm trying to look through your code to understand
what that refill queue is and where - if anywhere - it may be possible
to feed pages back to the pp, rather than directly to the provider.
Pavel Begunkov Oct. 11, 2024, 1:49 a.m. UTC | #10
On 10/11/24 01:32, Mina Almasry wrote:
> On Thu, Oct 10, 2024 at 2:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>>> page_pool. To make matters worse, the bypass is only there if the
>>> netmems are returned from io_uring, and not bypassed when the netmems
>>> are returned from driver/tcp stack. I'm guessing if you reused the
>>> page_pool recycling in the io_uring return path then it would remove
>>> the need for your provider to implement its own recycling for the
>>> io_uring return case.
>>>
>>> Is letting providers bypass and override the page_pool's recycling in
>>> some code paths OK? IMO, no. A maintainer will make the judgement call
>>
>> Mina, frankly, that's nonsense. If we extend the same logic,
>> devmem overrides page allocation rules with callbacks, devmem
>> overrides and violates page pool buffer lifetimes by extending
>> it to user space, devmem violates and overrides the page pool
>> object lifetime by binding buffers to sockets. And all of it
>> I'd rather name extends and enhances to fit in the devmem use
>> case.
>>
>>> and speak authoritatively here and I will follow, but I do think it's
>>> a (much) worse design.
>>
>> Sure, I have a completely opposite opinion, that's a much
>> better approach than returning through a syscall, but I will
>> agree with you that ultimately the maintainers will say if
>> that's acceptable for the networking or not.
>>
> 
> Right, I'm not suggesting that you return the pages through a syscall.
> That will add syscall overhead when it's better not to have that
> especially in io_uring context. Devmem TCP needed a syscall because I
> couldn't figure out a non-syscall way with sockets for the userspace
> to tell the kernel that it's done with some netmems. You do not need
> to follow that at all. Sorry if I made it seem like so.
> 
> However, I'm suggesting that when io_uring figures out that the
> userspace is done with a netmem, that you feed that netmem back to the
> pp, and utilize the pp's recycling, rather than adding your own
> recycling in the provider.

I should spell it out somewhere in commits, the difference is that we
let the page pool to pull buffers instead of having a syscall to push
like devmem TCP does. With pushing, you'll be doing it from some task
context, and it'll need to find a way back into the page pool, via ptr
ring or with the opportunistic optimisations napi_pp_put_page() provides.
And if you do it this way, the function is very useful.

With pulling though, returning already happens from within the page
pool's allocation path, just in the right context that doesn't need
any additional locking / sync to access page pool's napi/bh protected
caches/etc.. That's why it has a potential to be faster, and why
optimisation wise napi_pp_put_page() doesn't make sense for this
case, i.e. no need to jump through hoops of finding how to transfer
a buffer to the page pool's context because we're already in there.

>  From your commit message:
> 
> "we extend the lifetime by recycling buffers only after the user space
> acknowledges that it's done processing the data via the refill queue"
> 
> It seems to me that you get some signal from the userspace that data

You don't even need to signal it, the page pool will take buffers
when it needs to allocate memory.

> is ready to be reuse via that refill queue (whatever it is, very
> sorry, I'm not that familiar with io_uring). My suggestion here is
> when the userspace tells you that a netmem is ready for reuse (however
> it does that), that you feed that page back to the pp via something
> like napi_pp_put_page() or page_pool_put_page_bulk() if that makes
> sense to you. FWIW I'm trying to look through your code to understand
> what that refill queue is and where - if anywhere - it may be possible
> to feed pages back to the pp, rather than directly to the provider.
diff mbox series

Patch

diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
index b58f39fed4d5..610b35b451fd 100644
--- a/include/linux/io_uring/net.h
+++ b/include/linux/io_uring/net.h
@@ -5,6 +5,11 @@ 
 struct io_uring_cmd;
 
 #if defined(CONFIG_IO_URING)
+
+#if defined(CONFIG_PAGE_POOL)
+extern const struct memory_provider_ops io_uring_pp_zc_ops;
+#endif
+
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
 #else
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 8382129402ac..6cd3dee8b90a 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -2,7 +2,11 @@ 
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/nospec.h>
+#include <linux/netdevice.h>
 #include <linux/io_uring.h>
+#include <net/page_pool/helpers.h>
+#include <trace/events/page_pool.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -16,6 +20,13 @@ 
 
 #if defined(CONFIG_PAGE_POOL) && defined(CONFIG_INET)
 
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+	struct net_iov_area *owner = net_iov_owner(niov);
+
+	return container_of(owner, struct io_zcrx_area, nia);
+}
+
 static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg)
 {
@@ -101,6 +112,9 @@  static int io_zcrx_create_area(struct io_ring_ctx *ctx,
 		goto err;
 
 	for (i = 0; i < nr_pages; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+
+		niov->owner = &area->nia;
 		area->freelist[i] = i;
 	}
 
@@ -233,4 +247,219 @@  void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 	lockdep_assert_held(&ctx->uring_lock);
 }
 
+static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
+{
+	return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
+}
+
+static bool io_zcrx_put_niov_uref(struct net_iov *niov)
+{
+	if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
+		return false;
+
+	return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
+}
+
+static inline void io_zc_add_pp_cache(struct page_pool *pp,
+				      struct net_iov *niov)
+{
+	netmem_ref netmem = net_iov_to_netmem(niov);
+
+#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
+	if (pp->dma_sync && dma_dev_need_sync(pp->p.dev)) {
+		dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
+
+		dma_sync_single_range_for_device(pp->p.dev, dma_addr,
+						 pp->p.offset, pp->p.max_len,
+						 pp->p.dma_dir);
+	}
+#endif
+
+	page_pool_fragment_netmem(netmem, 1);
+	pp->alloc.cache[pp->alloc.count++] = netmem;
+}
+
+static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	return min(entries, ifq->rq_entries);
+}
+
+static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
+						 unsigned mask)
+{
+	unsigned int idx = ifq->cached_rq_head++ & mask;
+
+	return &ifq->rqes[idx];
+}
+
+static void io_zcrx_ring_refill(struct page_pool *pp,
+				struct io_zcrx_ifq *ifq)
+{
+	unsigned int entries = io_zcrx_rqring_entries(ifq);
+	unsigned int mask = ifq->rq_entries - 1;
+
+	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
+	if (unlikely(!entries))
+		return;
+
+	do {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
+		struct io_zcrx_area *area;
+		struct net_iov *niov;
+		unsigned niov_idx, area_idx;
+
+		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
+		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) / PAGE_SIZE;
+
+		if (unlikely(rqe->__pad || area_idx))
+			continue;
+		area = ifq->area;
+
+		if (unlikely(niov_idx >= area->nia.num_niovs))
+			continue;
+		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
+
+		niov = &area->nia.niovs[niov_idx];
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		io_zc_add_pp_cache(pp, niov);
+	} while (--entries);
+
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+}
+
+static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+
+	spin_lock_bh(&area->freelist_lock);
+	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+		struct net_iov *niov;
+		u32 pgid;
+
+		pgid = area->freelist[--area->free_count];
+		niov = &area->nia.niovs[pgid];
+
+		io_zc_add_pp_cache(pp, niov);
+
+		pp->pages_state_hold_cnt++;
+		trace_page_pool_state_hold(pp, net_iov_to_netmem(niov),
+					   pp->pages_state_hold_cnt);
+	}
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_recycle_niov(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that */
+	if (unlikely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_ring_refill(pp, ifq);
+	if (likely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_refill_slow(pp, ifq);
+	if (!pp->alloc.count)
+		return 0;
+out_return:
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
+static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
+{
+	struct net_iov *niov;
+
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	niov = netmem_to_net_iov(netmem);
+
+	if (io_zcrx_niov_put(niov, 1))
+		io_zcrx_recycle_niov(niov);
+	return false;
+}
+
+static void io_pp_zc_scrub(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int count;
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		io_zcrx_recycle_niov(niov);
+
+		count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt);
+		trace_page_pool_state_release(pp, net_iov_to_netmem(niov), count);
+	}
+}
+
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+	int ret;
+
+	if (!ifq)
+		return -EINVAL;
+	if (pp->p.order != 0)
+		return -EINVAL;
+	if (!pp->p.napi)
+		return -EINVAL;
+	if (!pp->p.napi->napi_id)
+		return -EINVAL;
+
+	ret = page_pool_init_paged_area(pp, &area->nia, area->pages);
+	if (ret)
+		return ret;
+
+	ifq->napi_id = pp->p.napi->napi_id;
+	percpu_ref_get(&ifq->ctx->refs);
+	ifq->pp = pp;
+	return 0;
+}
+
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+
+	page_pool_release_area(pp, &ifq->area->nia);
+
+	ifq->pp = NULL;
+	ifq->napi_id = 0;
+
+	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_netmems		= io_pp_zc_alloc_netmems,
+	.release_netmem		= io_pp_zc_release_netmem,
+	.init			= io_pp_zc_init,
+	.destroy		= io_pp_zc_destroy,
+	.scrub			= io_pp_zc_scrub,
+};
+
 #endif
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 2fcbeb3d5501..67512fc69cc4 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -5,6 +5,9 @@ 
 #include <linux/io_uring_types.h>
 #include <net/page_pool/types.h>
 
+#define IO_ZC_RX_UREF			0x10000
+#define IO_ZC_RX_KREF_MASK		(IO_ZC_RX_UREF - 1)
+
 struct io_zcrx_area {
 	struct net_iov_area	nia;
 	struct io_zcrx_ifq	*ifq;
@@ -22,15 +25,18 @@  struct io_zcrx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct net_device		*dev;
 	struct io_zcrx_area		*area;
+	struct page_pool		*pp;
 
 	struct io_uring			*rq_ring;
 	struct io_uring_zcrx_rqe 	*rqes;
 	u32				rq_entries;
+	u32				cached_rq_head;
 
 	unsigned short			n_rqe_pages;
 	struct page			**rqe_pages;
 
 	u32				if_rxq;
+	unsigned			napi_id;
 };
 
 #if defined(CONFIG_PAGE_POOL) && defined(CONFIG_INET)