
[RFC,3/8] memory-provider: dmabuf devmem memory provider

Message ID 20231113130041.58124-4-linyunsheng@huawei.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series A possible proposal for integrating dmabuf to page pool

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1273 this patch: 1273
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/build_clang success Errors and warnings before: 1162 this patch: 1162
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1314 this patch: 1314
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 142 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline success Was 0 now: 0

Commit Message

Yunsheng Lin Nov. 13, 2023, 1 p.m. UTC
From: Mina Almasry <almasrymina@google.com>

Implement a memory provider that allocates dmabuf devmem page_pool_iovs.

Support of PP_FLAG_DMA_MAP and PP_FLAG_DMA_SYNC_DEV is omitted for
simplicity.

The provider receives a reference to the struct netdev_dmabuf_binding
via the pool->mp_priv pointer. The driver needs to set this pointer for
the provider in the page_pool_params.

The provider obtains a reference on the netdev_dmabuf_binding, which
guarantees that the binding and the underlying mapping remain alive
until the provider is destroyed.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/net/page_pool/types.h | 28 +++++++++++
 net/core/page_pool.c          | 93 +++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)
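
For reference, a minimal sketch of the driver-side setup this describes,
assuming the 'memory_provider' and 'mp_priv' fields this series adds to
struct page_pool_params; driver details such as 'ring_size', 'dev' and
'binding' are illustrative:

    /* Hypothetical driver init path. Note that PP_FLAG_DMA_MAP and
     * PP_FLAG_DMA_SYNC_DEV must not be set: mp_dmabuf_devmem_init()
     * rejects them with -EOPNOTSUPP.
     */
    struct page_pool_params pp_params = {
            .flags           = 0,
            .order           = 0,
            .pool_size       = ring_size,
            .nid             = NUMA_NO_NODE,
            .dev             = dev,
            .dma_dir         = DMA_FROM_DEVICE,
            .memory_provider = PP_MP_DMABUF_DEVMEM,
            .mp_priv         = binding, /* struct netdev_dmabuf_binding * */
    };
    struct page_pool *pool = page_pool_create(&pp_params);

    if (IS_ERR(pool))
            return PTR_ERR(pool);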

Comments

Mina Almasry Nov. 13, 2023, 1:42 p.m. UTC | #1
On Mon, Nov 13, 2023 at 5:00 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> From: Mina Almasry <almasrymina@google.com>
>
> Implement a memory provider that allocates dmabuf devmem page_pool_iovs.
>
> Support of PP_FLAG_DMA_MAP and PP_FLAG_DMA_SYNC_DEV is omitted for
> simplicity.
>
> The provider receives a reference to the struct netdev_dmabuf_binding
> via the pool->mp_priv pointer. The driver needs to set this pointer for
> the provider in the page_pool_params.
>
> The provider obtains a reference on the netdev_dmabuf_binding, which
> guarantees that the binding and the underlying mapping remain alive
> until the provider is destroyed.
>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/net/page_pool/types.h | 28 +++++++++++
>  net/core/page_pool.c          | 93 +++++++++++++++++++++++++++++++++++
>  2 files changed, 121 insertions(+)
>
> diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
> index 5e4fcd45ba50..52e4cf98ebc6 100644
> --- a/include/net/page_pool/types.h
> +++ b/include/net/page_pool/types.h
> @@ -124,6 +124,7 @@ struct mem_provider;
>
>  enum pp_memory_provider_type {
>         __PP_MP_NONE, /* Use system allocator directly */
> +       PP_MP_DMABUF_DEVMEM, /* dmabuf devmem provider */
>  };
>
>  struct pp_memory_provider_ops {
> @@ -134,6 +135,33 @@ struct pp_memory_provider_ops {
>         void (*free_pages)(struct page_pool *pool, struct page *page);
>  };
>
> +extern const struct pp_memory_provider_ops dmabuf_devmem_ops;
> +
> +struct page_pool_iov {
> +       unsigned long res0;
> +       unsigned long pp_magic;
> +       struct page_pool *pp;
> +       struct page *page;  /* dmabuf memory provider specific field */
> +       unsigned long dma_addr;
> +       atomic_long_t pp_frag_count;
> +       unsigned int res1;
> +       refcount_t _refcount;
> +};
> +
> +#define PAGE_POOL_MATCH(pg, iov)                               \
> +       static_assert(offsetof(struct page, pg) ==              \
> +                     offsetof(struct page_pool_iov, iov))
> +PAGE_POOL_MATCH(flags, res0);
> +PAGE_POOL_MATCH(pp_magic, pp_magic);
> +PAGE_POOL_MATCH(pp, pp);
> +PAGE_POOL_MATCH(_pp_mapping_pad, page);
> +PAGE_POOL_MATCH(dma_addr, dma_addr);
> +PAGE_POOL_MATCH(pp_frag_count, pp_frag_count);
> +PAGE_POOL_MATCH(_mapcount, res1);
> +PAGE_POOL_MATCH(_refcount, _refcount);
> +#undef PAGE_POOL_MATCH
> +static_assert(sizeof(struct page_pool_iov) <= sizeof(struct page));
> +

You're doing exactly what I think you're doing, and what was nacked in RFC v1.

You've converted 'struct page_pool_iov' to essentially become a
duplicate of 'struct page'. Then, you're casting page_pool_iov* into
struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling
mm APIs like page_ref_*() on the page_pool_iov* because you've fooled
the mm stack into thinking dma-buf memory is a struct page.

RFC v1 was almost exactly the same, except instead of creating a
duplicate definition of struct page, it just allocated 'struct page'
instead of allocating another struct that is identical to struct page
and casting it into struct page.

I don't think what you're doing here reverses the nacks I got in RFC
v1. You also did not CC any dma-buf or mm people on this proposal that
would bring up these concerns again.

>  struct page_pool {
>         struct page_pool_params p;
>
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 6c502bea842b..1bd7a2306f09 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -231,6 +231,9 @@ static int page_pool_init(struct page_pool *pool,
>         switch (pool->p.memory_provider) {
>         case __PP_MP_NONE:
>                 break;
> +       case PP_MP_DMABUF_DEVMEM:
> +               pool->mp_ops = &dmabuf_devmem_ops;
> +               break;
>         default:
>                 err = -EINVAL;
>                 goto free_ptr_ring;
> @@ -1010,3 +1013,93 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
>         }
>  }
>  EXPORT_SYMBOL(page_pool_update_nid);
> +
> +/*** "Dmabuf devmem memory provider" ***/
> +
> +static int mp_dmabuf_devmem_init(struct page_pool *pool)
> +{
> +       if (pool->p.flags & PP_FLAG_DMA_MAP ||
> +           pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
> +               return -EOPNOTSUPP;
> +       return 0;
> +}
> +
> +static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
> +                                                gfp_t gfp)
> +{
> +       struct page_pool_iov *ppiov;
> +       struct page *page;
> +       dma_addr_t dma;
> +
> +       ppiov = kvmalloc(sizeof(*ppiov), gfp | __GFP_ZERO);
> +       if (!ppiov)
> +               return NULL;
> +
> +       page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
> +       if (!page) {
> +               kvfree(ppiov);
> +               return NULL;
> +       }
> +
> +       dma = dma_map_page_attrs(pool->p.dev, page, 0,
> +                                (PAGE_SIZE << pool->p.order),
> +                                pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
> +                                                 DMA_ATTR_WEAK_ORDERING);
> +       if (dma_mapping_error(pool->p.dev, dma)) {
> +               put_page(page);
> +               kvfree(ppiov);
> +               return NULL;
> +       }
> +
> +       ppiov->pp = pool;
> +       ppiov->pp_magic = PP_SIGNATURE;
> +       ppiov->page = page;
> +       refcount_set(&ppiov->_refcount, 1);
> +       page_pool_fragment_page((struct page *)ppiov, 1);
> +       page_pool_set_dma_addr((struct page *)ppiov, dma);
> +       pool->pages_state_hold_cnt++;
> +       trace_page_pool_state_hold(pool, (struct page *)ppiov,
> +                                  pool->pages_state_hold_cnt);
> +       return (struct page *)ppiov;
> +}
> +
> +static void mp_dmabuf_devmem_destroy(struct page_pool *pool)
> +{
> +}
> +
> +static void mp_dmabuf_devmem_release_page(struct page_pool *pool,
> +                                         struct page *page)
> +{
> +       struct page_pool_iov *ppiov = (struct page_pool_iov *)page;
> +       dma_addr_t dma;
> +
> +       dma = page_pool_get_dma_addr(page);
> +
> +       /* When page is unmapped, it cannot be returned to our pool */
> +       dma_unmap_page_attrs(pool->p.dev, dma,
> +                            PAGE_SIZE << pool->p.order, pool->p.dma_dir,
> +                            DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
> +       page_pool_set_dma_addr(page, 0);
> +
> +       put_page(ppiov->page);
> +}
> +
> +static void mp_dmabuf_devmem_free_pages(struct page_pool *pool,
> +                                       struct page *page)
> +{
> +       int count;
> +
> +       count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
> +       trace_page_pool_state_release(pool, page, count);
> +
> +       kvfree(page);
> +}
> +
> +const struct pp_memory_provider_ops dmabuf_devmem_ops = {
> +       .init                   = mp_dmabuf_devmem_init,
> +       .destroy                = mp_dmabuf_devmem_destroy,
> +       .alloc_pages            = mp_dmabuf_devmem_alloc_pages,
> +       .release_page           = mp_dmabuf_devmem_release_page,
> +       .free_pages             = mp_dmabuf_devmem_free_pages,
> +};
> +EXPORT_SYMBOL(dmabuf_devmem_ops);
> --
> 2.33.0
>
Jakub Kicinski Nov. 13, 2023, 11:05 p.m. UTC | #2
On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote:
> You're doing exactly what I think you're doing, and what was nacked in RFC v1.
> 
> You've converted 'struct page_pool_iov' to essentially become a
> duplicate of 'struct page'. Then, you're casting page_pool_iov* into
> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling
> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled
> the mm stack into thinking dma-buf memory is a struct page.
> 
> RFC v1 was almost exactly the same, except instead of creating a
> duplicate definition of struct page, it just allocated 'struct page'
> instead of allocating another struct that is identical to struct page
> and casting it into struct page.
> 
> I don't think what you're doing here reverses the nacks I got in RFC
> v1. You also did not CC any dma-buf or mm people on this proposal that
> would bring up these concerns again.

Right, but the mirror struct has some appeal to a non-mm person like
myself. The problem IIUC is that this patch is the wrong way around, we
should be converting everyone who can deal with non-host mem to struct
page_pool_iov. Using page_address() on ppiov which hns3 seems to do in
this series does not compute for me.

Then we can turn the existing non-iov helpers to be a thin wrapper with
just a cast from struct page to struct page_pool_iov, and a call of the
iov helper. Again - never cast the other way around.
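
A sketch of the wrapper shape being suggested; page_pool_get_dma_addr() is
an existing helper, while page_to_page_pool_iov() and
page_pool_iov_get_dma_addr() are hypothetical names for the _iov side:

    /* The cast only ever goes page -> iov; nothing ever casts an iov
     * back into a page.
     */
    static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
    {
            return (struct page_pool_iov *)page;
    }

    static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
    {
            return page_pool_iov_get_dma_addr(page_to_page_pool_iov(page));
    }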

Also I think this conversion can be done completely separately from the
mem provider changes. Just add struct page_pool_iov and start using it.

Does that make more sense?
Yunsheng Lin Nov. 14, 2023, 8:23 a.m. UTC | #3
+cc Christian, Jason and Willy

On 2023/11/14 7:05, Jakub Kicinski wrote:
> On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote:
>> You're doing exactly what I think you're doing, and what was nacked in RFC v1.
>>
>> You've converted 'struct page_pool_iov' to essentially become a
>> duplicate of 'struct page'. Then, you're casting page_pool_iov* into
>> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling
>> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled
>> the mm stack into thinking dma-buf memory is a struct page.

Yes, something like the above, but I am not sure about the 'fooled the mm
stack into thinking dma-buf memory is a struct page' part, because:
1. We never let the 'struct page' for devmem leak out of the net stack,
   thanks to the 'not kmap()able and not readable' checking in your patchset.
2. We initialize page->_refcount for devmem to one and it remains one;
   we will never call page_ref_inc()/page_ref_dec()/get_page()/put_page().
   Instead, we use the page pool's pp_frag_count to do reference counting
   for the devmem page in patch 6.

>>
>> RFC v1 was almost exactly the same, except instead of creating a
>> duplicate definition of struct page, it just allocated 'struct page'
>> instead of allocating another struct that is identical to struct page
>> and casting it into struct page.

Perhaps it is more accurate to say this is something between RFC v1 and
RFC v3: it decouples the 'struct page' for devmem from the mm subsystem,
but still has mostly unified handling for both normal memory and devmem
in the page pool and net stack.

The main difference between this patchset and RFC v1:
1. The mm subsystem is not supposed to see the 'struct page' for devmem
   in this patchset, I guess we could say it is decoupled from the mm
   subsystem even though we still call PageTail()/page_ref_count()/
   page_is_pfmemalloc() on 'struct page' for devmem.

The main difference between this patchset and RFC v3:
1. It reuses the 'struct page' to have more unified handling between
   normal page and devmem page for net stack.
2. It relies on the page->pp_frag_count to do reference counting.

>>
>> I don't think what you're doing here reverses the nacks I got in RFC
>> v1. You also did not CC any dma-buf or mm people on this proposal that
>> would bring up these concerns again.
> 
> Right, but the mirror struct has some appeal to a non-mm person like
> myself. The problem IIUC is that this patch is the wrong way around, we
> should be converting everyone who can deal with non-host mem to struct
> page_pool_iov. Using page_address() on ppiov which hns3 seems to do in
> this series does not compute for me.

The hacky use of ppiov in hns3 is only there for some prototype
testing, so ignore it.

> 
> Then we can turn the existing non-iov helpers to be a thin wrapper with
> just a cast from struct page to struct page_pool_iov, and a call of the
> iov helper. Again - never cast the other way around.

I agree that a cast from struct page to struct page_pool_iov is allowed,
but a cast from struct page_pool_iov to struct page is not, if I am
understanding you correctly.

Before we can also completely decouple the 'struct page' allocated directly
from the buddy allocator from the mm subsystem in the netstack, below is
what I have in mind to support different memory providers.

                                +--------------+
                                |   Netstack   |
                                |'struct page' |
                                +--------------+
                                        ^
                                        |
                                        |
                                        v
                              +---------------------+
+----------------------+      |                     |      +---------------+
|      devmem MP       |<---->|     Page pool       |----->|    **** MP    |
|'struct page_pool_iov'|      |   'struct page'     |      |'struct **_iov'|
+----------------------+      |                     |      +---------------+
                              +---------------------+
                                        ^
                                        |
                                        |
                                        v
                                +---------------+
                                |    Driver     |
                                | 'struct page' |
                                +---------------+

I would expect net stack, page pool, driver still see the 'struct page',
only memory provider see the specific struct for itself, for the above,
devmem memory provider sees the 'struct page_pool_iov'.

The reason I still expect driver to see the 'struct page' is that driver
will still need to support normal memory besides devmem.

> 
> Also I think this conversion can be done completely separately from the
> mem provider changes. Just add struct page_pool_iov and start using it.

I am not sure yet that I understand what "Just add struct page_pool_iov and
start using it" means.

> 
> Does that make more sense?
> 
> .
>
Mina Almasry Nov. 14, 2023, 12:21 p.m. UTC | #4
On Tue, Nov 14, 2023 at 12:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> +cc Christian, Jason and Willy
>
> On 2023/11/14 7:05, Jakub Kicinski wrote:
> > On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote:
> >> You're doing exactly what I think you're doing, and what was nacked in RFC v1.
> >>
> >> You've converted 'struct page_pool_iov' to essentially become a
> >> duplicate of 'struct page'. Then, you're casting page_pool_iov* into
> >> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling
> >> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled
> >> the mm stack into thinking dma-buf memory is a struct page.
>
> Yes, something like the above, but I am not sure about the 'fooled the mm
> stack into thinking dma-buf memory is a struct page' part, because:
> 1. We never let the 'struct page' for devmem leak out of the net stack,
>    thanks to the 'not kmap()able and not readable' checking in your patchset.

RFC never used dma-buf pages outside the net stack, so that is the same.

You are not able to get rid of the 'not kmap()able and not readable'
checking with this approach, because dma-buf memory is fundamentally
unkmapable and unreadable. This approach would still need
skb_frags_not_readable checks in net stack, so that is also the same.

> 2. We initialize page->_refcount for devmem to one and it remains one;
>    we will never call page_ref_inc()/page_ref_dec()/get_page()/put_page().
>    Instead, we use the page pool's pp_frag_count to do reference counting
>    for the devmem page in patch 6.
>

I'm not sure that moves the needle in terms of allowing dma-buf
memory to look like struct pages.

> >>
> >> RFC v1 was almost exactly the same, except instead of creating a
> >> duplicate definition of struct page, it just allocated 'struct page'
> >> instead of allocating another struct that is identical to struct page
> >> and casting it into struct page.
>
> Perhaps it is more accurate to say this is something between RFC v1 and
> RFC v3: it decouples the 'struct page' for devmem from the mm subsystem,
> but still has mostly unified handling for both normal memory and devmem
> in the page pool and net stack.
>
> The main difference between this patchset and RFC v1:
> 1. The mm subsystem is not supposed to see the 'struct page' for devmem
>    in this patchset, I guess we could say it is decoupled from the mm
>    subsystem even though we still call PageTail()/page_ref_count()/
>    page_is_pfmemalloc() on 'struct page' for devmem.
>

In this patchset you pretty much allocate a struct page for your
dma-buf memory, and then cast it into a struct page, so all the mm
calls in page_pool.c are seeing a struct page when it's really dma-buf
memory.

'even though we still call
PageTail()/page_ref_count()/page_is_pfmemalloc() on 'struct page' for
devmem' is basically making dma-buf memory look like struct pages.

Actually because you put the 'struct page for devmem' in
skb->bv_frag, the net stack will grab the 'struct page' for devmem
using skb_frag_page() then call things like page_address(), kmap,
get_page, put_page, etc, etc, etc.

> The main difference between this patchset and RFC v3:
> 1. It reuses the 'struct page' to have more unified handling between
>    normal page and devmem page for net stack.

This is what was nacked in RFC v1.

> 2. It relies on the page->pp_frag_count to do reference counting.
>

I don't see you change any of the page_ref_* calls in page_pool.c, for
example this one:

https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601

So the reference the page_pool is seeing is actually page->_refcount,
not page->pp_frag_count? I'm confused here. Is this a bug in the
patchset?

> >>
> >> I don't think what you're doing here reverses the nacks I got in RFC
> >> v1. You also did not CC any dma-buf or mm people on this proposal that
> >> would bring up these concerns again.
> >
> > Right, but the mirror struct has some appeal to a non-mm person like
> > myself. The problem IIUC is that this patch is the wrong way around, we
> > should be converting everyone who can deal with non-host mem to struct
> > page_pool_iov. Using page_address() on ppiov which hns3 seems to do in
> > this series does not compute for me.
>
> The hacky use of ppiov in hns3 is only there for some prototype
> testing, so ignore it.
>
> >
> > Then we can turn the existing non-iov helpers to be a thin wrapper with
> > just a cast from struct page to struct page_pool_iov, and a call of the
> > iov helper. Again - never cast the other way around.
>
> I agree that a cast from struct page to struct page_pool_iov is allowed,
> but a cast from struct page_pool_iov to struct page is not, if I am
> understanding you correctly.
>
> Before we can also completely decouple the 'struct page' allocated directly
> from the buddy allocator from the mm subsystem in the netstack, below is
> what I have in mind to support different memory providers.
>
>                                 +--------------+
>                                 |   Netstack   |
>                                 |'struct page' |
>                                 +--------------+
>                                         ^
>                                         |
>                                         |
>                                         v
>                               +---------------------+
> +----------------------+      |                     |      +---------------+
> |      devmem MP       |<---->|     Page pool       |----->|    **** MP    |
> |'struct page_pool_iov'|      |   'struct page'     |      |'struct **_iov'|
> +----------------------+      |                     |      +---------------+
>                               +---------------------+
>                                         ^
>                                         |
>                                         |
>                                         v
>                                 +---------------+
>                                 |    Driver     |
>                                 | 'struct page' |
>                                 +---------------+
>
> I would expect net stack, page pool, driver still see the 'struct page',
> only memory provider see the specific struct for itself, for the above,
> devmem memory provider sees the 'struct page_pool_iov'.
>
> The reason I still expect driver to see the 'struct page' is that driver
> will still need to support normal memory besides devmem.
>
> >
> > Also I think this conversion can be done completely separately from the
> > mem provider changes. Just add struct page_pool_iov and start using it.
>
> I am not sure yet that I understand what "Just add struct page_pool_iov and
> start using it" means.
>
> >
> > Does that make more sense?
> >
> > .
> >



--
Thanks,
Mina
Yunsheng Lin Nov. 14, 2023, 12:49 p.m. UTC | #5
On 2023/11/14 20:21, Mina Almasry wrote:
> On Tue, Nov 14, 2023 at 12:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> +cc Christian, Jason and Willy
>>
>> On 2023/11/14 7:05, Jakub Kicinski wrote:
>>> On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote:
>>>> You're doing exactly what I think you're doing, and what was nacked in RFC v1.
>>>>
>>>> You've converted 'struct page_pool_iov' to essentially become a
>>>> duplicate of 'struct page'. Then, you're casting page_pool_iov* into
>>>> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling
>>>> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled
>>>> the mm stack into thinking dma-buf memory is a struct page.
>>
>> Yes, something like the above, but I am not sure about the 'fooled the mm
>> stack into thinking dma-buf memory is a struct page' part, because:
>> 1. We never let the 'struct page' for devmem leak out of the net stack,
>>    thanks to the 'not kmap()able and not readable' checking in your patchset.
> 
> RFC never used dma-buf pages outside the net stack, so that is the same.
> 
> You are not able to get rid of the 'not kmap()able and not readable'
> checking with this approach, because dma-buf memory is fundamentally
> unkmapable and unreadable. This approach would still need
> skb_frags_not_readable checks in net stack, so that is also the same.

Yes, I agree that the checking is still needed whatever the proposal is.

> 
>> 2. We initialize page->_refcount for devmem to one and it remains one;
>>    we will never call page_ref_inc()/page_ref_dec()/get_page()/put_page().
>>    Instead, we use the page pool's pp_frag_count to do reference counting
>>    for the devmem page in patch 6.
>>
> 
> I'm not sure that moves the needle in terms of allowing dma-buf
> memory to look like struct pages.
> 
>>>>
>>>> RFC v1 was almost exactly the same, except instead of creating a
>>>> duplicate definition of struct page, it just allocated 'struct page'
>>>> instead of allocating another struct that is identical to struct page
>>>> and casting it into struct page.
>>
>> Perhaps it is more accurate to say this is something between RFC v1 and
>> RFC v3, in order to decouple 'struct page' for devmem from mm subsystem,
>> but still have most unified handling for both normal memory and devmem
>> in page pool and net stack.
>>
>> The main difference between this patchset and RFC v1:
>> 1. The mm subsystem is not supposed to see the 'struct page' for devmem
>>    in this patchset, I guess we could say it is decoupled from the mm
>>    subsystem even though we still call PageTail()/page_ref_count()/
>>    page_is_pfmemalloc() on 'struct page' for devmem.
>>
> 
> In this patchset you pretty much allocate a struct page for your
> dma-buf memory, and then cast it into a struct page, so all the mm
> calls in page_pool.c are seeing a struct page when it's really dma-buf
> memory.
> 
> 'even though we still call
> PageTail()/page_ref_count()/page_is_pfmemalloc() on 'struct page' for
> devmem' is basically making dma-buf memory look like struct pages.
> 
> Actually because you put the 'struct page for devmem' in
> skb->bv_frag, the net stack will grab the 'struct page' for devmem
> using skb_frag_page() then call things like page_address(), kmap,
> get_page, put_page, etc, etc, etc.

Yes, as above, skb_frags_not_readable() checking is still needed for
kmap() and page_address().

get_page()/put_page() related calls are avoided in page_pool_frag_ref()
and napi_pp_put_page() for a devmem page, as the following checking is
true for a devmem page:
(pp_iov->pp_magic & ~0x3UL) == PP_SIGNATURE
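
For context, a condensed sketch of the check being relied on here, simplified
from napi_pp_put_page() (compound-page handling elided):

    /* pp_magic matching PP_SIGNATURE means the buffer belongs to a
     * page_pool and is returned there instead of via put_page(); a
     * devmem ppiov mirrors page->pp_magic, so it takes the recycle
     * path too and never reaches put_page().
     */
    if ((page->pp_magic & ~0x3UL) != PP_SIGNATURE)
            return false;   /* caller falls back to put_page() */

    page_pool_put_full_page(page->pp, page, napi_safe);
    return true;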

> 
>> The main difference between this patchset and RFC v3:
>> 1. It reuses the 'struct page' to have more unified handling between
>>    normal page and devmem page for net stack.
> 
> This is what was nacked in RFC v1.
> 
>> 2. It relies on the page->pp_frag_count to do reference counting.
>>
> 
> I don't see you change any of the page_ref_* calls in page_pool.c, for
> example this one:
> 
> https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601
> 
> So the reference the page_pool is seeing is actually page->_refcount,
> not page->pp_frag_count? I'm confused here. Is this a bug in the
> patchset?

page->_refcount is the same as page_pool_iov->_refcount for devmem, which
is ensured by the 'PAGE_POOL_MATCH(_refcount, _refcount);', and
page_pool_iov->_refcount is set to one in mp_dmabuf_devmem_alloc_pages()
by calling 'refcount_set(&ppiov->_refcount, 1)' and always remains as one.

So the 'page_ref_count(page) == 1' checking is always true for devmem page.
Mina Almasry Nov. 14, 2023, 12:58 p.m. UTC | #6
On Tue, Nov 14, 2023 at 4:49 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/11/14 20:21, Mina Almasry wrote:
> > On Tue, Nov 14, 2023 at 12:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> +cc Christian, Jason and Willy
> >>
> >> On 2023/11/14 7:05, Jakub Kicinski wrote:
> >>> On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote:
> >>>> You're doing exactly what I think you're doing, and what was nacked in RFC v1.
> >>>>
> >>>> You've converted 'struct page_pool_iov' to essentially become a
> >>>> duplicate of 'struct page'. Then, you're casting page_pool_iov* into
> >>>> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling
> >>>> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled
> >>>> the mm stack into thinking dma-buf memory is a struct page.
> >>
> >> Yes, something like the above, but I am not sure about the 'fooled the mm
> >> stack into thinking dma-buf memory is a struct page' part, because:
> >> 1. We never let the 'struct page' for devmem leak out of the net stack,
> >>    thanks to the 'not kmap()able and not readable' checking in your patchset.
> >
> > RFC never used dma-buf pages outside the net stack, so that is the same.
> >
> > You are not able to get rid of the 'not kmap()able and not readable'
> > checking with this approach, because dma-buf memory is fundamentally
> > unkmapable and unreadable. This approach would still need
> > skb_frags_not_readable checks in net stack, so that is also the same.
>
> Yes, I agree that the checking is still needed whatever the proposal is.
>
> >
> >> 2. We initialize page->_refcount for devmem to one and it remains one;
> >>    we will never call page_ref_inc()/page_ref_dec()/get_page()/put_page().
> >>    Instead, we use the page pool's pp_frag_count to do reference counting
> >>    for the devmem page in patch 6.
> >>
> >
> > I'm not sure that moves the needle in terms of allowing dma-buf
> > memory to look like struct pages.
> >
> >>>>
> >>>> RFC v1 was almost exactly the same, except instead of creating a
> >>>> duplicate definition of struct page, it just allocated 'struct page'
> >>>> instead of allocating another struct that is identical to struct page
> >>>> and casting it into struct page.
> >>
> >> Perhaps it is more accurate to say this is something between RFC v1 and
> >> RFC v3: it decouples the 'struct page' for devmem from the mm subsystem,
> >> but still has mostly unified handling for both normal memory and devmem
> >> in the page pool and net stack.
> >>
> >> The main difference between this patchset and RFC v1:
> >> 1. The mm subsystem is not supposed to see the 'struct page' for devmem
> >>    in this patchset, I guess we could say it is decoupled from the mm
> >>    subsystem even though we still call PageTail()/page_ref_count()/
> >>    page_is_pfmemalloc() on 'struct page' for devmem.
> >>
> >
> > In this patchset you pretty much allocate a struct page for your
> > dma-buf memory, and then cast it into a struct page, so all the mm
> > calls in page_pool.c are seeing a struct page when it's really dma-buf
> > memory.
> >
> > 'even though we still call
> > PageTail()/page_ref_count()/page_is_pfmemalloc() on 'struct page' for
> > devmem' is basically making dma-buf memory look like struct pages.
> >
> > Actually because you put the 'struct page for devmem' in
> > skb->bv_frag, the net stack will grab the 'struct page' for devmem
> > using skb_frag_page() then call things like page_address(), kmap,
> > get_page, put_page, etc, etc, etc.
>
> Yes, as above, skb_frags_not_readable() checking is still needed for
> kmap() and page_address().
>
> get_page()/put_page() related calls are avoided in page_pool_frag_ref()
> and napi_pp_put_page() for a devmem page, as the following checking is
> true for a devmem page:
> (pp_iov->pp_magic & ~0x3UL) == PP_SIGNATURE
>

So, devmem needs special handling with an if statement for refcounting,
even after using struct pages for devmem, which is not allowed (IIUC,
per the dma-buf maintainer).

> >
> >> The main difference between this patchset and RFC v3:
> >> 1. It reuses the 'struct page' to have more unified handling between
> >>    normal page and devmem page for net stack.
> >
> > This is what was nacked in RFC v1.
> >
> >> 2. It relies on the page->pp_frag_count to do reference counting.
> >>
> >
> > I don't see you change any of the page_ref_* calls in page_pool.c, for
> > example this one:
> >
> > https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601
> >
> > So the reference the page_pool is seeing is actually page->_refcount,
> > not page->pp_frag_count? I'm confused here. Is this a bug in the
> > patchset?
>
> page->_refcount is the same as page_pool_iov->_refcount for devmem, which
> is ensured by the 'PAGE_POOL_MATCH(_refcount, _refcount);', and
> page_pool_iov->_refcount is set to one in mp_dmabuf_devmem_alloc_pages()
> by calling 'refcount_set(&ppiov->_refcount, 1)' and always remains as one.
>
> So the 'page_ref_count(page) == 1' checking is always true for devmem page.

Which, of course, is a bug in the patchset, and it only works because
it's a POC for you. devmem pages (which shouldn't exist according to
the dma-buf maintainer, IIUC) can't be recycled all the time. See
SO_DEVMEM_DONTNEED patch in my RFC and refcounting needed for devmem.
Jason Gunthorpe Nov. 14, 2023, 1:16 p.m. UTC | #7
On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote:

> Actually because you put the 'struct page for devmem' in
> skb->bv_frag, the net stack will grab the 'struct page' for devmem
> using skb_frag_page() then call things like page_address(), kmap,
> get_page, put_page, etc, etc, etc.

Yikes, please no. If net has its own struct page look alike it has to
stay entirely inside net. A non-mm owned struct page should not be
passed into mm calls. It is just way too hacky to be seriously
considered :(

> > I would expect net stack, page pool, driver still see the 'struct page',
> > only memory provider see the specific struct for itself, for the above,
> > devmem memory provider sees the 'struct page_pool_iov'.
> >
> > The reason I still expect driver to see the 'struct page' is that driver
> > will still need to support normal memory besides devmem.

I wouldn't say this approach is unreasonable, but it does have to be
done carefully to isolate the mm. Keeping the struct page in the API
is going to make this very hard.

Jason
Yunsheng Lin Nov. 14, 2023, 1:19 p.m. UTC | #8
On 2023/11/14 20:58, Mina Almasry wrote:

>>
>> Yes, as above, skb_frags_not_readable() checking is still needed for
>> kmap() and page_address().
>>
>> get_page()/put_page() related calls are avoided in page_pool_frag_ref()
>> and napi_pp_put_page() for a devmem page, as the following checking is
>> true for a devmem page:
>> (pp_iov->pp_magic & ~0x3UL) == PP_SIGNATURE
>>
> 
> So, devmem needs special handling with an if statement for refcounting,
> even after using struct pages for devmem, which is not allowed (IIUC,
> per the dma-buf maintainer).

It reuses the already existing checking and optimizations; that is the point
of the 'mirror struct'.

> 
>>>
>>>> The main difference between this patchset and RFC v3:
>>>> 1. It reuses the 'struct page' to have more unified handling between
>>>>    normal page and devmem page for net stack.
>>>
>>> This is what was nacked in RFC v1.
>>>
>>>> 2. It relies on the page->pp_frag_count to do reference counting.
>>>>
>>>
>>> I don't see you change any of the page_ref_* calls in page_pool.c, for
>>> example this one:
>>>
>>> https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601
>>>
>>> So the reference the page_pool is seeing is actually page->_refcount,
>>> not page->pp_frag_count? I'm confused here. Is this a bug in the
>>> patchset?
>>
>> page->_refcount is the same as page_pool_iov->_refcount for devmem, which
>> is ensured by the 'PAGE_POOL_MATCH(_refcount, _refcount);', and
>> page_pool_iov->_refcount is set to one in mp_dmabuf_devmem_alloc_pages()
>> by calling 'refcount_set(&ppiov->_refcount, 1)' and always remains as one.
>>
>> So the 'page_ref_count(page) == 1' checking is always true for devmem page.
> 
> Which, of course, is a bug in the patchset, and it only works because
> it's a POC for you. devmem pages (which shouldn't exist according to
> the dma-buf maintainer, IIUC) can't be recycled all the time. See
> SO_DEVMEM_DONTNEED patch in my RFC and refcounting needed for devmem.

I am not sure the dma-buf maintainer's concern is still there with this patchset.

Whatever name you call the struct, and however you arrange each field
in the struct, some metadata is always needed for dmabuf to integrate into
the page pool.

If the above is true, why not utilize the 'struct page' to have more unified
handling?

>
Willem de Bruijn Nov. 14, 2023, 3:41 p.m. UTC | #9
Yunsheng Lin wrote:
> On 2023/11/14 20:58, Mina Almasry wrote:
> 
> >>
> >> Yes, as above, skb_frags_not_readable() checking is still needed for
> >> kmap() and page_address().
> >>
> >> get_page()/put_page() related calls are avoided in page_pool_frag_ref()
> >> and napi_pp_put_page() for a devmem page, as the following checking is
> >> true for a devmem page:
> >> (pp_iov->pp_magic & ~0x3UL) == PP_SIGNATURE
> >>
> > 
> > So, devmem needs special handling with an if statement for refcounting,
> > even after using struct pages for devmem, which is not allowed (IIUC,
> > per the dma-buf maintainer).
> 
> It reuses the already existing checking and optimizations; that is the point
> of the 'mirror struct'.
> 
> > 
> >>>
> >>>> The main difference between this patchset and RFC v3:
> >>>> 1. It reuses the 'struct page' to have more unified handling between
> >>>>    normal page and devmem page for net stack.
> >>>
> >>> This is what was nacked in RFC v1.
> >>>
> >>>> 2. It relies on the page->pp_frag_count to do reference counting.
> >>>>
> >>>
> >>> I don't see you change any of the page_ref_* calls in page_pool.c, for
> >>> example this one:
> >>>
> >>> https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601
> >>>
> >>> So the reference the page_pool is seeing is actually page->_refcount,
> >>> not page->pp_frag_count? I'm confused here. Is this a bug in the
> >>> patchset?
> >>
> >> page->_refcount is the same as page_pool_iov->_refcount for devmem, which
> >> is ensured by the 'PAGE_POOL_MATCH(_refcount, _refcount);', and
> >> page_pool_iov->_refcount is set to one in mp_dmabuf_devmem_alloc_pages()
> >> by calling 'refcount_set(&ppiov->_refcount, 1)' and always remains as one.
> >>
> >> So the 'page_ref_count(page) == 1' checking is always true for devmem page.
> > 
> > Which, of course, is a bug in the patchset, and it only works because
> > it's a POC for you. devmem pages (which shouldn't exist according to
> > the dma-buf maintainer, IIUC) can't be recycled all the time. See
> > SO_DEVMEM_DONTNEED patch in my RFC and refcounting needed for devmem.
> 
> I am not sure the dma-buf maintainer's concern is still there with this patchset.
> 
> Whatever name you call the struct, and however you arrange each field
> in the struct, some metadata is always needed for dmabuf to integrate into
> the page pool.
> 
> If the above is true, why not utilize the 'struct page' to have more unified
> handling?

My understanding is that there is a general preference to simplify struct
page, and at the least not move in the other direction by overloading the
struct in new ways.

If using struct page for something that is not memory, there is ZONE_DEVICE.
But using that correctly is non-trivial:

https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/ 

Since all we need is a handle that does not leave the network stack,
a network specific struct like page_pool_iov entirely avoids this issue.
RFC v3 seems like a good simplification over RFC v1 in that regard to me.
I was also pleasantly surprised how minimal the change to the users of
skb_frag_t actually proved to be.
Jakub Kicinski Nov. 14, 2023, 10:25 p.m. UTC | #10
On Tue, 14 Nov 2023 16:23:29 +0800 Yunsheng Lin wrote:
> I would expect net stack, page pool, driver still see the 'struct page',
> only memory provider see the specific struct for itself, for the above,
> devmem memory provider sees the 'struct page_pool_iov'.

You can't lie to the driver that an _iov is a page either.
The driver must explicitly "opt-in" to using the _iov variant,
by calling the _iov set of APIs.

Only drivers which can support header-data split can reasonably
use the _iov API, for data pages.
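
Presumably that opt-in would be a parallel set of entry points along these
lines (all names hypothetical):

    /* A converted driver would call the _iov variants for its data
     * ring, while keeping the page-based API for header buffers.
     */
    struct page_pool_iov *page_pool_alloc_iov(struct page_pool *pool, gfp_t gfp);
    dma_addr_t page_pool_iov_get_dma_addr(struct page_pool_iov *ppiov);
    void page_pool_put_iov(struct page_pool *pool, struct page_pool_iov *ppiov,
                           bool allow_direct);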
Christian König Nov. 15, 2023, 6:46 a.m. UTC | #11
Am 14.11.23 um 14:16 schrieb Jason Gunthorpe:
> A non-mm owned struct page should not be
> passed into mm calls. It is just way too hacky to be seriously
> considered :(

Can we please print this sentence on T-Shirts? Or framed on the wall of 
meeting rooms?

I don't know how often I had to repeat that to people.

Regards,
Christian.
Yunsheng Lin Nov. 15, 2023, 9:21 a.m. UTC | #12
On 2023/11/14 21:16, Jason Gunthorpe wrote:
> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote:
> 
>> Actually because you put the 'struct page for devmem' in
>> skb->bv_frag, the net stack will grab the 'struct page' for devmem
>> using skb_frag_page() then call things like page_address(), kmap,
>> get_page, put_page, etc, etc, etc.
> 
> Yikes, please no. If net has its own struct page look alike it has to
> stay entirely inside net. A non-mm owned struct page should not be
> passed into mm calls. It is just way too hacky to be seriously
> considered :(

Yes, that is something this patchset is trying to do: defining its own
struct page look-alike for the page pool to support devmem.

The 'struct page' for devmem will not be passed into the mm subsystem,
so most of the mm calls are avoided by calling into the devmem memory
provider's ops instead.

As far as I can see for now, only page_ref_count(), page_is_pfmemalloc()
and PageTail() are called for a devmem page, and it should be easy to
ensure that those calls for a devmem page are consistent with a struct
page owned by mm. I am not sure if we can use some kind of compile/runtime
checking to ensure that consistency?

> 
>>> I would expect net stack, page pool, driver still see the 'struct page',
>>> only memory provider see the specific struct for itself, for the above,
>>> devmem memory provider sees the 'struct page_pool_iov'.
>>>
>>> The reason I still expect driver to see the 'struct page' is that driver
>>> will still need to support normal memory besides devmem.
> 
> I wouldn't say this approach is unreasonable, but it does have to be
> done carefully to isolate the mm. Keeping the struct page in the API
> is going to make this very hard.

I would expect that most of the isolation is done in page pool, as far as
I can see:

1. For control part: the driver may need to tell the page pool which memory
                     provider it wants to use. Or the administrator specifies
                     which memory provider to use by some netlink-based cmd.

2. For data part: I am thinking that the driver should only call page_pool_alloc(),
                  page_pool_free() and the page_pool_get_dma_addr() related functions.

Of course the driver may need to be aware of whether it can call kmap() or
page_address() on the page returned from page_pool_alloc(), and maybe tell
the net stack that those pages are not kmap()'able or page_address()'able.
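
A sketch of the data path that implies, with 'struct rx_ring' and
'ring_post()' standing in for driver-specific descriptor handling:

    /* The driver sticks to generic page_pool calls and never touches
     * the page contents or any mm API on the returned page.
     */
    static int rx_refill_one(struct page_pool *pool, struct rx_ring *ring)
    {
            struct page *page = page_pool_dev_alloc_pages(pool);

            if (!page)
                    return -ENOMEM;

            /* post the DMA address to the hardware RX ring */
            ring_post(ring, page_pool_get_dma_addr(page), PAGE_SIZE);
            return 0;
    }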

> 
> Jason
> .
>
Yunsheng Lin Nov. 15, 2023, 9:29 a.m. UTC | #13
On 2023/11/14 23:41, Willem de Bruijn wrote:
>>
>> I am not sure the dma-buf maintainer's concern is still there with this patchset.
>>
>> Whatever name you call the struct, and however you arrange each field
>> in the struct, some metadata is always needed for dmabuf to integrate into
>> the page pool.
>>
>> If the above is true, why not utilize the 'struct page' to have more unified
>> handling?
> 
> My understanding is that there is a general preference to simplify struct
> page, and at the least not move in the other direction by overloading the
> struct in new ways.

As I understand it, the new struct is just mirroring the 'struct page' that
the page pool is already using, see:
https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119

If the struct that page_pool is using gets simplified, I think the new
struct the devmem memory provider is using can adjust accordingly.

As a matter of fact, I think the way 'struct page' for devmem is decoupled
from the mm subsystem may provide a way to simplify or decouple the already
existing 'struct page' used in the netstack from the mm subsystem. Before this
patchset, it seems we have the below types of 'struct page':
1. page allocated in the netstack using page pool.
2. page allocated in the netstack using buddy allocator.
3. page allocated in other subsystem and passed to the netstack, such as
   zcopy or spliced page?

If we can decouple 'struct page' for devmem from mm subsystem, we may be able
to decouple the above 'struct page' from mm subsystem one by one.

> 
> If using struct page for something that is not memory, there is ZONE_DEVICE.
> But using that correctly is non-trivial:
> 
> https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/ 
> 
> Since all we need is a handle that does not leave the network stack,
> a network specific struct like page_pool_iov entirely avoids this issue.

Yes, I agree about the network-specific struct.
I am wondering if we can make the struct more generic if we want to
integrate it into page_pool and use it in the net stack.

> RFC v3 seems like a good simplification over RFC v1 in that regard to me.
> I was also pleasantly surprised how minimal the change to the users of
> skb_frag_t actually proved to be.

Yes, I agree about that too. Maybe we can make it simpler by having
page_pool use a more abstract struct, and by utilizing some features of
page_pool too.

For example, from the page_pool doc, page_pool has a fast cache and a
ptr-ring cache as below, but if napi_frag_unref() calls
page_pool_page_put_many() and returns the dmabuf chunk directly to the
gen_pool in the memory provider, then it seems we are bypassing the
below caches in the page_pool.

    +------------------+
    |       Driver     |
    +------------------+
            ^
            |
            |
            |
            v
    +--------------------------------------------+
    |                request memory              |
    +--------------------------------------------+
        ^                                  ^
        |                                  |
        | Pool empty                       | Pool has entries
        |                                  |
        v                                  v
    +-----------------------+     +------------------------+
    | alloc (and map) pages |     |  get page from cache   |
    +-----------------------+     +------------------------+
                                    ^                    ^
                                    |                    |
                                    | cache available    | No entries, refill
                                    |                    | from ptr-ring
                                    |                    |
                                    v                    v
                          +-----------------+     +------------------+
                          |   Fast cache    |     |  ptr-ring cache  |
                          +-----------------+     +------------------+


> 
> .
>
Yunsheng Lin Nov. 15, 2023, 9:33 a.m. UTC | #14
On 2023/11/15 6:25, Jakub Kicinski wrote:
> On Tue, 14 Nov 2023 16:23:29 +0800 Yunsheng Lin wrote:
>> I would expect net stack, page pool, driver still see the 'struct page',
>> only memory provider see the specific struct for itself, for the above,
>> devmem memory provider sees the 'struct page_pool_iov'.
> 
> You can't lie to the driver that an _iov is a page either.

Yes, agreed about that.

As a matter of fact, the driver should be aware of what kind of
memory provider it is using when it calls page_pool_create() during
the init process.

> The driver must explicitly "opt-in" to using the _iov variant,
> by calling the _iov set of APIs.
> 
> Only drivers which can support header-data split can reasonably
> use the _iov API, for data pages.

But those drivers can still allocate normal memory, right? Sometimes for
both the data and header parts, and sometimes only for the header part.

Do those drivers need to support two sets of APIs: the one with _iov
for devmem, and the one without _iov for normal memory? It seems somewhat
unnecessary from the driver's point of view to support two sets of APIs.
The driver seems to know which type of page it is expecting when calling
page_pool_alloc() with a specific page_pool instance.

Or do we use the API with _iov to allocate both devmem and normal memory
in a new driver supporting devmem pages? If that is the case, does it
really matter whether the API is with _iov or not?

> .
>
Jason Gunthorpe Nov. 15, 2023, 1:38 p.m. UTC | #15
On Wed, Nov 15, 2023 at 05:21:02PM +0800, Yunsheng Lin wrote:

> >>> I would expect net stack, page pool, driver still see the 'struct page',
> >>> only memory provider see the specific struct for itself, for the above,
> >>> devmem memory provider sees the 'struct page_pool_iov'.
> >>>
> >>> The reason I still expect driver to see the 'struct page' is that driver
> >>> will still need to support normal memory besides devmem.
> > 
> > I wouldn't say this approach is unreasonable, but it does have to be
> > done carefully to isolate the mm. Keeping the struct page in the API
> > is going to make this very hard.
> 
> I would expect that most of the isolation is done in page pool, as far as
> I can see:

It is the sort of thing that is important enough it should have
compiler help via types to prove that it is being done
properly. Otherwise it will be full of mistakes over time.
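
One illustrative way to get that compiler help, not part of this series: make
the handle a distinct wrapper type, so passing it to an mm API that expects
'struct page *' fails to compile, and the unsafe conversion has exactly one
audited entry point:

    typedef struct netmem { unsigned long val; } netmem_t; /* hypothetical */

    static inline netmem_t page_to_netmem(struct page *page)
    {
            return (netmem_t){ .val = (unsigned long)page };
    }

    static inline struct page *netmem_to_page(netmem_t nm)
    {
            /* only legal where the handle is known to wrap a real page */
            return (struct page *)nm.val;
    }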

Jason
Mina Almasry Nov. 15, 2023, 5:44 p.m. UTC | #16
On Wed, Nov 15, 2023 at 1:21 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/11/14 21:16, Jason Gunthorpe wrote:
> > On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote:
> >
> >> Actually because you put the 'struct page for devmem' in
> >> skb->bv_frag, the net stack will grab the 'struct page' for devmem
> >> using skb_frag_page() then call things like page_address(), kmap,
> >> get_page, put_page, etc, etc, etc.
> >
> > Yikes, please no. If net has its own struct page look alike it has to
> > stay entirely inside net. A non-mm owned struct page should not be
> > passed into mm calls. It is just way too hacky to be seriously
> > considered :(
>
> Yes, that is something this patchset is trying to do: defining its own
> struct page look-alike for the page pool to support devmem.
>
> The 'struct page' for devmem will not be passed into the mm subsystem,
> so most of the mm calls are avoided by calling into the devmem memory
> provider's ops instead.
>
> As far as I can see for now, only page_ref_count(), page_is_pfmemalloc()
> and PageTail() are called for a devmem page, and it should be easy to
> ensure that those calls for a devmem page are consistent with a struct
> page owned by mm.

I'm not sure this is true. These 3 calls are just the calls you're
aware of. In your proposal you're casting mirror pages into page* and
releasing them into the net stack. You need to scrub the entire net
stack for mm calls, i.e. all driver code and all skb_frag_page() call
sites. Off the top of my head, the driver is probably calling
page_address() and illegal_highdma() is calling PageHighMem(). TCP
zerocopy receive is calling vm_insert_pages().

> I am not sure if we can use some kind of compile/runtime checking to
> ensure that consistency?
>
> >
> >>> I would expect net stack, page pool, driver still see the 'struct page',
> >>> only memory provider see the specific struct for itself, for the above,
> >>> devmem memory provider sees the 'struct page_pool_iov'.
> >>>
> >>> The reason I still expect driver to see the 'struct page' is that driver
> >>> will still need to support normal memory besides devmem.
> >
> > I wouldn't say this approach is unreasonable, but it does have to be
> > done carefully to isolate the mm. Keeping the struct page in the API
> > is going to make this very hard.
>
> I would expect that most of the isolation is done in page pool, as far as
> I can see:
>
> 1. For control part: the driver may need to tell the page pool which memory
>                      provider it wants to use. Or the administrator specifies
>                      which memory provider to use by some netlink-based cmd.
>
> 2. For data part: I am thinking that the driver should only call page_pool_alloc(),
>                   page_pool_free() and the page_pool_get_dma_addr() related functions.
>
> Of course the driver may need to be aware of whether it can call kmap() or
> page_address() on the page returned from page_pool_alloc(), and maybe tell
> the net stack that those pages are not kmap()'able or page_address()'able.
>
> >
> > Jason
> > .
> >
David Ahern Nov. 15, 2023, 5:57 p.m. UTC | #17
On 11/15/23 2:21 AM, Yunsheng Lin wrote:
> On 2023/11/14 21:16, Jason Gunthorpe wrote:
>> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote:
>>
>>> Actually because you put the 'struct page for devmem' in
>>> skb->bv_frag, the net stack will grab the 'struct page' for devmem
>>> using skb_frag_page() then call things like page_address(), kmap,
>>> get_page, put_page, etc, etc, etc.
>>
>> Yikes, please no. If net has its own struct page look alike it has to
>> stay entirely inside net. A non-mm owned struct page should not be
>> passed into mm calls. It is just way too hacky to be seriously
>> considered :(
> 
> Yes, that is something this patchset is trying to do: defining its own
> struct page look-alike for the page pool to support devmem.
> 

Networking needs to be able to move away from struct page references.
The devmem and host memory for Rx use cases do not need to be page based.
Mina Almasry Nov. 15, 2023, 6:07 p.m. UTC | #18
On Wed, Nov 15, 2023 at 1:29 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/11/14 23:41, Willem de Bruijn wrote:
> >>
> >> I am not sure the dma-buf maintainer's concern is still there with this patchset.
> >>
> >> Whatever name you call the struct, and however you arrange each field
> >> in the struct, some metadata is always needed for dmabuf to integrate into
> >> the page pool.
> >>
> >> If the above is true, why not utilize the 'struct page' to have more unified
> >> handling?
> >
> > My understanding is that there is a general preference to simplify struct
> > page, and at the least not move in the other direction by overloading the
> > struct in new ways.
>
> > As I understand it, the new struct is just mirroring the 'struct page' that
> > the page pool is already using, see:
> > https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119
> >
> > If the struct that page_pool is using gets simplified, I think the new
> > struct the devmem memory provider is using can adjust accordingly.
>
> > As a matter of fact, I think the way 'struct page' for devmem is decoupled
> > from the mm subsystem may provide a way to simplify or decouple the already
> > existing 'struct page' used in the netstack from the mm subsystem. Before this
> > patchset, it seems we have the below types of 'struct page':
> 1. page allocated in the netstack using page pool.
> 2. page allocated in the netstack using buddy allocator.
> 3. page allocated in other subsystem and passed to the netstack, such as
>    zcopy or spliced page?
>
> If we can decouple 'struct page' for devmem from mm subsystem, we may be able
> to decouple the above 'struct page' from mm subsystem one by one.
>
> >
> > If using struct page for something that is not memory, there is ZONE_DEVICE.
> > But using that correctly is non-trivial:
> >
> > https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/
> >
> > Since all we need is a handle that does not leave the network stack,
> > a network specific struct like page_pool_iov entirely avoids this issue.
>
> Yes, I agree about the network-specific struct.
> I am wondering if we can make the struct more generic if we want to
> integrate it into page_pool and use it in the net stack.
>
> > RFC v3 seems like a good simplification over RFC v1 in that regard to me.
> > I was also pleasantly surprised how minimal the change to the users of
> > skb_frag_t actually proved to be.
>
> Yes, I agree about that too. Maybe we can make it simpler by having
> page_pool use a more abstract struct, and by utilizing some features of
> page_pool too.
>
> For example, from the page_pool doc, page_pool has a fast cache and a
> ptr-ring cache as below, but if napi_frag_unref() calls
> page_pool_page_put_many() and returns the dmabuf chunk directly to the
> gen_pool in the memory provider, then it seems we are bypassing the
> below caches in the page_pool.
>

I think you're just misunderstanding the code. The page recycling
works with my patchset. napi_frag_unref() calls napi_pp_put_page() if
recycle == true, and that works the same with devmem as with regular
pages.

If recycle == false, we call page_pool_page_put_many() which will call
put_page() for regular pages and page_pool_iov_put_many() for devmem
pages. So, the memory recycling works exactly the same as before with
devmem as with regular pages. In my tests I do see the devmem being
recycled correctly. We are not bypassing any caches.
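
For illustration, a condensed sketch of the two paths just described -- the
napi_safe argument and the pp-recycle bookkeeping are elided, and
page_pool_page_put_many() is the helper from this series rather than an
upstream function:

/* Condensed sketch of the two unref paths described above. */
static void napi_frag_unref(skb_frag_t *frag, bool recycle)
{
	struct page *page = skb_frag_page(frag);

	/* recycle == true: devmem and regular pages both flow back
	 * through the page_pool recycling path.
	 */
	if (recycle && napi_pp_put_page(page, false))
		return;

	/* recycle == false: put_page() for regular pages,
	 * page_pool_iov_put_many() for devmem, behind one helper.
	 */
	page_pool_page_put_many(page, 1);
}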


>     +------------------+
>     |       Driver     |
>     +------------------+
>             ^
>             |
>             |
>             |
>             v
>     +--------------------------------------------+
>     |                request memory              |
>     +--------------------------------------------+
>         ^                                  ^
>         |                                  |
>         | Pool empty                       | Pool has entries
>         |                                  |
>         v                                  v
>     +-----------------------+     +------------------------+
>     | alloc (and map) pages |     |  get page from cache   |
>     +-----------------------+     +------------------------+
>                                     ^                    ^
>                                     |                    |
>                                     | cache available    | No entries, refill
>                                     |                    | from ptr-ring
>                                     |                    |
>                                     v                    v
>                           +-----------------+     +------------------+
>                           |   Fast cache    |     |  ptr-ring cache  |
>                           +-----------------+     +------------------+
>
>
> >
> > .
> >
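
As a reference for the cache question above, a simplified sketch of where a
memory provider would hook into the allocation path, assuming the mp_ops hook
placement from this series (the fast cache and ptr-ring are consulted first;
only the slow path reaches the provider):

/* Simplified from net/core/page_pool.c: the provider is only consulted
 * once both the fast cache and the ptr-ring have missed.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	page = __page_pool_get_cached(pool);	/* fast cache + ptr-ring */
	if (page)
		return page;

	if (pool->mp_ops)			/* e.g. dmabuf_devmem_ops */
		page = pool->mp_ops->alloc_pages(pool, gfp);
	else
		page = __page_pool_alloc_pages_slow(pool, gfp);

	return page;
}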
Mina Almasry Nov. 15, 2023, 7:05 p.m. UTC | #19
On Wed, Nov 15, 2023 at 10:07 AM Mina Almasry <almasrymina@google.com> wrote:
>
> On Wed, Nov 15, 2023 at 1:29 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> > [snip]
>
> I think you're just misunderstanding the code. The page recycling
> works with my patchset. napi_frag_unref() calls napi_pp_put_page() if
> recycle == true, and that works the same with devmem as with regular
> pages.
>
> If recycle == false, we call page_pool_page_put_many() which will call
> put_page() for regular pages and page_pool_iov_put_many() for devmem
> pages. So, the memory recycling works exactly the same as before with
> devmem as with regular pages. In my tests I do see the devmem being
> recycled correctly. We are not bypassing any caches.
>
>

Ah, taking a closer look here, the devmem recycling works for me but I
think that's a side effect of the fact that the page_pool support I
implemented with GVE is unusual. I currently allocate pages from the
page_pool but do not set skb_mark_for_recycle(). The page recycling
still happens when GVE is done with the page and calls
page_pool_put_full_page(), as that eventually checks the refcount on
the devmem and recycles it.

I will fix up the GVE to call skb_mark_for_recycle() and ensure the
napi_pp_put_page() path recycles the devmem or page correctly in the
next version.
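
For context, a sketch of the driver-side fix-up described here -- the GVE
function name is a placeholder, not actual driver code:

/* Placeholder Rx completion: marking the skb lets the core's
 * napi_pp_put_page() path recycle its page_pool-backed frags.
 */
static void gve_rx_complete_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	/* Without this mark, frags take the plain put path and are only
	 * recycled when the driver calls page_pool_put_full_page() itself.
	 */
	skb_mark_for_recycle(skb);
	napi_gro_receive(napi, skb);
}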

Yunsheng Lin Nov. 16, 2023, 11:10 a.m. UTC | #20
On 2023/11/15 21:38, Jason Gunthorpe wrote:
> On Wed, Nov 15, 2023 at 05:21:02PM +0800, Yunsheng Lin wrote:
> 
>>>>> I would expect the net stack, page pool, and driver to still see the
>>>>> 'struct page', and only the memory provider to see the specific struct
>>>>> for itself; for the above, the devmem memory provider sees the
>>>>> 'struct page_pool_iov'.
>>>>>
>>>>> The reason I still expect the driver to see the 'struct page' is that the
>>>>> driver will still need to support normal memory besides devmem.
>>>
>>> I wouldn't say this approach is unreasonable, but it does have to be
>>> done carefully to isolate the mm. Keeping the struct page in the API
>>> is going to make this very hard.
>>
>> I would expect that most of the isolation is done in page pool, as far as
>> I can see:
> 
> It is the sort of thing that is important enough it should have
> compiler help via types to prove that it is being done
> properly. Otherwise it will be full of mistakes over time.

Yes, agreed.

I have done something similar to what willy has done for some of
the folio conversions, as below:

+#define PAGE_POOL_MATCH(pg, iov)				\
+	static_assert(offsetof(struct page, pg) ==		\
+		      offsetof(struct page_pool_iov, iov))
+PAGE_POOL_MATCH(flags, res0);
+PAGE_POOL_MATCH(pp_magic, pp_magic);
+PAGE_POOL_MATCH(pp, pp);
...

Not sure yet if we need to add a new API for the driver to use when
the driver needs devmem support.


> 
> Jason
> .
>
Yunsheng Lin Nov. 16, 2023, 11:11 a.m. UTC | #21
On 2023/11/16 1:44, Mina Almasry wrote:
> On Wed, Nov 15, 2023 at 1:21 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2023/11/14 21:16, Jason Gunthorpe wrote:
>>> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote:
>>>
>>>> Actually because you put the 'struct page for devmem' in
>>>> skb->bv_frag, the net stack will grab the 'struct page' for devmem
>>>> using skb_frag_page() then call things like page_address(), kmap,
>>>> get_page, put_page, etc, etc, etc.
>>>
>>> Yikes, please no. If net has its own struct page look alike it has to
>>> stay entirely inside net. A non-mm owned struct page should not be
>>> passed into mm calls. It is just way too hacky to be seriously
>>> considered :(
>>
>> Yes, that is something this patchset is trying to do, defining its own
>> struct page look alike for page pool to support devmem.
>>
>> The struct page for devmem will not be passed into the mm subsystem, so most
>> of the mm calls are avoided by calling into the devmem memory provider's
>> ops instead.
>>
>> As far as I see for now, only page_ref_count(), page_is_pfmemalloc() and
>> PageTail() are called for a devmem page, and it should be easy to ensure that
>> those calls for a devmem page are consistent with a struct page owned by mm.
> 
> I'm not sure this is true. These 3 calls are just the calls you're
> aware of. In your proposal you're casting mirror pages into page* and
> releasing them into the net stack. You need to scrub the entire net
> stack for mm calls, i.e. all driver code and all skb_frag_page() call
> sites. Off the top of my head, the driver is probably calling
> page_address() and illegal_highdma() is calling PageHighMem(). TCP
> zerocopy receive is calling vm_insert_pages().

For the net stack part, I believe your patch below handles avoiding those
mm calls? I did not include it in this patchset as I thought it was obvious
that, whatever the proposal is, we always need those checks.
Maybe we should have included it to avoid this kind of confusion.
https://lore.kernel.org/all/20231106024413.2801438-10-almasrymina@google.com/

For the driver part, I was thinking that if the driver supports devmem, it
should check whether it can make a page_address() related call on a specific
'struct page', or maybe we should introduce a new helper to make it obvious?
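
For illustration, a minimal sketch of the kind of helper being suggested --
the name and the pp_magic/mp_ops checks are assumptions based on this
discussion, not code from the series:

/* Hypothetical helper: a driver would call this before page_address(),
 * kmap(), etc., to make sure the 'struct page' is mm-owned memory and
 * not a page_pool_iov in disguise.
 */
static inline bool page_is_mm_page(const struct page *page)
{
	const struct page_pool_iov *ppiov = (const struct page_pool_iov *)page;

	/* Assumes the mirrored layout from this patch: a devmem "page"
	 * carries PP_SIGNATURE and points at the devmem provider's ops.
	 */
	if (ppiov->pp_magic == PP_SIGNATURE && ppiov->pp &&
	    ppiov->pp->mp_ops == &dmabuf_devmem_ops)
		return false;

	return true;
}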
Yunsheng Lin Nov. 16, 2023, 11:12 a.m. UTC | #22
On 2023/11/16 3:05, Mina Almasry wrote:
> On Wed, Nov 15, 2023 at 10:07 AM Mina Almasry <almasrymina@google.com> wrote:
>>
>> On Wed, Nov 15, 2023 at 1:29 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>> [snip]
> 
> Ah, taking a closer look here, the devmem recycling works for me but I
> think that's a side effect of the fact that the page_pool support I
> implemented with GVE is unusual. I currently allocate pages from the
> page_pool but do not set skb_mark_for_recycle(). The page recycling
> still happens when GVE is done with the page and calls
> page_pool_put_full_page(), as that eventually checks the refcount on
> the devmem and recycles it.
> 
> I will fix up the GVE to call skb_mark_for_recycle() and ensure the
> napi_pp_put_page() path recycles the devmem or page correctly in the
> next version.

What about other features? Like dma mapping optimization and frag support
in the page pool.

I understand that you use some trick in the gen_pool to avoid the per-chunk
dma_addr storage in the 'struct page_pool_iov' and do not need frag support
for now.

But for other memory providers, if they need that support, we probably need
to extend 'struct page_pool_iov' to support those features; then we may end up
with a 'struct page_pool_iov' that looks like the page_pool-specific union
in 'struct page', see:

https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119

As I understand it, we currently don't have a way to decouple the existing
'struct page' from the mm subsystem yet; if we don't mirror the above union in
'struct page', we may have more 'if' checks added in the future.
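
For reference, a rough sketch of what frag support on the mirrored struct
could look like, modeled on the existing page_pool_defrag_page() -- the helper
name is an assumption, not code from this series:

/* Hypothetical: drop 'nr' frag references from a page_pool_iov and
 * return the remaining count, mirroring page_pool_defrag_page().
 */
static long page_pool_iov_defrag(struct page_pool_iov *ppiov, long nr)
{
	long ret;

	/* If we own all outstanding references, skip the atomic and let
	 * the caller recycle the chunk directly.
	 */
	if (atomic_long_read(&ppiov->pp_frag_count) == nr)
		return 0;

	ret = atomic_long_sub_return(nr, &ppiov->pp_frag_count);
	WARN_ON(ret < 0);
	return ret;
}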
Yunsheng Lin Nov. 16, 2023, 11:12 a.m. UTC | #23
On 2023/11/16 1:57, David Ahern wrote:
> On 11/15/23 2:21 AM, Yunsheng Lin wrote:
>> On 2023/11/14 21:16, Jason Gunthorpe wrote:
>>> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote:
>>>
>>>> Actually because you put the 'struct page for devmem' in
>>>> skb->bv_frag, the net stack will grab the 'struct page' for devmem
>>>> using skb_frag_page() then call things like page_address(), kmap,
>>>> get_page, put_page, etc, etc, etc.
>>>
>>> Yikes, please no. If net has its own struct page look alike it has to
>>> stay entirely inside net. A non-mm owned struct page should not be
>>> passed into mm calls. It is just way too hacky to be seriously
>>> considered :(
>>
>> Yes, that is something this patchset is trying to do, defining its own
>> struct page look alike for page pool to support devmem.
>>
> 
> Networking needs to be able to move away from struct page references.
> The devmem and host memory for Rx use cases do not need to be page based.

Yes, I agree the ultimate goal is to move away from struct page
references. But I am not sure we can do it right away, as there
are still different types of existing 'struct page' in the netstack,
see:

https://lore.kernel.org/all/8b7d25eb-1f10-3e37-8753-92b42da3fb34@huawei.com/

> 
> 
> .
>
Mina Almasry Nov. 16, 2023, 11:30 a.m. UTC | #24
On Thu, Nov 16, 2023 at 3:12 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/11/16 3:05, Mina Almasry wrote:
> > [snip]
>
> What about other features? Like dma mapping optimization and frag support
> in the page pool.
>

PP_FLAG_DMA_MAP will be supported and required in the next version per
Jakub's request.

Frag support is something I disabled in the initial versions of the
patchset, but only out of convenience and to simplify the initial
implementation. At Google we typically use a page-aligned MSS and the
frag support isn't really that useful for us. I don't see an issue
extending frag support to devmem and iovs in the future if needed.
We'd probably add the pp_frag_count field to page_pool_iov and handle
it similarly to how it's handled for pages.

> I understand that you use some trick in the gen_pool to avoid the per-chunk
> dma_addr storage in the 'struct page_pool_iov' and do not need frag support
> for now.
>
> But for other memory providers, if they need that support, we probably need
> to extend 'struct page_pool_iov' to support those features; then we may end up
> with a 'struct page_pool_iov' that looks like the page_pool-specific union
> in 'struct page', see:
>
> https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119
>
> As I understand it, we currently don't have a way to decouple the existing
> 'struct page' from the mm subsystem yet; if we don't mirror the above union in
> 'struct page', we may have more 'if' checks added in the future.
Jason Gunthorpe Nov. 16, 2023, 3:31 p.m. UTC | #25
On Thu, Nov 16, 2023 at 07:10:01PM +0800, Yunsheng Lin wrote:
> On 2023/11/15 21:38, Jason Gunthorpe wrote:
> > [snip]
> >
> > It is the sort of thing that is important enough it should have
> > compiler help via types to prove that it is being done
> > properly. Otherwise it will be full of mistakes over time.
> 
> Yes, agreed.
> 
> I have done something similar to what willy has done for some of
> the folio conversions, as below:

That is not at all what I mean. I mean you should not use
struct page * types at all in code that flows from the _iov version,
except via limited accessors that can be audited and have appropriate
assertions.

Just releasing a struct page * that is not really a struct page *
everywhere without type safety will never be correct long term.

Jason
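
A minimal sketch of the kind of audited accessor being asked for here -- the
names and the assertion are illustrative only, not code from this series:

/* Hypothetical: the only sanctioned crossing points between the two
 * types, so an iov never leaks into mm-facing code as a bare page.
 */
static inline struct page *page_pool_iov_to_page(struct page_pool_iov *ppiov)
{
	return (struct page *)ppiov;
}

static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
{
	/* Catch casts of pages that were never provider-allocated. */
	DEBUG_NET_WARN_ON_ONCE(page->pp_magic != PP_SIGNATURE);
	return (struct page_pool_iov *)page;
}

With accessors like these, code outside the page pool could be audited for
direct casts, which is the compiler-checked isolation being argued for.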
David Ahern Nov. 16, 2023, 3:58 p.m. UTC | #26
On 11/16/23 4:12 AM, Yunsheng Lin wrote:
> On 2023/11/16 1:57, David Ahern wrote:
>> On 11/15/23 2:21 AM, Yunsheng Lin wrote:
>>> [snip]
>>
>> Networking needs to be able to move away from struct page references.
>> The devmem and host memory for Rx use cases do not need to be page based.
> 
> Yes, I agree the ultimate goal is to move away from struct page
> references. But I am not sure we can do it right away, as there
> are still different types of existing 'struct page' in the netstack,
> see:
> 
> https://lore.kernel.org/all/8b7d25eb-1f10-3e37-8753-92b42da3fb34@huawei.com/

yes, that is the point of a blended approach -- pages and buffers (or
iov) -- leveraging the LSB of the address. That proposal is the right
direction to be moving for co-existence. Adding fake struct page
instances is the wrong direction.
Yunsheng Lin Nov. 17, 2023, 11:27 a.m. UTC | #27
On 2023/11/16 23:58, David Ahern wrote:
> On 11/16/23 4:12 AM, Yunsheng Lin wrote:
>> On 2023/11/16 1:57, David Ahern wrote:
>>> [snip]
>>>
>>> Networking needs to be able to move away from struct page references.
>>> The devmem and host memory for Rx use cases do not need to be page based.
>>
>> Yes, I agree the ultimate goal is to move away from struct page
>> references. But I am not sure we can do it right away, as there
>> are still different types of existing 'struct page' in the netstack,
>> see:
>>
>> https://lore.kernel.org/all/8b7d25eb-1f10-3e37-8753-92b42da3fb34@huawei.com/
> 
> yes, that is the point of a blended approach -- pages and buffers (or
> iov) -- leveraging the LSB of the address. That proposal is the right

I am not sure leveraging the LSB of the address is necessary yet, as it
does not seem to provide type-check protection; it seems to just
provide a way to demux between pages (including page pool owned and
non-page pool owned pages) and page pool owned buffers.
That info is available through page->pp_magic and page->pp->mp_*
too if we mirror the page pool specific union in 'struct page'.

> direction to be moving for co-existence. Adding fake struct page
> instances is the wrong direction.

Perhaps a fake struct page with type-check protection is the right
direction?

Integrating devmem into the page pool without unified metadata between
pages and buffers, or without a proper abstraction layer, does not seem
like a good direction either.

> .
>
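
For reference, a sketch of the LSB demux being debated -- the bit name and
helpers are illustrative, modeled on the earlier proposal rather than on this
patch:

/* Hypothetical LSB encoding: a frag carries either a real struct page
 * pointer, or a page_pool_iov pointer with bit 0 set.
 */
#define NET_IOV_BIT	0x1UL

static inline bool skb_frag_is_iov(const skb_frag_t *frag)
{
	return (unsigned long)frag->bv_page & NET_IOV_BIT;
}

static inline struct page_pool_iov *skb_frag_iov(const skb_frag_t *frag)
{
	/* The compiler cannot check this; the tag bit is the only guard. */
	return (struct page_pool_iov *)
	       ((unsigned long)frag->bv_page & ~NET_IOV_BIT);
}

This matches Yunsheng's point above: the tag bit demuxes the two kinds at
runtime but does not, by itself, give compile-time type safety.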
diff mbox series

Patch

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 5e4fcd45ba50..52e4cf98ebc6 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -124,6 +124,7 @@  struct mem_provider;
 
 enum pp_memory_provider_type {
 	__PP_MP_NONE, /* Use system allocator directly */
+	PP_MP_DMABUF_DEVMEM, /* dmabuf devmem provider */
 };
 
 struct pp_memory_provider_ops {
@@ -134,6 +135,33 @@  struct pp_memory_provider_ops {
 	void (*free_pages)(struct page_pool *pool, struct page *page);
 };
 
+extern const struct pp_memory_provider_ops dmabuf_devmem_ops;
+
+struct page_pool_iov {
+	unsigned long res0;
+	unsigned long pp_magic;
+	struct page_pool *pp;
+	struct page *page;  /* dmabuf memory provider specific field */
+	unsigned long dma_addr;
+	atomic_long_t pp_frag_count;
+	unsigned int res1;
+	refcount_t _refcount;
+};
+
+#define PAGE_POOL_MATCH(pg, iov)				\
+	static_assert(offsetof(struct page, pg) ==		\
+		      offsetof(struct page_pool_iov, iov))
+PAGE_POOL_MATCH(flags, res0);
+PAGE_POOL_MATCH(pp_magic, pp_magic);
+PAGE_POOL_MATCH(pp, pp);
+PAGE_POOL_MATCH(_pp_mapping_pad, page);
+PAGE_POOL_MATCH(dma_addr, dma_addr);
+PAGE_POOL_MATCH(pp_frag_count, pp_frag_count);
+PAGE_POOL_MATCH(_mapcount, res1);
+PAGE_POOL_MATCH(_refcount, _refcount);
+#undef PAGE_POOL_MATCH
+static_assert(sizeof(struct page_pool_iov) <= sizeof(struct page));
+
 struct page_pool {
 	struct page_pool_params p;
 
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 6c502bea842b..1bd7a2306f09 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -231,6 +231,9 @@  static int page_pool_init(struct page_pool *pool,
 	switch (pool->p.memory_provider) {
 	case __PP_MP_NONE:
 		break;
+	case PP_MP_DMABUF_DEVMEM:
+		pool->mp_ops = &dmabuf_devmem_ops;
+		break;
 	default:
 		err = -EINVAL;
 		goto free_ptr_ring;
@@ -1010,3 +1013,93 @@  void page_pool_update_nid(struct page_pool *pool, int new_nid)
 	}
 }
 EXPORT_SYMBOL(page_pool_update_nid);
+
+/*** "Dmabuf devmem memory provider" ***/
+
+static int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+	if (pool->p.flags & PP_FLAG_DMA_MAP ||
+	    pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+		return -EOPNOTSUPP;
+	return 0;
+}
+
+static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
+						 gfp_t gfp)
+{
+	struct page_pool_iov *ppiov;
+	struct page *page;
+	dma_addr_t dma;
+
+	ppiov = kvmalloc(sizeof(*ppiov), gfp | __GFP_ZERO);
+	if (!ppiov)
+		return NULL;
+
+	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+	if (!page) {
+		kvfree(ppiov);
+		return NULL;
+	}
+
+	dma = dma_map_page_attrs(pool->p.dev, page, 0,
+				 (PAGE_SIZE << pool->p.order),
+				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
+						  DMA_ATTR_WEAK_ORDERING);
+	if (dma_mapping_error(pool->p.dev, dma)) {
+		put_page(page);
+		kvfree(ppiov);
+		return NULL;
+	}
+
+	ppiov->pp = pool;
+	ppiov->pp_magic = PP_SIGNATURE;
+	ppiov->page = page;
+	refcount_set(&ppiov->_refcount, 1);
+	page_pool_fragment_page((struct page *)ppiov, 1);
+	page_pool_set_dma_addr((struct page *)ppiov, dma);
+	pool->pages_state_hold_cnt++;
+	trace_page_pool_state_hold(pool, (struct page *)ppiov,
+				   pool->pages_state_hold_cnt);
+	return (struct page *)ppiov;
+}
+
+static void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+}
+
+static void mp_dmabuf_devmem_release_page(struct page_pool *pool,
+					  struct page *page)
+{
+	struct page_pool_iov *ppiov = (struct page_pool_iov *)page;
+	dma_addr_t dma;
+
+	dma = page_pool_get_dma_addr(page);
+
+	/* When page is unmapped, it cannot be returned to our pool */
+	dma_unmap_page_attrs(pool->p.dev, dma,
+			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+	page_pool_set_dma_addr(page, 0);
+
+	put_page(ppiov->page);
+}
+
+static void mp_dmabuf_devmem_free_pages(struct page_pool *pool,
+					struct page *page)
+{
+	int count;
+
+	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
+	trace_page_pool_state_release(pool, page, count);
+
+	kvfree(page);
+}
+
+const struct pp_memory_provider_ops dmabuf_devmem_ops = {
+	.init			= mp_dmabuf_devmem_init,
+	.destroy		= mp_dmabuf_devmem_destroy,
+	.alloc_pages		= mp_dmabuf_devmem_alloc_pages,
+	.release_page		= mp_dmabuf_devmem_release_page,
+	.free_pages		= mp_dmabuf_devmem_free_pages,
+};
+EXPORT_SYMBOL(dmabuf_devmem_ops);
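
To make the wiring concrete, a sketch of how a driver could select this
provider via page_pool_params -- the values are illustrative, and note that
mp_dmabuf_devmem_init() above rejects PP_FLAG_DMA_MAP and
PP_FLAG_DMA_SYNC_DEV, so flags stay clear:

/* Sketch: create a pool backed by the dmabuf devmem provider. */
static struct page_pool *create_devmem_pool(struct device *dev,
					    struct netdev_dmabuf_binding *binding)
{
	struct page_pool_params pp_params = {
		.order		 = 0,
		.pool_size	 = 1024,	/* illustrative */
		.nid		 = NUMA_NO_NODE,
		.dev		 = dev,
		.dma_dir	 = DMA_FROM_DEVICE,
		.memory_provider = PP_MP_DMABUF_DEVMEM,
		.mp_priv	 = binding,	/* provider takes a ref */
	};

	return page_pool_create(&pp_params);
}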