Message ID | 20231113130041.58124-4-linyunsheng@huawei.com (mailing list archive) |
---|---|
State | RFC |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | A possible proposal for integrating dmabuf to page pool | expand |
On Mon, Nov 13, 2023 at 5:00 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > From: Mina Almasry <almasrymina@google.com> > > Implement a memory provider that allocates dmabuf devmem page_pool_iovs. > > Support of PP_FLAG_DMA_MAP and PP_FLAG_DMA_SYNC_DEV is omitted for > simplicity. > > The provider receives a reference to the struct netdev_dmabuf_binding > via the pool->mp_priv pointer. The driver needs to set this pointer for > the provider in the page_pool_params. > > The provider obtains a reference on the netdev_dmabuf_binding which > guarantees the binding and the underlying mapping remains alive until > the provider is destroyed. > > Signed-off-by: Willem de Bruijn <willemb@google.com> > Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com> > Signed-off-by: Mina Almasry <almasrymina@google.com> > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> > --- > include/net/page_pool/types.h | 28 +++++++++++ > net/core/page_pool.c | 93 +++++++++++++++++++++++++++++++++++ > 2 files changed, 121 insertions(+) > > diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h > index 5e4fcd45ba50..52e4cf98ebc6 100644 > --- a/include/net/page_pool/types.h > +++ b/include/net/page_pool/types.h > @@ -124,6 +124,7 @@ struct mem_provider; > > enum pp_memory_provider_type { > __PP_MP_NONE, /* Use system allocator directly */ > + PP_MP_DMABUF_DEVMEM, /* dmabuf devmem provider */ > }; > > struct pp_memory_provider_ops { > @@ -134,6 +135,33 @@ struct pp_memory_provider_ops { > void (*free_pages)(struct page_pool *pool, struct page *page); > }; > > +extern const struct pp_memory_provider_ops dmabuf_devmem_ops; > + > +struct page_pool_iov { > + unsigned long res0; > + unsigned long pp_magic; > + struct page_pool *pp; > + struct page *page; /* dmabuf memory provider specific field */ > + unsigned long dma_addr; > + atomic_long_t pp_frag_count; > + unsigned int res1; > + refcount_t _refcount; > +}; > + > +#define PAGE_POOL_MATCH(pg, iov) \ > + 
static_assert(offsetof(struct page, pg) == \ > + offsetof(struct page_pool_iov, iov)) > +PAGE_POOL_MATCH(flags, res0); > +PAGE_POOL_MATCH(pp_magic, pp_magic); > +PAGE_POOL_MATCH(pp, pp); > +PAGE_POOL_MATCH(_pp_mapping_pad, page); > +PAGE_POOL_MATCH(dma_addr, dma_addr); > +PAGE_POOL_MATCH(pp_frag_count, pp_frag_count); > +PAGE_POOL_MATCH(_mapcount, res1); > +PAGE_POOL_MATCH(_refcount, _refcount); > +#undef PAGE_POOL_MATCH > +static_assert(sizeof(struct page_pool_iov) <= sizeof(struct page)); > + You're doing exactly what I think you're doing, and what was nacked in RFC v1. You've converted 'struct page_pool_iov' to essentially become a duplicate of 'struct page'. Then, you're casting page_pool_iov* into struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling mm APIs like page_ref_*() on the page_pool_iov* because you've fooled the mm stack into thinking dma-buf memory is a struct page. RFC v1 was almost exactly the same, except instead of creating a duplicate definition of struct page, it just allocated 'struct page' instead of allocating another struct that is identical to struct page and casting it into struct page. I don't think what you're doing here reverses the nacks I got in RFC v1. You also did not CC any dma-buf or mm people on this proposal that would bring up these concerns again. 
> struct page_pool { > struct page_pool_params p; > > diff --git a/net/core/page_pool.c b/net/core/page_pool.c > index 6c502bea842b..1bd7a2306f09 100644 > --- a/net/core/page_pool.c > +++ b/net/core/page_pool.c > @@ -231,6 +231,9 @@ static int page_pool_init(struct page_pool *pool, > switch (pool->p.memory_provider) { > case __PP_MP_NONE: > break; > + case PP_MP_DMABUF_DEVMEM: > + pool->mp_ops = &dmabuf_devmem_ops; > + break; > default: > err = -EINVAL; > goto free_ptr_ring; > @@ -1010,3 +1013,93 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) > } > } > EXPORT_SYMBOL(page_pool_update_nid); > + > +/*** "Dmabuf devmem memory provider" ***/ > + > +static int mp_dmabuf_devmem_init(struct page_pool *pool) > +{ > + if (pool->p.flags & PP_FLAG_DMA_MAP || > + pool->p.flags & PP_FLAG_DMA_SYNC_DEV) > + return -EOPNOTSUPP; > + return 0; > +} > + > +static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool, > + gfp_t gfp) > +{ > + struct page_pool_iov *ppiov; > + struct page *page; > + dma_addr_t dma; > + > + ppiov = kvmalloc(sizeof(*ppiov), gfp | __GFP_ZERO); > + if (!ppiov) > + return NULL; > + > + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); > + if (!page) { > + kvfree(ppiov); > + return NULL; > + } > + > + dma = dma_map_page_attrs(pool->p.dev, page, 0, > + (PAGE_SIZE << pool->p.order), > + pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | > + DMA_ATTR_WEAK_ORDERING); > + if (dma_mapping_error(pool->p.dev, dma)) { > + put_page(page); > + kvfree(ppiov); > + return NULL; > + } > + > + ppiov->pp = pool; > + ppiov->pp_magic = PP_SIGNATURE; > + ppiov->page = page; > + refcount_set(&ppiov->_refcount, 1); > + page_pool_fragment_page((struct page *)ppiov, 1); > + page_pool_set_dma_addr((struct page *)ppiov, dma); > + pool->pages_state_hold_cnt++; > + trace_page_pool_state_hold(pool, (struct page *)ppiov, > + pool->pages_state_hold_cnt); > + return (struct page *)ppiov; > +} > + > +static void mp_dmabuf_devmem_destroy(struct page_pool *pool) > +{ 
> +} > + > +static void mp_dmabuf_devmem_release_page(struct page_pool *pool, > + struct page *page) > +{ > + struct page_pool_iov *ppiov = (struct page_pool_iov *)page; > + dma_addr_t dma; > + > + dma = page_pool_get_dma_addr(page); > + > + /* When page is unmapped, it cannot be returned to our pool */ > + dma_unmap_page_attrs(pool->p.dev, dma, > + PAGE_SIZE << pool->p.order, pool->p.dma_dir, > + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); > + page_pool_set_dma_addr(page, 0); > + > + put_page(ppiov->page); > +} > + > +static void mp_dmabuf_devmem_free_pages(struct page_pool *pool, > + struct page *page) > +{ > + int count; > + > + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); > + trace_page_pool_state_release(pool, page, count); > + > + kvfree(page); > +} > + > +const struct pp_memory_provider_ops dmabuf_devmem_ops = { > + .init = mp_dmabuf_devmem_init, > + .destroy = mp_dmabuf_devmem_destroy, > + .alloc_pages = mp_dmabuf_devmem_alloc_pages, > + .release_page = mp_dmabuf_devmem_release_page, > + .free_pages = mp_dmabuf_devmem_free_pages, > +}; > +EXPORT_SYMBOL(dmabuf_devmem_ops); > -- > 2.33.0 >
On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote: > You're doing exactly what I think you're doing, and what was nacked in RFC v1. > > You've converted 'struct page_pool_iov' to essentially become a > duplicate of 'struct page'. Then, you're casting page_pool_iov* into > struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling > mm APIs like page_ref_*() on the page_pool_iov* because you've fooled > the mm stack into thinking dma-buf memory is a struct page. > > RFC v1 was almost exactly the same, except instead of creating a > duplicate definition of struct page, it just allocated 'struct page' > instead of allocating another struct that is identical to struct page > and casting it into struct page. > > I don't think what you're doing here reverses the nacks I got in RFC > v1. You also did not CC any dma-buf or mm people on this proposal that > would bring up these concerns again. Right, but the mirror struct has some appeal to a non-mm person like myself. The problem IIUC is that this patch is the wrong way around, we should be converting everyone who can deal with non-host mem to struct page_pool_iov. Using page_address() on ppiov which hns3 seems to do in this series does not compute for me. Then we can turn the existing non-iov helpers to be a thin wrapper with just a cast from struct page to struct page_pool_iov, and a call of the iov helper. Again - never cast the other way around. Also I think this conversion can be done completely separately from the mem provider changes. Just add struct page_pool_iov and start using it. Does that make more sense?
+cc Christian, Jason and Willy On 2023/11/14 7:05, Jakub Kicinski wrote: > On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote: >> You're doing exactly what I think you're doing, and what was nacked in RFC v1. >> >> You've converted 'struct page_pool_iov' to essentially become a >> duplicate of 'struct page'. Then, you're casting page_pool_iov* into >> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling >> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled >> the mm stack into thinking dma-buf memory is a struct page. Yes, something like above, but I am not sure about the 'fooled the mm stack into thinking dma-buf memory is a struct page' part, because: 1. We never let the 'struct page' for devmem leaking out of net stacking through the 'not kmap()able and not readable' checking in your patchset. 2. We inititiate page->_refcount for devmem to one and it remains as one, we will never call page_ref_inc()/page_ref_dec()/get_page()/put_page(), instead, we use page pool's pp_frag_count to do reference counting for devmem page in patch 6. >> >> RFC v1 was almost exactly the same, except instead of creating a >> duplicate definition of struct page, it just allocated 'struct page' >> instead of allocating another struct that is identical to struct page >> and casting it into struct page. Perhaps it is more accurate to say this is something between RFC v1 and RFC v3, in order to decouple 'struct page' for devmem from mm subsystem, but still have most unified handling for both normal memory and devmem in page pool and net stack. The main difference between this patchset and RFC v1: 1. The mm subsystem is not supposed to see the 'struct page' for devmem in this patchset, I guess we could say it is decoupled from the mm subsystem even though we still call PageTail()/page_ref_count()/ page_is_pfmemalloc() on 'struct page' for devmem. The main difference between this patchset and RFC v3: 1. 
It reuses the 'struct page' to have more unified handling between normal page and devmem page for net stack. 2. It relies on the page->pp_frag_count to do reference counting. >> >> I don't think what you're doing here reverses the nacks I got in RFC >> v1. You also did not CC any dma-buf or mm people on this proposal that >> would bring up these concerns again. > > Right, but the mirror struct has some appeal to a non-mm person like > myself. The problem IIUC is that this patch is the wrong way around, we > should be converting everyone who can deal with non-host mem to struct > page_pool_iov. Using page_address() on ppiov which hns3 seems to do in > this series does not compute for me. The hacking use of ppiov in hns3 is only used to do the some prototype testing, so ignore it. > > Then we can turn the existing non-iov helpers to be a thin wrapper with > just a cast from struct page to struct page_pool_iov, and a call of the > iov helper. Again - never cast the other way around. I am agreed that a cast from struct page to struct page_pool_iov is allowed, but a cast from struct page_pool_iov to struct page is not allowed if I am understanding you correctly. Before we can also completely decouple 'struct page' allocated using buddy allocator directly from mm subsystem in netstack, below is what I have in mind in order to support different memory provider. +--------------+ | Netstack | |'struct page' | +--------------+ ^ | | v +---------------------+ +----------------------+ | | +---------------+ | devmem MP |<---->| Page pool |----->| **** MP | |'struct page_pool_iov'| | 'struct page' | |'struct **_iov'| +----------------------+ | | +---------------+ +---------------------+ ^ | | v +---------------+ | Driver | | 'struct page' | +---------------+ I would expect net stack, page pool, driver still see the 'struct page', only memory provider see the specific struct for itself, for the above, devmem memory provider sees the 'struct page_pool_iov'. 
The reason I still expect driver to see the 'struct page' is that driver will still need to support normal memory besides devmem. > > Also I think this conversion can be done completely separately from the > mem provider changes. Just add struct page_pool_iov and start using it. I am not sure I understand what does "Just add struct page_pool_iov and start using it" mean yet. > > Does that make more sense? > > . >
On Tue, Nov 14, 2023 at 12:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > +cc Christian, Jason and Willy > > On 2023/11/14 7:05, Jakub Kicinski wrote: > > On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote: > >> You're doing exactly what I think you're doing, and what was nacked in RFC v1. > >> > >> You've converted 'struct page_pool_iov' to essentially become a > >> duplicate of 'struct page'. Then, you're casting page_pool_iov* into > >> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling > >> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled > >> the mm stack into thinking dma-buf memory is a struct page. > > Yes, something like above, but I am not sure about the 'fooled the mm > stack into thinking dma-buf memory is a struct page' part, because: > 1. We never let the 'struct page' for devmem leaking out of net stacking > through the 'not kmap()able and not readable' checking in your patchset. RFC never used dma-buf pages outside the net stack, so that is the same. You are not able to get rid of the 'net kmap()able and not readable' checking with this approach, because dma-buf memory is fundamentally unkmapable and unreadable. This approach would still need skb_frags_not_readable checks in net stack, so that is also the same. > 2. We inititiate page->_refcount for devmem to one and it remains as one, > we will never call page_ref_inc()/page_ref_dec()/get_page()/put_page(), > instead, we use page pool's pp_frag_count to do reference counting for > devmem page in patch 6. > I'm not sure that moves the needle in terms of allowing dma-buf memory to look like struct pages. > >> > >> RFC v1 was almost exactly the same, except instead of creating a > >> duplicate definition of struct page, it just allocated 'struct page' > >> instead of allocating another struct that is identical to struct page > >> and casting it into struct page. 
> > Perhaps it is more accurate to say this is something between RFC v1 and > RFC v3, in order to decouple 'struct page' for devmem from mm subsystem, > but still have most unified handling for both normal memory and devmem > in page pool and net stack. > > The main difference between this patchset and RFC v1: > 1. The mm subsystem is not supposed to see the 'struct page' for devmem > in this patchset, I guess we could say it is decoupled from the mm > subsystem even though we still call PageTail()/page_ref_count()/ > page_is_pfmemalloc() on 'struct page' for devmem. > In this patchset you pretty much allocate a struct page for your dma-buf memory, and then cast it into a struct page, so all the mm calls in page_pool.c are seeing a struct page when it's really dma-buf memory. 'even though we still call PageTail()/page_ref_count()/page_is_pfmemalloc() on 'struct page' for devmem' is basically making dma-buf memory look like struct pages. Actually because you put the 'strtuct page for devmem' in skb->bv_frag, the net stack will grab the 'struct page' for devmem using skb_frag_page() then call things like page_address(), kmap, get_page, put_page, etc, etc, etc. > The main difference between this patchset and RFC v3: > 1. It reuses the 'struct page' to have more unified handling between > normal page and devmem page for net stack. This is what was nacked in RFC v1. > 2. It relies on the page->pp_frag_count to do reference counting. > I don't see you change any of the page_ref_* calls in page_pool.c, for example this one: https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601 So the reference the page_pool is seeing is actually page->_refcount, not page->pp_frag_count? I'm confused here. Is this a bug in the patchset? > >> > >> I don't think what you're doing here reverses the nacks I got in RFC > >> v1. You also did not CC any dma-buf or mm people on this proposal that > >> would bring up these concerns again. 
> > > > Right, but the mirror struct has some appeal to a non-mm person like > > myself. The problem IIUC is that this patch is the wrong way around, we > > should be converting everyone who can deal with non-host mem to struct > > page_pool_iov. Using page_address() on ppiov which hns3 seems to do in > > this series does not compute for me. > > The hacking use of ppiov in hns3 is only used to do the some prototype > testing, so ignore it. > > > > > Then we can turn the existing non-iov helpers to be a thin wrapper with > > just a cast from struct page to struct page_pool_iov, and a call of the > > iov helper. Again - never cast the other way around. > > I am agreed that a cast from struct page to struct page_pool_iov is allowed, > but a cast from struct page_pool_iov to struct page is not allowed if I am > understanding you correctly. > > Before we can also completely decouple 'struct page' allocated using buddy > allocator directly from mm subsystem in netstack, below is what I have in > mind in order to support different memory provider. > > +--------------+ > | Netstack | > |'struct page' | > +--------------+ > ^ > | > | > v > +---------------------+ > +----------------------+ | | +---------------+ > | devmem MP |<---->| Page pool |----->| **** MP | > |'struct page_pool_iov'| | 'struct page' | |'struct **_iov'| > +----------------------+ | | +---------------+ > +---------------------+ > ^ > | > | > v > +---------------+ > | Driver | > | 'struct page' | > +---------------+ > > I would expect net stack, page pool, driver still see the 'struct page', > only memory provider see the specific struct for itself, for the above, > devmem memory provider sees the 'struct page_pool_iov'. > > The reason I still expect driver to see the 'struct page' is that driver > will still need to support normal memory besides devmem. > > > > > Also I think this conversion can be done completely separately from the > > mem provider changes. 
Just add struct page_pool_iov and start using it. > > I am not sure I understand what does "Just add struct page_pool_iov and > start using it" mean yet. > > > > > Does that make more sense? > > > > . > > -- Thanks, Mina
On 2023/11/14 20:21, Mina Almasry wrote: > On Tue, Nov 14, 2023 at 12:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: >> >> +cc Christian, Jason and Willy >> >> On 2023/11/14 7:05, Jakub Kicinski wrote: >>> On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote: >>>> You're doing exactly what I think you're doing, and what was nacked in RFC v1. >>>> >>>> You've converted 'struct page_pool_iov' to essentially become a >>>> duplicate of 'struct page'. Then, you're casting page_pool_iov* into >>>> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling >>>> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled >>>> the mm stack into thinking dma-buf memory is a struct page. >> >> Yes, something like above, but I am not sure about the 'fooled the mm >> stack into thinking dma-buf memory is a struct page' part, because: >> 1. We never let the 'struct page' for devmem leaking out of net stacking >> through the 'not kmap()able and not readable' checking in your patchset. > > RFC never used dma-buf pages outside the net stack, so that is the same. > > You are not able to get rid of the 'net kmap()able and not readable' > checking with this approach, because dma-buf memory is fundamentally > unkmapable and unreadable. This approach would still need > skb_frags_not_readable checks in net stack, so that is also the same. Yes, I am agreed that checking is still needed whatever the proposal is. > >> 2. We inititiate page->_refcount for devmem to one and it remains as one, >> we will never call page_ref_inc()/page_ref_dec()/get_page()/put_page(), >> instead, we use page pool's pp_frag_count to do reference counting for >> devmem page in patch 6. >> > > I'm not sure that moves the needle in terms of allowing dma-buf > memory to look like struct pages. 
> >>>> >>>> RFC v1 was almost exactly the same, except instead of creating a >>>> duplicate definition of struct page, it just allocated 'struct page' >>>> instead of allocating another struct that is identical to struct page >>>> and casting it into struct page. >> >> Perhaps it is more accurate to say this is something between RFC v1 and >> RFC v3, in order to decouple 'struct page' for devmem from mm subsystem, >> but still have most unified handling for both normal memory and devmem >> in page pool and net stack. >> >> The main difference between this patchset and RFC v1: >> 1. The mm subsystem is not supposed to see the 'struct page' for devmem >> in this patchset, I guess we could say it is decoupled from the mm >> subsystem even though we still call PageTail()/page_ref_count()/ >> page_is_pfmemalloc() on 'struct page' for devmem. >> > > In this patchset you pretty much allocate a struct page for your > dma-buf memory, and then cast it into a struct page, so all the mm > calls in page_pool.c are seeing a struct page when it's really dma-buf > memory. > > 'even though we still call > PageTail()/page_ref_count()/page_is_pfmemalloc() on 'struct page' for > devmem' is basically making dma-buf memory look like struct pages. > > Actually because you put the 'strtuct page for devmem' in > skb->bv_frag, the net stack will grab the 'struct page' for devmem > using skb_frag_page() then call things like page_address(), kmap, > get_page, put_page, etc, etc, etc. Yes, as above, skb_frags_not_readable() checking is still needed for kmap() and page_address(). get_page, put_page related calling is avoided in page_pool_frag_ref() and napi_pp_put_page() for devmem page as the above checking is true for devmem page: (pp_iov->pp_magic & ~0x3UL) == PP_SIGNATURE > >> The main difference between this patchset and RFC v3: >> 1. It reuses the 'struct page' to have more unified handling between >> normal page and devmem page for net stack. > > This is what was nacked in RFC v1. 
> >> 2. It relies on the page->pp_frag_count to do reference counting. >> > > I don't see you change any of the page_ref_* calls in page_pool.c, for > example this one: > > https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601 > > So the reference the page_pool is seeing is actually page->_refcount, > not page->pp_frag_count? I'm confused here. Is this a bug in the > patchset? page->_refcount is the same as page_pool_iov->_refcount for devmem, which is ensured by the 'PAGE_POOL_MATCH(_refcount, _refcount);', and page_pool_iov->_refcount is set to one in mp_dmabuf_devmem_alloc_pages() by calling 'refcount_set(&ppiov->_refcount, 1)' and always remains as one. So the 'page_ref_count(page) == 1' checking is always true for devmem page.
On Tue, Nov 14, 2023 at 4:49 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2023/11/14 20:21, Mina Almasry wrote: > > On Tue, Nov 14, 2023 at 12:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > >> > >> +cc Christian, Jason and Willy > >> > >> On 2023/11/14 7:05, Jakub Kicinski wrote: > >>> On Mon, 13 Nov 2023 05:42:16 -0800 Mina Almasry wrote: > >>>> You're doing exactly what I think you're doing, and what was nacked in RFC v1. > >>>> > >>>> You've converted 'struct page_pool_iov' to essentially become a > >>>> duplicate of 'struct page'. Then, you're casting page_pool_iov* into > >>>> struct page* in mp_dmabuf_devmem_alloc_pages(), then, you're calling > >>>> mm APIs like page_ref_*() on the page_pool_iov* because you've fooled > >>>> the mm stack into thinking dma-buf memory is a struct page. > >> > >> Yes, something like above, but I am not sure about the 'fooled the mm > >> stack into thinking dma-buf memory is a struct page' part, because: > >> 1. We never let the 'struct page' for devmem leaking out of net stacking > >> through the 'not kmap()able and not readable' checking in your patchset. > > > > RFC never used dma-buf pages outside the net stack, so that is the same. > > > > You are not able to get rid of the 'net kmap()able and not readable' > > checking with this approach, because dma-buf memory is fundamentally > > unkmapable and unreadable. This approach would still need > > skb_frags_not_readable checks in net stack, so that is also the same. > > Yes, I am agreed that checking is still needed whatever the proposal is. > > > > >> 2. We inititiate page->_refcount for devmem to one and it remains as one, > >> we will never call page_ref_inc()/page_ref_dec()/get_page()/put_page(), > >> instead, we use page pool's pp_frag_count to do reference counting for > >> devmem page in patch 6. > >> > > > > I'm not sure that moves the needle in terms of allowing dma-buf > > memory to look like struct pages. 
> > > >>>> > >>>> RFC v1 was almost exactly the same, except instead of creating a > >>>> duplicate definition of struct page, it just allocated 'struct page' > >>>> instead of allocating another struct that is identical to struct page > >>>> and casting it into struct page. > >> > >> Perhaps it is more accurate to say this is something between RFC v1 and > >> RFC v3, in order to decouple 'struct page' for devmem from mm subsystem, > >> but still have most unified handling for both normal memory and devmem > >> in page pool and net stack. > >> > >> The main difference between this patchset and RFC v1: > >> 1. The mm subsystem is not supposed to see the 'struct page' for devmem > >> in this patchset, I guess we could say it is decoupled from the mm > >> subsystem even though we still call PageTail()/page_ref_count()/ > >> page_is_pfmemalloc() on 'struct page' for devmem. > >> > > > > In this patchset you pretty much allocate a struct page for your > > dma-buf memory, and then cast it into a struct page, so all the mm > > calls in page_pool.c are seeing a struct page when it's really dma-buf > > memory. > > > > 'even though we still call > > PageTail()/page_ref_count()/page_is_pfmemalloc() on 'struct page' for > > devmem' is basically making dma-buf memory look like struct pages. > > > > Actually because you put the 'strtuct page for devmem' in > > skb->bv_frag, the net stack will grab the 'struct page' for devmem > > using skb_frag_page() then call things like page_address(), kmap, > > get_page, put_page, etc, etc, etc. > > Yes, as above, skb_frags_not_readable() checking is still needed for > kmap() and page_address(). 
> > get_page, put_page related calling is avoided in page_pool_frag_ref() > and napi_pp_put_page() for devmem page as the above checking is true > for devmem page: > (pp_iov->pp_magic & ~0x3UL) == PP_SIGNATURE > So, devmem needs special handling with if statement for refcounting, even after using struct pages for devmem, which is not allowed (IIUC the dma-buf maintainer). > > > >> The main difference between this patchset and RFC v3: > >> 1. It reuses the 'struct page' to have more unified handling between > >> normal page and devmem page for net stack. > > > > This is what was nacked in RFC v1. > > > >> 2. It relies on the page->pp_frag_count to do reference counting. > >> > > > > I don't see you change any of the page_ref_* calls in page_pool.c, for > > example this one: > > > > https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601 > > > > So the reference the page_pool is seeing is actually page->_refcount, > > not page->pp_frag_count? I'm confused here. Is this a bug in the > > patchset? > > page->_refcount is the same as page_pool_iov->_refcount for devmem, which > is ensured by the 'PAGE_POOL_MATCH(_refcount, _refcount);', and > page_pool_iov->_refcount is set to one in mp_dmabuf_devmem_alloc_pages() > by calling 'refcount_set(&ppiov->_refcount, 1)' and always remains as one. > > So the 'page_ref_count(page) == 1' checking is always true for devmem page. Which, of course, is a bug in the patchset, and it only works because it's a POC for you. devmem pages (which shouldn't exist according to the dma-buf maintainer, IIUC) can't be recycled all the time. See SO_DEVMEM_DONTNEED patch in my RFC and refcounting needed for devmem.
On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote: > Actually because you put the 'strtuct page for devmem' in > skb->bv_frag, the net stack will grab the 'struct page' for devmem > using skb_frag_page() then call things like page_address(), kmap, > get_page, put_page, etc, etc, etc. Yikes, please no. If net has its own struct page look alike it has to stay entirely inside net. A non-mm owned struct page should not be passed into mm calls. It is just way too hacky to be seriously considered :( > > I would expect net stack, page pool, driver still see the 'struct page', > > only memory provider see the specific struct for itself, for the above, > > devmem memory provider sees the 'struct page_pool_iov'. > > > > The reason I still expect driver to see the 'struct page' is that driver > > will still need to support normal memory besides devmem. I wouldn't say this approach is unreasonable, but it does have to be done carefully to isolate the mm. Keeping the struct page in the API is going to make this very hard. Jason
On 2023/11/14 20:58, Mina Almasry wrote: >> >> Yes, as above, skb_frags_not_readable() checking is still needed for >> kmap() and page_address(). >> >> get_page, put_page related calling is avoided in page_pool_frag_ref() >> and napi_pp_put_page() for devmem page as the above checking is true >> for devmem page: >> (pp_iov->pp_magic & ~0x3UL) == PP_SIGNATURE >> > > So, devmem needs special handling with if statement for refcounting, > even after using struct pages for devmem, which is not allowed (IIUC > the dma-buf maintainer). It reuses the already existing checking or optimization, that is the point of 'mirror struct'. > >>> >>>> The main difference between this patchset and RFC v3: >>>> 1. It reuses the 'struct page' to have more unified handling between >>>> normal page and devmem page for net stack. >>> >>> This is what was nacked in RFC v1. >>> >>>> 2. It relies on the page->pp_frag_count to do reference counting. >>>> >>> >>> I don't see you change any of the page_ref_* calls in page_pool.c, for >>> example this one: >>> >>> https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601 >>> >>> So the reference the page_pool is seeing is actually page->_refcount, >>> not page->pp_frag_count? I'm confused here. Is this a bug in the >>> patchset? >> >> page->_refcount is the same as page_pool_iov->_refcount for devmem, which >> is ensured by the 'PAGE_POOL_MATCH(_refcount, _refcount);', and >> page_pool_iov->_refcount is set to one in mp_dmabuf_devmem_alloc_pages() >> by calling 'refcount_set(&ppiov->_refcount, 1)' and always remains as one. >> >> So the 'page_ref_count(page) == 1' checking is always true for devmem page. > > Which, of course, is a bug in the patchset, and it only works because > it's a POC for you. devmem pages (which shouldn't exist according to > the dma-buf maintainer, IIUC) can't be recycled all the time. See > SO_DEVMEM_DONTNEED patch in my RFC and refcounting needed for devmem. 
I am not sure dma-buf maintainer's concern is still there with this patchset. Whatever name you are calling it for the struct, however you arrange each field in the struct, some metadata is always needed for dmabuf to integrate into page pool. If the above is true, why not utilize the 'struct page' to have more unified handling? >
Yunsheng Lin wrote: > On 2023/11/14 20:58, Mina Almasry wrote: > > >> > >> Yes, as above, skb_frags_not_readable() checking is still needed for > >> kmap() and page_address(). > >> > >> get_page, put_page related calling is avoided in page_pool_frag_ref() > >> and napi_pp_put_page() for devmem page as the above checking is true > >> for devmem page: > >> (pp_iov->pp_magic & ~0x3UL) == PP_SIGNATURE > >> > > > > So, devmem needs special handling with if statement for refcounting, > > even after using struct pages for devmem, which is not allowed (IIUC > > the dma-buf maintainer). > > It reuses the already existing checking or optimization, that is the point > of 'mirror struct'. > > > > >>> > >>>> The main difference between this patchset and RFC v3: > >>>> 1. It reuses the 'struct page' to have more unified handling between > >>>> normal page and devmem page for net stack. > >>> > >>> This is what was nacked in RFC v1. > >>> > >>>> 2. It relies on the page->pp_frag_count to do reference counting. > >>>> > >>> > >>> I don't see you change any of the page_ref_* calls in page_pool.c, for > >>> example this one: > >>> > >>> https://elixir.bootlin.com/linux/latest/source/net/core/page_pool.c#L601 > >>> > >>> So the reference the page_pool is seeing is actually page->_refcount, > >>> not page->pp_frag_count? I'm confused here. Is this a bug in the > >>> patchset? > >> > >> page->_refcount is the same as page_pool_iov->_refcount for devmem, which > >> is ensured by the 'PAGE_POOL_MATCH(_refcount, _refcount);', and > >> page_pool_iov->_refcount is set to one in mp_dmabuf_devmem_alloc_pages() > >> by calling 'refcount_set(&ppiov->_refcount, 1)' and always remains as one. > >> > >> So the 'page_ref_count(page) == 1' checking is always true for devmem page. > > > > Which, of course, is a bug in the patchset, and it only works because > > it's a POC for you. devmem pages (which shouldn't exist according to > > the dma-buf maintainer, IIUC) can't be recycled all the time. 
See > > SO_DEVMEM_DONTNEED patch in my RFC and refcounting needed for devmem. > > I am not sure dma-buf maintainer's concern is still there with this patchset. > > Whatever name you calling it for the struct, however you arrange each field > in the struct, some metadata is always needed for dmabuf to intergrate into > page pool. > > If the above is true, why not utilize the 'struct page' to have more unified > handling? My understanding is that there is a general preference to simplify struct page, and at the least not move in the other direction by overloading the struct in new ways. If using struct page for something that is not memory, there is ZONE_DEVICE. But using that correctly is non-trivial: https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/ Since all we need is a handle that does not leave the network stack, a network specific struct like page_pool_iov entirely avoids this issue. RFC v3 seems like a good simplification over RFC v1 in that regard to me. I was also pleasantly surprised how minimal the change to the users of skb_frag_t actually proved to be.
On Tue, 14 Nov 2023 16:23:29 +0800 Yunsheng Lin wrote: > I would expect net stack, page pool, driver still see the 'struct page', > only memory provider see the specific struct for itself, for the above, > devmem memory provider sees the 'struct page_pool_iov'. You can't lie to the driver that an _iov is a page either. The driver must explicitly "opt-in" to using the _iov variant, by calling the _iov set of APIs. Only drivers which can support header-data split can reasonably use the _iov API, for data pages.
Am 14.11.23 um 14:16 schrieb Jason Gunthorpe: > A non-mm owned struct page should not be > passed into mm calls. It is just way too hacky to be seriously > considered :( Can we please print this sentence on T-Shirts? Or framed on the wall of meeting rooms? I don't know how often I had to repeat that to people. Regards, Christian.
On 2023/11/14 21:16, Jason Gunthorpe wrote: > On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote: > >> Actually because you put the 'strtuct page for devmem' in >> skb->bv_frag, the net stack will grab the 'struct page' for devmem >> using skb_frag_page() then call things like page_address(), kmap, >> get_page, put_page, etc, etc, etc. > > Yikes, please no. If net has its own struct page look alike it has to > stay entirely inside net. A non-mm owned struct page should not be > passed into mm calls. It is just way too hacky to be seriously > considered :( Yes, that is something this patchset is trying to do, defining its own struct page look alike for page pool to support devmem. struct page for devmem will not be called into the mm subsystem, so most of the mm calls is avoided by calling into the devmem memory provider' ops instead of calling mm calls. As far as I see for now, only page_ref_count(), page_is_pfmemalloc() and PageTail() is called for devmem page, which should be easy to ensure that those call for devmem page is consistent with the struct page owned by mm. I am not sure if we can use some kind of compile/runtime checking to ensure those kinds of consistency? > >>> I would expect net stack, page pool, driver still see the 'struct page', >>> only memory provider see the specific struct for itself, for the above, >>> devmem memory provider sees the 'struct page_pool_iov'. >>> >>> The reason I still expect driver to see the 'struct page' is that driver >>> will still need to support normal memory besides devmem. > > I wouldn't say this approach is unreasonable, but it does have to be > done carefully to isolate the mm. Keeping the struct page in the API > is going to make this very hard. I would expect that most of the isolation is done in page pool, as far as I can see: 1. For control part: the driver may need to tell the page pool which memory provider it want to use. 
Or the administrator specifies which memory provider to use by some netlink-based cmd. 2. For the data part: I am thinking that the driver should only call page_pool_alloc(), page_pool_free() and page_pool_get_dma_addr related functions. Of course the driver may need to be aware of whether it can call kmap() or page_address() on the page returned from page_pool_alloc(), and maybe tell the net stack that those pages are not kmap()'able and page_address()'able. > > Jason > . >
On 2023/11/14 23:41, Willem de Bruijn wrote: >> >> I am not sure dma-buf maintainer's concern is still there with this patchset. >> >> Whatever name you calling it for the struct, however you arrange each field >> in the struct, some metadata is always needed for dmabuf to intergrate into >> page pool. >> >> If the above is true, why not utilize the 'struct page' to have more unified >> handling? > > My understanding is that there is a general preference to simplify struct > page, and at the least not move in the other direction by overloading the > struct in new ways. As my understanding, the new struct is just mirroring the struct page pool is already using, see: https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119 If there is simplifying to the struct page_pool is using, I think the new stuct the devmem memory provider is using can adjust accordingly. As a matter of fact, I think the way 'struct page' for devmem is decoupled from mm subsystem may provide a way to simplify or decoupled the already existing 'struct page' used in netstack from mm subsystem, before this patchset, it seems we have the below types of 'struct page': 1. page allocated in the netstack using page pool. 2. page allocated in the netstack using buddy allocator. 3. page allocated in other subsystem and passed to the netstack, such as zcopy or spliced page? If we can decouple 'struct page' for devmem from mm subsystem, we may be able to decouple the above 'struct page' from mm subsystem one by one. > > If using struct page for something that is not memory, there is ZONE_DEVICE. > But using that correctly is non-trivial: > > https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/ > > Since all we need is a handle that does not leave the network stack, > a network specific struct like page_pool_iov entirely avoids this issue. Yes, I am agree about the network specific struct. 
I am wondering if we can make the struct more generic if we want to intergrate it into page_pool and use it in net stack. > RFC v3 seems like a good simplification over RFC v1 in that regard to me. > I was also pleasantly surprised how minimal the change to the users of > skb_frag_t actually proved to be. Yes, I am agreed about that too. Maybe we can make it simpler by using a more abstract struct as page_pool, and utilize some features of page_pool too. For example, from the page_pool doc, page_pool have fast cache and ptr-ring cache as below, but if napi_frag_unref() call page_pool_page_put_many() and return the dmabuf chunk directly to gen_pool in the memory provider, then it seems we are bypassing the below caches in the page_pool. +------------------+ | Driver | +------------------+ ^ | | | v +--------------------------------------------+ | request memory | +--------------------------------------------+ ^ ^ | | | Pool empty | Pool has entries | | v v +-----------------------+ +------------------------+ | alloc (and map) pages | | get page from cache | +-----------------------+ +------------------------+ ^ ^ | | | cache available | No entries, refill | | from ptr-ring | | v v +-----------------+ +------------------+ | Fast cache | | ptr-ring cache | +-----------------+ +------------------+ > > . >
On 2023/11/15 6:25, Jakub Kicinski wrote: > On Tue, 14 Nov 2023 16:23:29 +0800 Yunsheng Lin wrote: >> I would expect net stack, page pool, driver still see the 'struct page', >> only memory provider see the specific struct for itself, for the above, >> devmem memory provider sees the 'struct page_pool_iov'. > > You can't lie to the driver that an _iov is a page either. Yes, agreed about that. As a matter of fact, the driver should be aware of which kind of memory provider is in use when it calls page_pool_create() during the init process. > The driver must explicitly "opt-in" to using the _iov variant, > by calling the _iov set of APIs. > > Only drivers which can support header-data split can reasonably > use the _iov API, for data pages. But those drivers can still allow allocating normal memory, right? Sometimes for data and header part, and sometimes for the header part. Do those drivers need to support two sets of APIs? The one with _iov for devmem, and the one without _iov for normal memory. It seems somewhat unnecessary from the driver's point of view to support two sets of APIs? The driver seems to know which type of page it is expecting when calling page_pool_alloc() with a specific page_pool instance. Or do we use the API with _iov to allocate both devmem and normal memory in the new driver supporting devmem page? If that is the case, does it really matter if the API is with _iov or not? > . >
On Wed, Nov 15, 2023 at 05:21:02PM +0800, Yunsheng Lin wrote: > >>> I would expect net stack, page pool, driver still see the 'struct page', > >>> only memory provider see the specific struct for itself, for the above, > >>> devmem memory provider sees the 'struct page_pool_iov'. > >>> > >>> The reason I still expect driver to see the 'struct page' is that driver > >>> will still need to support normal memory besides devmem. > > > > I wouldn't say this approach is unreasonable, but it does have to be > > done carefully to isolate the mm. Keeping the struct page in the API > > is going to make this very hard. > > I would expect that most of the isolation is done in page pool, as far as > I can see: It is the sort of thing that is important enough it should have compiler help via types to prove that it is being done properly. Otherwise it will be full of mistakes over time. Jason
On Wed, Nov 15, 2023 at 1:21 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2023/11/14 21:16, Jason Gunthorpe wrote: > > On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote: > > > >> Actually because you put the 'strtuct page for devmem' in > >> skb->bv_frag, the net stack will grab the 'struct page' for devmem > >> using skb_frag_page() then call things like page_address(), kmap, > >> get_page, put_page, etc, etc, etc. > > > > Yikes, please no. If net has its own struct page look alike it has to > > stay entirely inside net. A non-mm owned struct page should not be > > passed into mm calls. It is just way too hacky to be seriously > > considered :( > > Yes, that is something this patchset is trying to do, defining its own > struct page look alike for page pool to support devmem. > > struct page for devmem will not be called into the mm subsystem, so most > of the mm calls is avoided by calling into the devmem memory provider' > ops instead of calling mm calls. > > As far as I see for now, only page_ref_count(), page_is_pfmemalloc() and > PageTail() is called for devmem page, which should be easy to ensure that > those call for devmem page is consistent with the struct page owned by mm. I'm not sure this is true. These 3 calls are just the calls you're aware of. In your proposal you're casting mirror pages into page* and releasing them into the net stack. You need to scrub the entire net stack for mm calls, i.e. all driver code and all skb_frag_page() call sites. Of the top of my head, the driver is probably calling page_address() and illegal_highdma() is calling PageHighMem(). TCP zerocopy receive is calling vm_insert_pages(). > I am not sure if we can use some kind of compile/runtime checking to ensure > those kinds of consistency? 
> > > > >>> I would expect net stack, page pool, driver still see the 'struct page', > >>> only memory provider see the specific struct for itself, for the above, > >>> devmem memory provider sees the 'struct page_pool_iov'. > >>> > >>> The reason I still expect driver to see the 'struct page' is that driver > >>> will still need to support normal memory besides devmem. > > > > I wouldn't say this approach is unreasonable, but it does have to be > > done carefully to isolate the mm. Keeping the struct page in the API > > is going to make this very hard. > > I would expect that most of the isolation is done in page pool, as far as > I can see: > > 1. For control part: the driver may need to tell the page pool which memory > provider it want to use. Or the administrator specifies > which memory provider to use by some netlink-based cmd. > > 2. For data part: I am thinking that driver should only call page_pool_alloc(), > page_pool_free() and page_pool_get_dma_addr related function. > > Of course the driver may need to be aware of that if it can call kmap() or > page_address() on the page returned from page_pool_alloc(), and maybe tell > net stack that those pages is not kmap()'able and page_address()'able. > > > > > Jason > > . > >
On 11/15/23 2:21 AM, Yunsheng Lin wrote: > On 2023/11/14 21:16, Jason Gunthorpe wrote: >> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote: >> >>> Actually because you put the 'strtuct page for devmem' in >>> skb->bv_frag, the net stack will grab the 'struct page' for devmem >>> using skb_frag_page() then call things like page_address(), kmap, >>> get_page, put_page, etc, etc, etc. >> >> Yikes, please no. If net has its own struct page look alike it has to >> stay entirely inside net. A non-mm owned struct page should not be >> passed into mm calls. It is just way too hacky to be seriously >> considered :( > > Yes, that is something this patchset is trying to do, defining its own > struct page look alike for page pool to support devmem. > Networking needs to be able to move away from struct page references. The devmem and host memory for Rx use cases do not need to be page based.
On Wed, Nov 15, 2023 at 1:29 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2023/11/14 23:41, Willem de Bruijn wrote: > >> > >> I am not sure dma-buf maintainer's concern is still there with this patchset. > >> > >> Whatever name you calling it for the struct, however you arrange each field > >> in the struct, some metadata is always needed for dmabuf to intergrate into > >> page pool. > >> > >> If the above is true, why not utilize the 'struct page' to have more unified > >> handling? > > > > My understanding is that there is a general preference to simplify struct > > page, and at the least not move in the other direction by overloading the > > struct in new ways. > > As my understanding, the new struct is just mirroring the struct page pool > is already using, see: > https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119 > > If there is simplifying to the struct page_pool is using, I think the new > stuct the devmem memory provider is using can adjust accordingly. > > As a matter of fact, I think the way 'struct page' for devmem is decoupled > from mm subsystem may provide a way to simplify or decoupled the already > existing 'struct page' used in netstack from mm subsystem, before this > patchset, it seems we have the below types of 'struct page': > 1. page allocated in the netstack using page pool. > 2. page allocated in the netstack using buddy allocator. > 3. page allocated in other subsystem and passed to the netstack, such as > zcopy or spliced page? > > If we can decouple 'struct page' for devmem from mm subsystem, we may be able > to decouple the above 'struct page' from mm subsystem one by one. > > > > > If using struct page for something that is not memory, there is ZONE_DEVICE. 
> > But using that correctly is non-trivial: > > > > https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/ > > > > Since all we need is a handle that does not leave the network stack, > > a network specific struct like page_pool_iov entirely avoids this issue. > > Yes, I am agree about the network specific struct. > I am wondering if we can make the struct more generic if we want to > intergrate it into page_pool and use it in net stack. > > > RFC v3 seems like a good simplification over RFC v1 in that regard to me. > > I was also pleasantly surprised how minimal the change to the users of > > skb_frag_t actually proved to be. > > Yes, I am agreed about that too. Maybe we can make it simpler by using > a more abstract struct as page_pool, and utilize some features of > page_pool too. > > For example, from the page_pool doc, page_pool have fast cache and > ptr-ring cache as below, but if napi_frag_unref() call > page_pool_page_put_many() and return the dmabuf chunk directly to > gen_pool in the memory provider, then it seems we are bypassing the > below caches in the page_pool. > I think you're just misunderstanding the code. The page recycling works with my patchset. napi_frag_unref() calls napi_pp_put_page() if recycle == true, and that works the same with devmem as with regular pages. If recycle == false, we call page_pool_page_put_many() which will call put_page() for regular pages and page_pool_iov_put_many() for devmem pages. So, the memory recycling works exactly the same as before with devmem as with regular pages. In my tests I do see the devmem being recycled correctly. We are not bypassing any caches. 
> +------------------+ > | Driver | > +------------------+ > ^ > | > | > | > v > +--------------------------------------------+ > | request memory | > +--------------------------------------------+ > ^ ^ > | | > | Pool empty | Pool has entries > | | > v v > +-----------------------+ +------------------------+ > | alloc (and map) pages | | get page from cache | > +-----------------------+ +------------------------+ > ^ ^ > | | > | cache available | No entries, refill > | | from ptr-ring > | | > v v > +-----------------+ +------------------+ > | Fast cache | | ptr-ring cache | > +-----------------+ +------------------+ > > > > > > . > >
On Wed, Nov 15, 2023 at 10:07 AM Mina Almasry <almasrymina@google.com> wrote: > > On Wed, Nov 15, 2023 at 1:29 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > > > On 2023/11/14 23:41, Willem de Bruijn wrote: > > >> > > >> I am not sure dma-buf maintainer's concern is still there with this patchset. > > >> > > >> Whatever name you calling it for the struct, however you arrange each field > > >> in the struct, some metadata is always needed for dmabuf to intergrate into > > >> page pool. > > >> > > >> If the above is true, why not utilize the 'struct page' to have more unified > > >> handling? > > > > > > My understanding is that there is a general preference to simplify struct > > > page, and at the least not move in the other direction by overloading the > > > struct in new ways. > > > > As my understanding, the new struct is just mirroring the struct page pool > > is already using, see: > > https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119 > > > > If there is simplifying to the struct page_pool is using, I think the new > > stuct the devmem memory provider is using can adjust accordingly. > > > > As a matter of fact, I think the way 'struct page' for devmem is decoupled > > from mm subsystem may provide a way to simplify or decoupled the already > > existing 'struct page' used in netstack from mm subsystem, before this > > patchset, it seems we have the below types of 'struct page': > > 1. page allocated in the netstack using page pool. > > 2. page allocated in the netstack using buddy allocator. > > 3. page allocated in other subsystem and passed to the netstack, such as > > zcopy or spliced page? > > > > If we can decouple 'struct page' for devmem from mm subsystem, we may be able > > to decouple the above 'struct page' from mm subsystem one by one. > > > > > > > > If using struct page for something that is not memory, there is ZONE_DEVICE. 
> > > But using that correctly is non-trivial: > > > > > > https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/ > > > > > > Since all we need is a handle that does not leave the network stack, > > > a network specific struct like page_pool_iov entirely avoids this issue. > > > > Yes, I am agree about the network specific struct. > > I am wondering if we can make the struct more generic if we want to > > intergrate it into page_pool and use it in net stack. > > > > > RFC v3 seems like a good simplification over RFC v1 in that regard to me. > > > I was also pleasantly surprised how minimal the change to the users of > > > skb_frag_t actually proved to be. > > > > Yes, I am agreed about that too. Maybe we can make it simpler by using > > a more abstract struct as page_pool, and utilize some features of > > page_pool too. > > > > For example, from the page_pool doc, page_pool have fast cache and > > ptr-ring cache as below, but if napi_frag_unref() call > > page_pool_page_put_many() and return the dmabuf chunk directly to > > gen_pool in the memory provider, then it seems we are bypassing the > > below caches in the page_pool. > > > > I think you're just misunderstanding the code. The page recycling > works with my patchset. napi_frag_unref() calls napi_pp_put_page() if > recycle == true, and that works the same with devmem as with regular > pages. > > If recycle == false, we call page_pool_page_put_many() which will call > put_page() for regular pages and page_pool_iov_put_many() for devmem > pages. So, the memory recycling works exactly the same as before with > devmem as with regular pages. In my tests I do see the devmem being > recycled correctly. We are not bypassing any caches. > > Ah, taking a closer look here, the devmem recycling works for me but I think that's a side effect to the fact that the page_pool support I implemented with GVE is unusual. I currently allocate pages from the page_pool but do not set skb_mark_for_recycle(). 
The page recycling still happens when GVE is done with the page and calls page_pool_put_full_pgae(), as that eventually checks the refcount on the devmem and recycles it. I will fix up the GVE to call skb_mark_for_recycle() and ensure the napi_pp_put_page() path recycles the devmem or page correctly in the next version. > > +------------------+ > > | Driver | > > +------------------+ > > ^ > > | > > | > > | > > v > > +--------------------------------------------+ > > | request memory | > > +--------------------------------------------+ > > ^ ^ > > | | > > | Pool empty | Pool has entries > > | | > > v v > > +-----------------------+ +------------------------+ > > | alloc (and map) pages | | get page from cache | > > +-----------------------+ +------------------------+ > > ^ ^ > > | | > > | cache available | No entries, refill > > | | from ptr-ring > > | | > > v v > > +-----------------+ +------------------+ > > | Fast cache | | ptr-ring cache | > > +-----------------+ +------------------+ > > > > > > > > > > . > > > > > > > -- > Thanks, > Mina
On 2023/11/15 21:38, Jason Gunthorpe wrote: > On Wed, Nov 15, 2023 at 05:21:02PM +0800, Yunsheng Lin wrote: > >>>>> I would expect net stack, page pool, driver still see the 'struct page', >>>>> only memory provider see the specific struct for itself, for the above, >>>>> devmem memory provider sees the 'struct page_pool_iov'. >>>>> >>>>> The reason I still expect driver to see the 'struct page' is that driver >>>>> will still need to support normal memory besides devmem. >>> >>> I wouldn't say this approach is unreasonable, but it does have to be >>> done carefully to isolate the mm. Keeping the struct page in the API >>> is going to make this very hard. >> >> I would expect that most of the isolation is done in page pool, as far as >> I can see: > > It is the sort of thing that is important enough it should have > compiler help via types to prove that it is being done > properly. Otherwise it will be full of mistakes over time. Yes, agreed. I have done something similar as willy has done for some of folio conversion as below: +#define PAGE_POOL_MATCH(pg, iov) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct page_pool_iov, iov)) +PAGE_POOL_MATCH(flags, res0); +PAGE_POOL_MATCH(pp_magic, pp_magic); +PAGE_POOL_MATCH(pp, pp); ... Not sure if we need to add new API for driver to use when the driver need the devmem support yet. > > Jason > . >
On 2023/11/16 1:44, Mina Almasry wrote: > On Wed, Nov 15, 2023 at 1:21 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: >> >> On 2023/11/14 21:16, Jason Gunthorpe wrote: >>> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote: >>> >>>> Actually because you put the 'strtuct page for devmem' in >>>> skb->bv_frag, the net stack will grab the 'struct page' for devmem >>>> using skb_frag_page() then call things like page_address(), kmap, >>>> get_page, put_page, etc, etc, etc. >>> >>> Yikes, please no. If net has its own struct page look alike it has to >>> stay entirely inside net. A non-mm owned struct page should not be >>> passed into mm calls. It is just way too hacky to be seriously >>> considered :( >> >> Yes, that is something this patchset is trying to do, defining its own >> struct page look alike for page pool to support devmem. >> >> struct page for devmem will not be called into the mm subsystem, so most >> of the mm calls is avoided by calling into the devmem memory provider' >> ops instead of calling mm calls. >> >> As far as I see for now, only page_ref_count(), page_is_pfmemalloc() and >> PageTail() is called for devmem page, which should be easy to ensure that >> those call for devmem page is consistent with the struct page owned by mm. > > I'm not sure this is true. These 3 calls are just the calls you're > aware of. In your proposal you're casting mirror pages into page* and > releasing them into the net stack. You need to scrub the entire net > stack for mm calls, i.e. all driver code and all skb_frag_page() call > sites. Of the top of my head, the driver is probably calling > page_address() and illegal_highdma() is calling PageHighMem(). TCP > zerocopy receive is calling vm_insert_pages(). For net stack part, I believe your patch below is handling to aovid those mm calls? I don't include it in this patchset as I thought it is obvious that whatever the proposal is, we always need those checking. 
Maybe we should have included it to avoid this kind of confusion. https://lore.kernel.org/all/20231106024413.2801438-10-almasrymina@google.com/ For the driver part, I was thinking that if the driver supports devmem, it should check whether it can make page_address() related calls on a specific 'struct page', or maybe we should introduce a new helper to make it obvious?
On 2023/11/16 3:05, Mina Almasry wrote: > On Wed, Nov 15, 2023 at 10:07 AM Mina Almasry <almasrymina@google.com> wrote: >> >> On Wed, Nov 15, 2023 at 1:29 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: >>> >>> On 2023/11/14 23:41, Willem de Bruijn wrote: >>>>> >>>>> I am not sure dma-buf maintainer's concern is still there with this patchset. >>>>> >>>>> Whatever name you calling it for the struct, however you arrange each field >>>>> in the struct, some metadata is always needed for dmabuf to intergrate into >>>>> page pool. >>>>> >>>>> If the above is true, why not utilize the 'struct page' to have more unified >>>>> handling? >>>> >>>> My understanding is that there is a general preference to simplify struct >>>> page, and at the least not move in the other direction by overloading the >>>> struct in new ways. >>> >>> As my understanding, the new struct is just mirroring the struct page pool >>> is already using, see: >>> https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119 >>> >>> If there is simplifying to the struct page_pool is using, I think the new >>> stuct the devmem memory provider is using can adjust accordingly. >>> >>> As a matter of fact, I think the way 'struct page' for devmem is decoupled >>> from mm subsystem may provide a way to simplify or decoupled the already >>> existing 'struct page' used in netstack from mm subsystem, before this >>> patchset, it seems we have the below types of 'struct page': >>> 1. page allocated in the netstack using page pool. >>> 2. page allocated in the netstack using buddy allocator. >>> 3. page allocated in other subsystem and passed to the netstack, such as >>> zcopy or spliced page? >>> >>> If we can decouple 'struct page' for devmem from mm subsystem, we may be able >>> to decouple the above 'struct page' from mm subsystem one by one. >>> >>>> >>>> If using struct page for something that is not memory, there is ZONE_DEVICE. 
>>>> But using that correctly is non-trivial: >>>> >>>> https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/ >>>> >>>> Since all we need is a handle that does not leave the network stack, >>>> a network specific struct like page_pool_iov entirely avoids this issue. >>> >>> Yes, I am agree about the network specific struct. >>> I am wondering if we can make the struct more generic if we want to >>> intergrate it into page_pool and use it in net stack. >>> >>>> RFC v3 seems like a good simplification over RFC v1 in that regard to me. >>>> I was also pleasantly surprised how minimal the change to the users of >>>> skb_frag_t actually proved to be. >>> >>> Yes, I am agreed about that too. Maybe we can make it simpler by using >>> a more abstract struct as page_pool, and utilize some features of >>> page_pool too. >>> >>> For example, from the page_pool doc, page_pool have fast cache and >>> ptr-ring cache as below, but if napi_frag_unref() call >>> page_pool_page_put_many() and return the dmabuf chunk directly to >>> gen_pool in the memory provider, then it seems we are bypassing the >>> below caches in the page_pool. >>> >> >> I think you're just misunderstanding the code. The page recycling >> works with my patchset. napi_frag_unref() calls napi_pp_put_page() if >> recycle == true, and that works the same with devmem as with regular >> pages. >> >> If recycle == false, we call page_pool_page_put_many() which will call >> put_page() for regular pages and page_pool_iov_put_many() for devmem >> pages. So, the memory recycling works exactly the same as before with >> devmem as with regular pages. In my tests I do see the devmem being >> recycled correctly. We are not bypassing any caches. >> >> > > Ah, taking a closer look here, the devmem recycling works for me but I > think that's a side effect to the fact that the page_pool support I > implemented with GVE is unusual. I currently allocate pages from the > page_pool but do not set skb_mark_for_recycle(). 
The page recycling > still happens when GVE is done with the page and calls > page_pool_put_full_page(), as that eventually checks the refcount on > the devmem and recycles it. > > I will fix up the GVE to call skb_mark_for_recycle() and ensure the > napi_pp_put_page() path recycles the devmem or page correctly in the > next version. What about other features? Like dma mapping optimization and frag support in the page pool. I understand that you use some trick in the gen_pool to avoid the per chunk dma_addr storage in the 'struct page_pool_iov' and do not need frag support for now. But for other memory providers, if they need that support, we probably need to extend 'struct page_pool_iov' to support those features; then we may have a 'struct page_pool_iov' that looks alike to the specific union for page_pool in 'struct page', see: https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119 We currently don't have a way to decouple the existing 'struct page' from the mm subsystem yet, to my understanding; if we don't mirror the above union in the 'struct page', we may have more 'if' checking added in the future.
On 2023/11/16 1:57, David Ahern wrote: > On 11/15/23 2:21 AM, Yunsheng Lin wrote: >> On 2023/11/14 21:16, Jason Gunthorpe wrote: >>> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote: >>> >>>> Actually because you put the 'strtuct page for devmem' in >>>> skb->bv_frag, the net stack will grab the 'struct page' for devmem >>>> using skb_frag_page() then call things like page_address(), kmap, >>>> get_page, put_page, etc, etc, etc. >>> >>> Yikes, please no. If net has its own struct page look alike it has to >>> stay entirely inside net. A non-mm owned struct page should not be >>> passed into mm calls. It is just way too hacky to be seriously >>> considered :( >> >> Yes, that is something this patchset is trying to do, defining its own >> struct page look alike for page pool to support devmem. >> > > Networking needs to be able to move away from struct page references. > The devmem and host memory for Rx use cases do not need to be page based. Yes, I am agreed the ultimate goal is to move away from struct page references. But I am not sure if we can do it right away as there still are different types of existing 'struct page' in the netstack, see: https://lore.kernel.org/all/8b7d25eb-1f10-3e37-8753-92b42da3fb34@huawei.com/ > > > . >
On Thu, Nov 16, 2023 at 3:12 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2023/11/16 3:05, Mina Almasry wrote: > > On Wed, Nov 15, 2023 at 10:07 AM Mina Almasry <almasrymina@google.com> wrote: > >> > >> On Wed, Nov 15, 2023 at 1:29 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > >>> > >>> On 2023/11/14 23:41, Willem de Bruijn wrote: > >>>>> > >>>>> I am not sure dma-buf maintainer's concern is still there with this patchset. > >>>>> > >>>>> Whatever name you calling it for the struct, however you arrange each field > >>>>> in the struct, some metadata is always needed for dmabuf to intergrate into > >>>>> page pool. > >>>>> > >>>>> If the above is true, why not utilize the 'struct page' to have more unified > >>>>> handling? > >>>> > >>>> My understanding is that there is a general preference to simplify struct > >>>> page, and at the least not move in the other direction by overloading the > >>>> struct in new ways. > >>> > >>> As my understanding, the new struct is just mirroring the struct page pool > >>> is already using, see: > >>> https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119 > >>> > >>> If there is simplifying to the struct page_pool is using, I think the new > >>> stuct the devmem memory provider is using can adjust accordingly. > >>> > >>> As a matter of fact, I think the way 'struct page' for devmem is decoupled > >>> from mm subsystem may provide a way to simplify or decoupled the already > >>> existing 'struct page' used in netstack from mm subsystem, before this > >>> patchset, it seems we have the below types of 'struct page': > >>> 1. page allocated in the netstack using page pool. > >>> 2. page allocated in the netstack using buddy allocator. > >>> 3. page allocated in other subsystem and passed to the netstack, such as > >>> zcopy or spliced page? 
> >>> > >>> If we can decouple 'struct page' for devmem from mm subsystem, we may be able > >>> to decouple the above 'struct page' from mm subsystem one by one. > >>> > >>>> > >>>> If using struct page for something that is not memory, there is ZONE_DEVICE. > >>>> But using that correctly is non-trivial: > >>>> > >>>> https://lore.kernel.org/all/ZKyZBbKEpmkFkpWV@ziepe.ca/ > >>>> > >>>> Since all we need is a handle that does not leave the network stack, > >>>> a network specific struct like page_pool_iov entirely avoids this issue. > >>> > >>> Yes, I am agree about the network specific struct. > >>> I am wondering if we can make the struct more generic if we want to > >>> intergrate it into page_pool and use it in net stack. > >>> > >>>> RFC v3 seems like a good simplification over RFC v1 in that regard to me. > >>>> I was also pleasantly surprised how minimal the change to the users of > >>>> skb_frag_t actually proved to be. > >>> > >>> Yes, I am agreed about that too. Maybe we can make it simpler by using > >>> a more abstract struct as page_pool, and utilize some features of > >>> page_pool too. > >>> > >>> For example, from the page_pool doc, page_pool have fast cache and > >>> ptr-ring cache as below, but if napi_frag_unref() call > >>> page_pool_page_put_many() and return the dmabuf chunk directly to > >>> gen_pool in the memory provider, then it seems we are bypassing the > >>> below caches in the page_pool. > >>> > >> > >> I think you're just misunderstanding the code. The page recycling > >> works with my patchset. napi_frag_unref() calls napi_pp_put_page() if > >> recycle == true, and that works the same with devmem as with regular > >> pages. > >> > >> If recycle == false, we call page_pool_page_put_many() which will call > >> put_page() for regular pages and page_pool_iov_put_many() for devmem > >> pages. So, the memory recycling works exactly the same as before with > >> devmem as with regular pages. 
In my tests I do see the devmem being > >> recycled correctly. We are not bypassing any caches. > >> > >> > > > > Ah, taking a closer look here, the devmem recycling works for me but I > > think that's a side effect of the fact that the page_pool support I > > implemented with GVE is unusual. I currently allocate pages from the > > page_pool but do not set skb_mark_for_recycle(). The page recycling > > still happens when GVE is done with the page and calls > > page_pool_put_full_page(), as that eventually checks the refcount on > > the devmem and recycles it. > > > > I will fix up the GVE to call skb_mark_for_recycle() and ensure the > > napi_pp_put_page() path recycles the devmem or page correctly in the > > next version. > > What about other features? Like dma mapping optimization and frag support > in the page pool. > PP_FLAG_DMA_MAP will be supported and required in the next version per Jakub's request. frag support is something I disabled in the initial versions of the patchset, but only out of convenience and to simplify the initial implementation. At google we typically use page aligned MSS and the frag support isn't really that useful for us. I don't see an issue extending frag support to devmem and iovs in the future if needed. We'd probably add the pp_frag_count field to page_pool_iov and handle it similarly to how it's handled for pages. > I understand that you use some trick in the gen_pool to avoid the per chunk > dma_addr storage in the 'struct page_pool_iov' and do not need frag support > for now.
> > But for other memory provider, if they need those supports, we probably need > to extend 'struct page_pool_iov' to support those features, then we may have > a 'struct page_pool_iov' to be looking alike to the specific union for page_pool > in 'struct page', see: > > https://elixir.free-electrons.com/linux/v6.7-rc1/source/include/linux/mm_types.h#L119 > > We currently don't have a way to decouple the existing 'struct page' from mm > subsystem yet as my understanding, if we don't mirror the above union in the > 'struct page', we may have more 'if' checking added in the future.
On Thu, Nov 16, 2023 at 07:10:01PM +0800, Yunsheng Lin wrote: > On 2023/11/15 21:38, Jason Gunthorpe wrote: > > On Wed, Nov 15, 2023 at 05:21:02PM +0800, Yunsheng Lin wrote: > > > >>>>> I would expect net stack, page pool, driver still see the 'struct page', > >>>>> only memory provider see the specific struct for itself, for the above, > >>>>> devmem memory provider sees the 'struct page_pool_iov'. > >>>>> > >>>>> The reason I still expect driver to see the 'struct page' is that driver > >>>>> will still need to support normal memory besides devmem. > >>> > >>> I wouldn't say this approach is unreasonable, but it does have to be > >>> done carefully to isolate the mm. Keeping the struct page in the API > >>> is going to make this very hard. > >> > >> I would expect that most of the isolation is done in page pool, as far as > >> I can see: > > > > It is the sort of thing that is important enough it should have > > compiler help via types to prove that it is being done > > properly. Otherwise it will be full of mistakes over time. > > Yes, agreed. > > I have done something similar as willy has done for some of > folio conversion as below: That is not at all what I mean, I mean you should not use struct page * types at all in code that flows from the _iov version except via limited accessors that can be audited and have appropriate assertions. Just releasing struct page * that is not a struct page * everywhere without type safety will never be correct long term. Jason
On 11/16/23 4:12 AM, Yunsheng Lin wrote: > On 2023/11/16 1:57, David Ahern wrote: >> On 11/15/23 2:21 AM, Yunsheng Lin wrote: >>> On 2023/11/14 21:16, Jason Gunthorpe wrote: >>>> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote: >>>> >>>>> Actually because you put the 'strtuct page for devmem' in >>>>> skb->bv_frag, the net stack will grab the 'struct page' for devmem >>>>> using skb_frag_page() then call things like page_address(), kmap, >>>>> get_page, put_page, etc, etc, etc. >>>> >>>> Yikes, please no. If net has its own struct page look alike it has to >>>> stay entirely inside net. A non-mm owned struct page should not be >>>> passed into mm calls. It is just way too hacky to be seriously >>>> considered :( >>> >>> Yes, that is something this patchset is trying to do, defining its own >>> struct page look alike for page pool to support devmem. >>> >> >> Networking needs to be able to move away from struct page references. >> The devmem and host memory for Rx use cases do not need to be page based. > > Yes, I am agreed the ultimate goal is to move away from struct page > references. But I am not sure if we can do it right away as there > still are different types of existing 'struct page' in the netstack, > see: > > https://lore.kernel.org/all/8b7d25eb-1f10-3e37-8753-92b42da3fb34@huawei.com/ yes, that is the point of a blended approach -- pages and buffers (or iov) -- leveraging the LSB of the address. That proposal is the right direction to be moving for co-existence. Adding fake struct page instances is the wrong direction.
On 2023/11/16 23:58, David Ahern wrote: > On 11/16/23 4:12 AM, Yunsheng Lin wrote: >> On 2023/11/16 1:57, David Ahern wrote: >>> On 11/15/23 2:21 AM, Yunsheng Lin wrote: >>>> On 2023/11/14 21:16, Jason Gunthorpe wrote: >>>>> On Tue, Nov 14, 2023 at 04:21:26AM -0800, Mina Almasry wrote: >>>>> >>>>>> Actually because you put the 'struct page for devmem' in >>>>>> skb->bv_frag, the net stack will grab the 'struct page' for devmem >>>>>> using skb_frag_page() then call things like page_address(), kmap, >>>>>> get_page, put_page, etc, etc, etc. >>>>> >>>>> Yikes, please no. If net has its own struct page look alike it has to >>>>> stay entirely inside net. A non-mm owned struct page should not be >>>>> passed into mm calls. It is just way too hacky to be seriously >>>>> considered :( >>>> >>>> Yes, that is something this patchset is trying to do, defining its own >>>> struct page look alike for page pool to support devmem. >>>> >>> >>> Networking needs to be able to move away from struct page references. >>> The devmem and host memory for Rx use cases do not need to be page based. >> >> Yes, I am agreed the ultimate goal is to move away from struct page >> references. But I am not sure if we can do it right away as there >> still are different types of existing 'struct page' in the netstack, >> see: >> >> https://lore.kernel.org/all/8b7d25eb-1f10-3e37-8753-92b42da3fb34@huawei.com/ > > yes, that is the point of a blended approach -- pages and buffers (or > iov) -- leveraging the LSB of the address. That proposal is the right I am not sure leveraging the LSB of the address is necessary yet, as it does not seem to provide the type check protection, it seems to just provide a way to demux between pages(including page pool owned page and non-page pool owned page) and page pool owned buffer. That info is available through the page->pp_magic and page->pp->mp_* too if we mirror the page pool specific union in 'struct page'. > direction to be moving for co-existence. 
Adding fake struct page > instances is the wrong direction. Perhaps a fake struct page with type check protection is the right direction? Integrating devmem to page pool without a unified metadata between pages and buffers or without a proper abstract layer does not seem like the good direction either. > . >
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 5e4fcd45ba50..52e4cf98ebc6 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -124,6 +124,7 @@ struct mem_provider; enum pp_memory_provider_type { __PP_MP_NONE, /* Use system allocator directly */ + PP_MP_DMABUF_DEVMEM, /* dmabuf devmem provider */ }; struct pp_memory_provider_ops { @@ -134,6 +135,33 @@ struct pp_memory_provider_ops { void (*free_pages)(struct page_pool *pool, struct page *page); }; +extern const struct pp_memory_provider_ops dmabuf_devmem_ops; + +struct page_pool_iov { + unsigned long res0; + unsigned long pp_magic; + struct page_pool *pp; + struct page *page; /* dmabuf memory provider specific field */ + unsigned long dma_addr; + atomic_long_t pp_frag_count; + unsigned int res1; + refcount_t _refcount; +}; + +#define PAGE_POOL_MATCH(pg, iov) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct page_pool_iov, iov)) +PAGE_POOL_MATCH(flags, res0); +PAGE_POOL_MATCH(pp_magic, pp_magic); +PAGE_POOL_MATCH(pp, pp); +PAGE_POOL_MATCH(_pp_mapping_pad, page); +PAGE_POOL_MATCH(dma_addr, dma_addr); +PAGE_POOL_MATCH(pp_frag_count, pp_frag_count); +PAGE_POOL_MATCH(_mapcount, res1); +PAGE_POOL_MATCH(_refcount, _refcount); +#undef PAGE_POOL_MATCH +static_assert(sizeof(struct page_pool_iov) <= sizeof(struct page)); + struct page_pool { struct page_pool_params p; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 6c502bea842b..1bd7a2306f09 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -231,6 +231,9 @@ static int page_pool_init(struct page_pool *pool, switch (pool->p.memory_provider) { case __PP_MP_NONE: break; + case PP_MP_DMABUF_DEVMEM: + pool->mp_ops = &dmabuf_devmem_ops; + break; default: err = -EINVAL; goto free_ptr_ring; @@ -1010,3 +1013,93 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +/*** "Dmabuf devmem memory provider" ***/ + +static int 
mp_dmabuf_devmem_init(struct page_pool *pool) +{ + if (pool->p.flags & PP_FLAG_DMA_MAP || + pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + return -EOPNOTSUPP; + return 0; +} + +static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool, + gfp_t gfp) +{ + struct page_pool_iov *ppiov; + struct page *page; + dma_addr_t dma; + + ppiov = kvmalloc(sizeof(*ppiov), gfp | __GFP_ZERO); + if (!ppiov) + return NULL; + + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (!page) { + kvfree(ppiov); + return NULL; + } + + dma = dma_map_page_attrs(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING); + if (dma_mapping_error(pool->p.dev, dma)) { + put_page(page); + kvfree(ppiov); + return NULL; + } + + ppiov->pp = pool; + ppiov->pp_magic = PP_SIGNATURE; + ppiov->page = page; + refcount_set(&ppiov->_refcount, 1); + page_pool_fragment_page((struct page *)ppiov, 1); + page_pool_set_dma_addr((struct page *)ppiov, dma); + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, (struct page *)ppiov, + pool->pages_state_hold_cnt); + return (struct page *)ppiov; +} + +static void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ +} + +static void mp_dmabuf_devmem_release_page(struct page_pool *pool, + struct page *page) +{ + struct page_pool_iov *ppiov = (struct page_pool_iov *)page; + dma_addr_t dma; + + dma = page_pool_get_dma_addr(page); + + /* When page is unmapped, it cannot be returned to our pool */ + dma_unmap_page_attrs(pool->p.dev, dma, + PAGE_SIZE << pool->p.order, pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); + page_pool_set_dma_addr(page, 0); + + put_page(ppiov->page); +} + +static void mp_dmabuf_devmem_free_pages(struct page_pool *pool, + struct page *page) +{ + int count; + + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, page, count); + + kvfree(page); +} + +const struct pp_memory_provider_ops 
dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_pages = mp_dmabuf_devmem_alloc_pages, + .release_page = mp_dmabuf_devmem_release_page, + .free_pages = mp_dmabuf_devmem_free_pages, +}; +EXPORT_SYMBOL(dmabuf_devmem_ops);