
[RFC,2/3] dma-buf: system_heap: Add pagepool support to system heap

Message ID 20201217230612.32397-2-john.stultz@linaro.org (mailing list archive)
State New, archived
Series [RFC,1/3] dma-buf: heaps: Add deferred-free-helper library code

Commit Message

John Stultz Dec. 17, 2020, 11:06 p.m. UTC
Reuse/abuse the pagepool code from the network code to speed
up allocation performance.

This is similar to the ION pagepool usage, but tries to
utilize generic code instead of a custom implementation.

Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Liam Mark <lmark@codeaurora.org>
Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
Cc: Laura Abbott <labbott@kernel.org>
Cc: Brian Starkey <Brian.Starkey@arm.com>
Cc: Hridya Valsaraju <hridya@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sandeep Patil <sspatil@google.com>
Cc: Daniel Mentz <danielmentz@google.com>
Cc: Ørjan Eide <orjan.eide@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ezequiel Garcia <ezequiel@collabora.com>
Cc: Simon Ser <contact@emersion.fr>
Cc: James Jones <jajones@nvidia.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 drivers/dma-buf/heaps/Kconfig       |  1 +
 drivers/dma-buf/heaps/system_heap.c | 68 +++++++++++++++++++++++++++--
 2 files changed, 65 insertions(+), 4 deletions(-)
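
In short, the heap keeps one page_pool per supported allocation order and routes both allocation and release through it. A condensed sketch of that flow, distilled from the diff below (the helper names here are illustrative, not part of the patch):

#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <net/page_pool.h>

static const unsigned int orders[] = {8, 4, 0};
#define NUM_ORDERS ARRAY_SIZE(orders)
static struct page_pool *pools[NUM_ORDERS];

/* One pool per order, created once at heap init. */
static int create_order_pools(void)
{
        int i;

        for (i = 0; i < NUM_ORDERS; i++) {
                struct page_pool_params pp = { .order = orders[i] };

                pools[i] = page_pool_create(&pp);
                if (IS_ERR(pools[i]))
                        return PTR_ERR(pools[i]);
        }
        return 0;
}

/* Allocation path: pull from the pool instead of calling alloc_pages(). */
static struct page *pool_alloc(unsigned int idx, gfp_t gfp)
{
        return page_pool_alloc_pages(pools[idx], gfp);
}

/* Release path: give the page back to its pool instead of __free_pages(). */
static void pool_release(unsigned int idx, struct page *page)
{
        page_pool_put_full_page(pools[idx], page, false);
}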

Comments

Daniel Vetter Dec. 18, 2020, 2:36 p.m. UTC | #1
On Thu, Dec 17, 2020 at 11:06:11PM +0000, John Stultz wrote:
> Reuse/abuse the pagepool code from the network code to speed
> up allocation performance.
> 
> This is similar to the ION pagepool usage, but tries to
> utilize generic code instead of a custom implementation.
> 
> Cc: Sumit Semwal <sumit.semwal@linaro.org>
> Cc: Liam Mark <lmark@codeaurora.org>
> Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
> Cc: Laura Abbott <labbott@kernel.org>
> Cc: Brian Starkey <Brian.Starkey@arm.com>
> Cc: Hridya Valsaraju <hridya@google.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Sandeep Patil <sspatil@google.com>
> Cc: Daniel Mentz <danielmentz@google.com>
> Cc: Ørjan Eide <orjan.eide@arm.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Ezequiel Garcia <ezequiel@collabora.com>
> Cc: Simon Ser <contact@emersion.fr>
> Cc: James Jones <jajones@nvidia.com>
> Cc: linux-media@vger.kernel.org
> Cc: dri-devel@lists.freedesktop.org
> Signed-off-by: John Stultz <john.stultz@linaro.org>

We also have one of these in ttm. I think we should have at most one of
these for the gpu ecosystem overall, maybe as a helper that can be plugged
into all the places.

Or I'm kinda missing something, which could be since I only glanced at
yours for a bit. But it's also called page pool for buffer allocations,
and I don't think there's that many ways to implement that really :-)
-Daniel

> ---
>  drivers/dma-buf/heaps/Kconfig       |  1 +
>  drivers/dma-buf/heaps/system_heap.c | 68 +++++++++++++++++++++++++++--
>  2 files changed, 65 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/dma-buf/heaps/Kconfig b/drivers/dma-buf/heaps/Kconfig
> index ecf65204f714..fa5e1c330cce 100644
> --- a/drivers/dma-buf/heaps/Kconfig
> +++ b/drivers/dma-buf/heaps/Kconfig
> @@ -4,6 +4,7 @@ config DMABUF_HEAPS_DEFERRED_FREE
>  config DMABUF_HEAPS_SYSTEM
>  	bool "DMA-BUF System Heap"
>  	depends on DMABUF_HEAPS
> +	select PAGE_POOL
>  	help
>  	  Choose this option to enable the system dmabuf heap. The system heap
>  	  is backed by pages from the buddy allocator. If in doubt, say Y.
> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> index 17e0e9a68baf..885e30894b77 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -20,6 +20,7 @@
>  #include <linux/scatterlist.h>
>  #include <linux/slab.h>
>  #include <linux/vmalloc.h>
> +#include <net/page_pool.h>
>  
>  static struct dma_heap *sys_heap;
>  
> @@ -53,6 +54,7 @@ static gfp_t order_flags[] = {HIGH_ORDER_GFP, LOW_ORDER_GFP, LOW_ORDER_GFP};
>   */
>  static const unsigned int orders[] = {8, 4, 0};
>  #define NUM_ORDERS ARRAY_SIZE(orders)
> +struct page_pool *pools[NUM_ORDERS];
>  
>  static struct sg_table *dup_sg_table(struct sg_table *table)
>  {
> @@ -281,18 +283,59 @@ static void system_heap_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map)
>  	dma_buf_map_clear(map);
>  }
>  
> +static int system_heap_clear_pages(struct page **pages, int num, pgprot_t pgprot)
> +{
> +	void *addr = vmap(pages, num, VM_MAP, pgprot);
> +
> +	if (!addr)
> +		return -ENOMEM;
> +	memset(addr, 0, PAGE_SIZE * num);
> +	vunmap(addr);
> +	return 0;
> +}
> +
> +static int system_heap_zero_buffer(struct system_heap_buffer *buffer)
> +{
> +	struct sg_table *sgt = &buffer->sg_table;
> +	struct sg_page_iter piter;
> +	struct page *pages[32];
> +	int p = 0;
> +	int ret = 0;
> +
> +	for_each_sgtable_page(sgt, &piter, 0) {
> +		pages[p++] = sg_page_iter_page(&piter);
> +		if (p == ARRAY_SIZE(pages)) {
> +			ret = system_heap_clear_pages(pages, p, PAGE_KERNEL);
> +			if (ret)
> +				return ret;
> +			p = 0;
> +		}
> +	}
> +	if (p)
> +		ret = system_heap_clear_pages(pages, p, PAGE_KERNEL);
> +
> +	return ret;
> +}
> +
>  static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
>  {
>  	struct system_heap_buffer *buffer = dmabuf->priv;
>  	struct sg_table *table;
>  	struct scatterlist *sg;
> -	int i;
> +	int i, j;
> +
> +	/* Zero the buffer pages before adding back to the pool */
> +	system_heap_zero_buffer(buffer);
>  
>  	table = &buffer->sg_table;
>  	for_each_sg(table->sgl, sg, table->nents, i) {
>  		struct page *page = sg_page(sg);
>  
> -		__free_pages(page, compound_order(page));
> +		for (j = 0; j < NUM_ORDERS; j++) {
> +			if (compound_order(page) == orders[j])
> +				break;
> +		}
> +		page_pool_put_full_page(pools[j], page, false);
>  	}
>  	sg_free_table(table);
>  	kfree(buffer);
> @@ -322,8 +365,7 @@ static struct page *alloc_largest_available(unsigned long size,
>  			continue;
>  		if (max_order < orders[i])
>  			continue;
> -
> -		page = alloc_pages(order_flags[i], orders[i]);
> +		page = page_pool_alloc_pages(pools[i], order_flags[i]);
>  		if (!page)
>  			continue;
>  		return page;
> @@ -428,6 +470,24 @@ static const struct dma_heap_ops system_heap_ops = {
>  static int system_heap_create(void)
>  {
>  	struct dma_heap_export_info exp_info;
> +	int i;
> +
> +	for (i = 0; i < NUM_ORDERS; i++) {
> +		struct page_pool_params pp;
> +
> +		memset(&pp, 0, sizeof(pp));
> +		pp.order = orders[i];
> +		pools[i] = page_pool_create(&pp);
> +
> +		if (IS_ERR(pools[i])) {
> +			int j;
> +
> +			pr_err("%s: page pool creation failed!\n", __func__);
> +			for (j = 0; j < i; j++)
> +				page_pool_destroy(pools[j]);
> +			return PTR_ERR(pools[i]);
> +		}
> +	}
>  
>  	exp_info.name = "system";
>  	exp_info.ops = &system_heap_ops;
> -- 
> 2.17.1
> 
John Stultz Dec. 19, 2020, 1:16 a.m. UTC | #2
On Fri, Dec 18, 2020 at 6:36 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> On Thu, Dec 17, 2020 at 11:06:11PM +0000, John Stultz wrote:
> > Reuse/abuse the pagepool code from the network code to speed
> > up allocation performance.
> >
> > This is similar to the ION pagepool usage, but tries to
> > utilize generic code instead of a custom implementation.
>
> We also have one of these in ttm. I think we should have at most one of
> these for the gpu ecosystem overall, maybe as a helper that can be plugged
> into all the places.
>
> Or I'm kinda missing something, which could be since I only glanced at
> yours for a bit. But it's also called page pool for buffer allocations,
> and I don't think there's that many ways to implement that really :-)

Yea, when I was looking around the ttm one didn't seem quite as
generic as the networking one, which more easily fit in here.

The main benefit for the system heap is not so much the pool itself
(the normal page allocator is pretty good), as it being able to defer
the free and zero the pages in a background thread, so the pool is
effectively filled with pre-zeroed pages.
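
For illustration only, deferred zeroing along those lines can be shaped roughly like the sketch below. This is not the helper from patch 1/3 of this series; the names (zero_list, zero_work, defer_zero_and_free) are hypothetical, and it assumes the per-order pools[] array from the patch:

#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <net/page_pool.h>

extern struct page_pool *pools[];       /* per-order pools from the patch */

struct deferred_page {
        struct list_head list;
        struct page *page;
        unsigned int pool_idx;          /* which pools[] entry to refill */
};

static LIST_HEAD(zero_list);
static DEFINE_SPINLOCK(zero_lock);

static void zero_worker_fn(struct work_struct *work)
{
        struct deferred_page *dp, *tmp;
        LIST_HEAD(todo);

        spin_lock(&zero_lock);
        list_splice_init(&zero_list, &todo);
        spin_unlock(&zero_lock);

        list_for_each_entry_safe(dp, tmp, &todo, list) {
                unsigned int i, nr = 1 << compound_order(dp->page);

                /* Zero page by page so highmem pages work too. */
                for (i = 0; i < nr; i++)
                        clear_highpage(nth_page(dp->page, i));

                /* The pool now hands out pre-zeroed pages on the next alloc. */
                page_pool_put_full_page(pools[dp->pool_idx], dp->page, false);
                kfree(dp);
        }
}
static DECLARE_WORK(zero_work, zero_worker_fn);

/* Called from the dma-buf release path instead of zeroing inline. */
static void defer_zero_and_free(struct page *page, unsigned int pool_idx)
{
        struct deferred_page *dp = kmalloc(sizeof(*dp), GFP_KERNEL);

        if (!dp)
                return;         /* sketch only: real code needs a fallback path */

        dp->page = page;
        dp->pool_idx = pool_idx;

        spin_lock(&zero_lock);
        list_add_tail(&dp->list, &zero_list);
        spin_unlock(&zero_lock);

        queue_work(system_unbound_wq, &zero_work);
}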

But I'll take another look at the ttm implementation and see if it can
be re-used or the shared code refactored and pulled out somehow.

thanks
-john
Daniel Vetter Dec. 21, 2020, 10:08 p.m. UTC | #3
On Fri, Dec 18, 2020 at 05:16:56PM -0800, John Stultz wrote:
> On Fri, Dec 18, 2020 at 6:36 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > On Thu, Dec 17, 2020 at 11:06:11PM +0000, John Stultz wrote:
> > > Reuse/abuse the pagepool code from the network code to speed
> > > up allocation performance.
> > >
> > > This is similar to the ION pagepool usage, but tries to
> > > utilize generic code instead of a custom implementation.
> >
> > We also have one of these in ttm. I think we should have at most one of
> > these for the gpu ecosystem overall, maybe as a helper that can be plugged
> > into all the places.
> >
> > Or I'm kinda missing something, which could be since I only glanced at
> > yours for a bit. But it's also called page pool for buffer allocations,
> > and I don't think there's that many ways to implement that really :-)
> 
> Yea, when I was looking around the ttm one didn't seem quite as
> generic as the networking one, which more easily fit in here.

Oops, I didn't look that closely and didn't realize you're reusing the one
from net/core/.

> The main benefit for the system heap is not so much the pool itself
> (the normal page allocator is pretty good), as it being able to defer
> the free and zero the pages in a background thread, so the pool is
> effectively filled with pre-zeroed pages.
> 
> But I'll take another look at the ttm implementation and see if it can
> be re-used or the shared code refactored and pulled out somehow.

I think moving the page_pool from net into lib and using it in ttm might
also be an option. Lack of shrinker in the networking one might be a bit of a
problem.

Also I guess the real fun will start if you pre-flush pages (after
zeroing) or use write-combined mode (which tends to be a real pain to
allocate). So for those I think we likely need more than just reusing the
one net/core has already.
-Daniel
John Stultz Jan. 23, 2021, 1:28 a.m. UTC | #4
On Mon, Dec 21, 2020 at 2:09 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Fri, Dec 18, 2020 at 05:16:56PM -0800, John Stultz wrote:
> > On Fri, Dec 18, 2020 at 6:36 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > On Thu, Dec 17, 2020 at 11:06:11PM +0000, John Stultz wrote:
> > > > Reuse/abuse the pagepool code from the network code to speed
> > > > up allocation performance.
> > > >
> > > > This is similar to the ION pagepool usage, but tries to
> > > > utilize generic code instead of a custom implementation.
> > >
> > > We also have one of these in ttm. I think we should have at most one of
> > > these for the gpu ecosystem overall, maybe as a helper that can be plugged
> > > into all the places.
> > >
> > > Or I'm kinda missing something, which could be since I only glanced at
> > > yours for a bit. But it's also called page pool for buffer allocations,
> > > and I don't think there's that many ways to implement that really :-)
> >
> > Yea, when I was looking around the ttm one didn't seem quite as
> > generic as the networking one, which more easily fit in here.
>
> Oops, I didn't look that closely and didn't realize you're reusing the one
> from net/core/.
>
> > The main benefit for the system heap is not so much the pool itself
> > (the normal page allocator is pretty good), as it being able to defer
> > the free and zero the pages in a background thread, so the pool is
> > effectively filled with pre-zeroed pages.
> >
> > But I'll take another look at the ttm implementation and see if it can
> > be re-used or the shared code refactored and pulled out somehow.
>
> I think moving the page_pool from net into lib and using it in ttm might
> also be an option. Lack of shrinker in the networking one might be a bit of a
> problem.

Yea. I've been looking at this, to see how to abstract out a generic
pool implementation, but each pool implementation has been tweaked for
the specific use cases, so a general abstraction is a bit tough right
off.

For example, the ttm pool handles allocations from both alloc_pages
and dma_alloc in a pool, where the net page pool only uses alloc_pages
(but can pre-map via dma_map_attr).
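
For reference, the "pre-map" part with the net/core pool is a matter of pool parameters: with PP_FLAG_DMA_MAP set, the pool DMA-maps pages as it allocates them and caches the mapping per page. A sketch under that assumption (the device pointer and pool_size value are placeholders):

#include <linux/dma-mapping.h>
#include <linux/numa.h>
#include <net/page_pool.h>

static struct page_pool *create_premapped_pool(struct device *dev)
{
        struct page_pool_params pp = {
                .flags     = PP_FLAG_DMA_MAP,   /* map pages at alloc time */
                .order     = 0,
                .pool_size = 1024,              /* fixed size, no shrinker */
                .nid       = NUMA_NO_NODE,
                .dev       = dev,               /* device to map against */
                .dma_dir   = DMA_BIDIRECTIONAL,
        };

        return page_pool_create(&pp);
}

/* The mapping the pool cached for a page it handed out. */
static dma_addr_t premapped_page_dma(struct page *page)
{
        return page_pool_get_dma_addr(page);
}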

And as you mentioned, the networking page pool is statically sized
where the ttm pool is dynamic and shrinker controlled.
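
To make the contrast concrete, "shrinker controlled" means the pool registers a shrinker so the MM can reclaim cached pages under memory pressure, along the lines sketched here. The drain helper and page counter are hypothetical; the net/core page_pool exposes no such hook today:

#include <linux/atomic.h>
#include <linux/shrinker.h>

static atomic_long_t cached_pages;      /* hypothetical: pages currently pooled */

/* hypothetical: drop up to nr pages from the pool, return how many were freed */
unsigned long pool_drain_pages(unsigned long nr);

static unsigned long pool_shrink_count(struct shrinker *s,
                                       struct shrink_control *sc)
{
        return atomic_long_read(&cached_pages);
}

static unsigned long pool_shrink_scan(struct shrinker *s,
                                      struct shrink_control *sc)
{
        return pool_drain_pages(sc->nr_to_scan);
}

static struct shrinker pool_shrinker = {
        .count_objects = pool_shrink_count,
        .scan_objects  = pool_shrink_scan,
        .seeks         = DEFAULT_SEEKS,
};

/* register_shrinker(&pool_shrinker) at init, unregister_shrinker() at teardown */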

Further, as the ttm pool is utilized for keeping pools of pages set
for specific cache types, it makes it difficult to abstract that out
as we have to be able to reset the caching (set_pages_wb()) when
shrinking, so that would also have to be pushed down into the pool
attributes as well.

So far, in my attempts to share an abstraction for both the net
page_pool and the ttm page pool, it seems to make the code complexity
worse on both sides -  so while I'm interested in continuing to try to
find a way to share code here, I'm not sure it makes sense to hold up
this series (which is already re-using an existing implementation and
provides a performance bump in microbenchmarks) for the
grand-unified-page-pool. Efforts to refactor the ttm pool and net page
pool can continue on independently, and I'd be happy to move the system
heap to whatever that ends up being.

thanks
-john
Daniel Vetter Feb. 2, 2021, 2:04 p.m. UTC | #5
On Fri, Jan 22, 2021 at 05:28:32PM -0800, John Stultz wrote:
> On Mon, Dec 21, 2020 at 2:09 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Fri, Dec 18, 2020 at 05:16:56PM -0800, John Stultz wrote:
> > > On Fri, Dec 18, 2020 at 6:36 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > On Thu, Dec 17, 2020 at 11:06:11PM +0000, John Stultz wrote:
> > > > > Reuse/abuse the pagepool code from the network code to speed
> > > > > up allocation performance.
> > > > >
> > > > > This is similar to the ION pagepool usage, but tries to
> > > > > utilize generic code instead of a custom implementation.
> > > >
> > > > We also have one of these in ttm. I think we should have at most one of
> > > > these for the gpu ecosystem overall, maybe as a helper that can be plugged
> > > > into all the places.
> > > >
> > > > Or I'm kinda missing something, which could be since I only glanced at
> > > > yours for a bit. But it's also called page pool for buffer allocations,
> > > > and I don't think there's that many ways to implement that really :-)
> > >
> > > Yea, when I was looking around the ttm one didn't seem quite as
> > > generic as the networking one, which more easily fit in here.
> >
> > Oops, I didn't look that closely and didn't realize you're reusing the one
> > from net/core/.
> >
> > > The main benefit for the system heap is not so much the pool itself
> > > (the normal page allocator is pretty good), as it being able to defer
> > > the free and zero the pages in a background thread, so the pool is
> > > effectively filled with pre-zeroed pages.
> > >
> > > But I'll take another look at the ttm implementation and see if it can
> > > be re-used or the shared code refactored and pulled out somehow.
> >
> > I think moving the page_pool from net into lib and using it in ttm might
> > also be an option. Lack of shrinker in the networking one might be a bit of a
> > problem.
> 
> Yea. I've been looking at this, to see how to abstract out a generic
> pool implementation, but each pool implementation has been tweaked for
> the specific use cases, so a general abstraction is a bit tough right
> off.
> 
> For example, the ttm pool handles allocations from both alloc_pages
> and dma_alloc in a pool, where the net page pool only uses alloc_pages
> (but can pre-map via dma_map_attr).
> 
> And as you mentioned, the networking page pool is statically sized
> where the ttm pool is dynamic and shrinker controlled.
> 
> Further, as the ttm pool is utilized for keeping pools of pages set
> for specific cache types, it makes it difficult to abstract that out
> as we have to be able to reset the caching (set_pages_wb()) when
> shrinking, so that would also have to be pushed down into the pool
> attributes as well.
> 
> So far, in my attempts to share an abstraction for both the net
> page_pool and the ttm page pool, it seems to make the code complexity
> worse on both sides -  so while I'm interested in continuing to try to
> find a way to share code here, I'm not sure it makes sense to hold up
> this series (which is already re-using an existing implementation and
> provides a performance bump in microbenchmarks) for the
> grand-unified-page-pool. Efforts to refactor the ttm pool and net page
> pool can continue on independently, and I'd be happy to move the system
> heap to whatever that ends up being.

The thing is, I'm not sure sharing code with net/core is a really good
idea, at least it seems like we have some impedance mismatch with the ttm
pool. And going forward I expect sooner or later we need alignment between
the pools/caches under drm and dma-buf heap pools a lot more than between
dma-buf and net/core.

So this feels a bit like code sharing for code sharing's sake and not
where it makes sense. Expecting net/core and gpu stacks to have the exact
same needs for a page pool allocator has good chances to bite us in the
long run.
-Daniel

> 
> thanks
> -john
John Stultz Feb. 3, 2021, 5:56 a.m. UTC | #6
On Tue, Feb 2, 2021 at 6:04 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Fri, Jan 22, 2021 at 05:28:32PM -0800, John Stultz wrote:
> > On Mon, Dec 21, 2020 at 2:09 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > On Fri, Dec 18, 2020 at 05:16:56PM -0800, John Stultz wrote:
> > > > On Fri, Dec 18, 2020 at 6:36 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > > On Thu, Dec 17, 2020 at 11:06:11PM +0000, John Stultz wrote:
> > > > > > Reuse/abuse the pagepool code from the network code to speed
> > > > > > up allocation performance.
> > > > > >
> > > > > > This is similar to the ION pagepool usage, but tries to
> > > > > > utilize generic code instead of a custom implementation.
> > > > >
> > > > > We also have one of these in ttm. I think we should have at most one of
> > > > > these for the gpu ecosystem overall, maybe as a helper that can be plugged
> > > > > into all the places.
> > > > >
> > > > > Or I'm kinda missing something, which could be since I only glanced at
> > > > > yours for a bit. But it's also called page pool for buffer allocations,
> > > > > and I don't think there's that many ways to implement that really :-)
> > > >
> > > > Yea, when I was looking around the ttm one didn't seem quite as
> > > > generic as the networking one, which more easily fit in here.
> > >
> > > Oops, I didn't look that closely and didn't realize you're reusing the one
> > > from net/core/.
> > >
> > > > The main benefit for the system heap is not so much the pool itself
> > > > (the normal page allocator is pretty good), as it being able to defer
> > > > the free and zero the pages in a background thread, so the pool is
> > > > effectively filled with pre-zeroed pages.
> > > >
> > > > But I'll take another look at the ttm implementation and see if it can
> > > > be re-used or the shared code refactored and pulled out somehow.
> > >
> > > I think moving the page_pool from net into lib and using it in ttm might
> > > also be an option. Lack of shrinker in the networking one might be a bit of a
> > > problem.
> >
> > Yea. I've been looking at this, to see how to abstract out a generic
> > pool implementation, but each pool implementation has been tweaked for
> > the specific use cases, so a general abstraction is a bit tough right
> > off.
> >
> > For example, the ttm pool handles allocations from both alloc_pages
> > and dma_alloc in a pool, where the net page pool only uses alloc_pages
> > (but can pre-map via dma_map_attr).
> >
> > And as you mentioned, the networking page pool is statically sized
> > where the ttm pool is dynamic and shrinker controlled.
> >
> > Further, as the ttm pool is utilized for keeping pools of pages set
> > for specific cache types, it makes it difficult to abstract that out
> > as we have to be able to reset the caching (set_pages_wb()) when
> > shrinking, so that would also have to be pushed down into the pool
> > attributes as well.
> >
> > So far, in my attempts to share an abstraction for both the net
> > page_pool and the ttm page pool, it seems to make the code complexity
> > worse on both sides -  so while I'm interested in continuing to try to
> > find a way to share code here, I'm not sure it makes sense to hold up
> > this series (which is already re-using an existing implementation and
> > provides a performance bump in microbenchmarks) for the
> > grand-unified-page-pool. Efforts to refactor the ttm pool and net page
> > pool can continue on independently, and I'd be happy to move the system
> > heap to whatever that ends up being.
>
> The thing is, I'm not sure sharing code with net/core is a really good
> idea, at least it seems like we have some impedance mismatch with the ttm
> pool. And going forward I expect sooner or later we need alignment between
> the pools/caches under drm and dma-buf heap pools a lot more than between
> dma-buf and net/core.

I mean...  I don't think you're wrong here, but it was your suggestion.

> So this feels a bit like code sharing for code sharing's sake and not
> where it makes sense. Expecting net/core and gpu stacks to have the exact
> same needs for a page pool allocator has good chances to bite us in the
> long run.

Again, I agree with you at the high level here (dmabuf system heap and
ttm page pooling are conceptually more likely to align, and
duplication of buffer pools is non-optimal), but there's still the
practical aspect of the ttm pool being pretty tied to the ttm code
(utilizing ttm contexts, fixed MAX_ORDER*TTM_NUM_CACHING_TYPES
subpools per pool + 4 global sub-pools for only x86).
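
Roughly the fan-out being described, paraphrased rather than quoted from the ttm headers of the time (field names approximate, not the upstream definition): each pool carries one sub-pool per (caching type, order) pair, plus x86-only global write-combined/uncached sub-pools.

/* Paraphrased sketch of the sub-pool fan-out, not the real struct ttm_pool. */
struct ttm_pool_like {
        struct device *dev;
        bool use_dma_alloc;     /* pages may come from dma_alloc */
        /* one sub-pool per caching type and allocation order */
        struct ttm_pool_type caching[TTM_NUM_CACHING_TYPES][MAX_ORDER];
};
/* ...plus global wc/uc sub-pools that are only populated on x86 */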

So... I guess I'll go for another pass at trying to pull something
generic out of the ttm_pool, but the cynic in me suspects folks will
just object to any inefficiencies added in order to do so (the
code-sharing for its own sake argument above) and I'll be back to
where I am now. But we'll see.

thanks
-john
Daniel Vetter Feb. 3, 2021, 9:46 a.m. UTC | #7
On Tue, Feb 02, 2021 at 09:56:14PM -0800, John Stultz wrote:
> On Tue, Feb 2, 2021 at 6:04 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Fri, Jan 22, 2021 at 05:28:32PM -0800, John Stultz wrote:
> > > On Mon, Dec 21, 2020 at 2:09 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > >
> > > > On Fri, Dec 18, 2020 at 05:16:56PM -0800, John Stultz wrote:
> > > > > On Fri, Dec 18, 2020 at 6:36 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > > > On Thu, Dec 17, 2020 at 11:06:11PM +0000, John Stultz wrote:
> > > > > > > Reuse/abuse the pagepool code from the network code to speed
> > > > > > > up allocation performance.
> > > > > > >
> > > > > > > This is similar to the ION pagepool usage, but tries to
> > > > > > > utilize generic code instead of a custom implementation.
> > > > > >
> > > > > > We also have one of these in ttm. I think we should have at most one of
> > > > > > these for the gpu ecosystem overall, maybe as a helper that can be plugged
> > > > > > into all the places.
> > > > > >
> > > > > > Or I'm kinda missing something, which could be since I only glanced at
> > > > > > yours for a bit. But it's also called page pool for buffer allocations,
> > > > > > and I don't think there's that many ways to implement that really :-)
> > > > >
> > > > > Yea, when I was looking around the ttm one didn't seem quite as
> > > > > generic as the networking one, which more easily fit in here.
> > > >
> > > > Oops, I didn't look that closely and didn't realize you're reusing the one
> > > > from net/core/.
> > > >
> > > > > The main benefit for the system heap is not so much the pool itself
> > > > > (the normal page allocator is pretty good), as it being able to defer
> > > > > the free and zero the pages in a background thread, so the pool is
> > > > > effectively filled with pre-zeroed pages.
> > > > >
> > > > > But I'll take another look at the ttm implementation and see if it can
> > > > > be re-used or the shared code refactored and pulled out somehow.
> > > >
> > > > I think moving the page_pool from net into lib and using it in ttm might
> > > > also be an option. Lack of shrinker in the networking one might be a bit of a
> > > > problem.
> > >
> > > Yea. I've been looking at this, to see how to abstract out a generic
> > > pool implementation, but each pool implementation has been tweaked for
> > > the specific use cases, so a general abstraction is a bit tough right
> > > off.
> > >
> > > For example, the ttm pool handles allocations from both alloc_pages
> > > and dma_alloc in a pool, where the net page pool only uses alloc_pages
> > > (but can pre-map via dma_map_attr).
> > >
> > > And as you mentioned, the networking page pool is statically sized
> > > where the ttm pool is dynamic and shrinker controlled.
> > >
> > > Further, as the ttm pool is utilized for keeping pools of pages set
> > > for specific cache types, it makes it difficult to abstract that out
> > > as we have to be able to reset the caching (set_pages_wb()) when
> > > shrinking, so that would also have to be pushed down into the pool
> > > attributes as well.
> > >
> > > So far, in my attempts to share an abstraction for both the net
> > > page_pool and the ttm page pool, it seems to make the code complexity
> > > worse on both sides -  so while I'm interested in continuing to try to
> > > find a way to share code here, I'm not sure it makes sense to hold up
> > > this series (which is already re-using an existing implementation and
> > > provides a performance bump in microbenchmarks) for the
> > > grand-unified-page-pool. Efforts to refactor the ttm pool and net page
> > > pool can continue on independently, and I'd be happy to move the system
> > > heap to whatever that ends up being.
> >
> > The thing is, I'm not sure sharing code with net/core is a really good
> > idea, at least it seems like we have some impedance mismatch with the ttm
> > pool. And going forward I expect sooner or later we need alignment between
> > the pools/caches under drm and dma-buf heap pools a lot more than between
> > dma-buf and net/core.
> 
> I mean...  I don't think you're wrong here, but it was your suggestion.
> 
> > So this feels a bit like code sharing for code sharing's sake and not
> > where it makes sense. Expecting net/core and gpu stacks to have the exact
> > same needs for a page pool allocator has good chances to bite us in the
> > long run.
> 
> Again, I agree with you at the high level here (dmabuf system heap and
> ttm page pooling are conceptually more likely to align, and
> duplication of buffer pools is non-optimal), but there's still the
> practical aspect of the ttm pool being pretty tied to the ttm code
> (utilizing ttm contexts, fixed MAX_ORDER*TTM_NUM_CACHING_TYPES
> subpools per pool + 4 global sub-pools for only x86).
> 
> So... I guess I'll go for another pass at trying to pull something
> generic out of the ttm_pool, but the cynic in me suspects folks will
> just object to any inefficiencies added in order to do so (the
> code-sharing for its own sake argument above) and I'll be back to
> where I am now. But we'll see.

Yeah realistically we're not there yet most likely. It's also a bigger
problem, with shrinkers all over various drivers and buffer locking schemes
mostly of the yolo kind.

With Android I'm just more worried than with the other parts since in
reality the actual production gpu stacks on android are all out of tree.
And so there's substantially more inertia against refactoring (in practice
at least, because the only people who care are not super willing to create
tons of work in their out-of-tree stacks). And given past progress, waiting
for Android to arrive on upstream is also not a great option really -
outside of some nice tech demos that in practice don't ship anywhere.

And without the full stack a lot of this just looks like tech debt
offloading without a whole lot of benefits to upstream.

But also, this isn't a new topic :-)

Cheers, Daniel

Patch

diff --git a/drivers/dma-buf/heaps/Kconfig b/drivers/dma-buf/heaps/Kconfig
index ecf65204f714..fa5e1c330cce 100644
--- a/drivers/dma-buf/heaps/Kconfig
+++ b/drivers/dma-buf/heaps/Kconfig
@@ -4,6 +4,7 @@  config DMABUF_HEAPS_DEFERRED_FREE
 config DMABUF_HEAPS_SYSTEM
 	bool "DMA-BUF System Heap"
 	depends on DMABUF_HEAPS
+	select PAGE_POOL
 	help
 	  Choose this option to enable the system dmabuf heap. The system heap
 	  is backed by pages from the buddy allocator. If in doubt, say Y.
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index 17e0e9a68baf..885e30894b77 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -20,6 +20,7 @@ 
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <net/page_pool.h>
 
 static struct dma_heap *sys_heap;
 
@@ -53,6 +54,7 @@  static gfp_t order_flags[] = {HIGH_ORDER_GFP, LOW_ORDER_GFP, LOW_ORDER_GFP};
  */
 static const unsigned int orders[] = {8, 4, 0};
 #define NUM_ORDERS ARRAY_SIZE(orders)
+struct page_pool *pools[NUM_ORDERS];
 
 static struct sg_table *dup_sg_table(struct sg_table *table)
 {
@@ -281,18 +283,59 @@  static void system_heap_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map)
 	dma_buf_map_clear(map);
 }
 
+static int system_heap_clear_pages(struct page **pages, int num, pgprot_t pgprot)
+{
+	void *addr = vmap(pages, num, VM_MAP, pgprot);
+
+	if (!addr)
+		return -ENOMEM;
+	memset(addr, 0, PAGE_SIZE * num);
+	vunmap(addr);
+	return 0;
+}
+
+static int system_heap_zero_buffer(struct system_heap_buffer *buffer)
+{
+	struct sg_table *sgt = &buffer->sg_table;
+	struct sg_page_iter piter;
+	struct page *pages[32];
+	int p = 0;
+	int ret = 0;
+
+	for_each_sgtable_page(sgt, &piter, 0) {
+		pages[p++] = sg_page_iter_page(&piter);
+		if (p == ARRAY_SIZE(pages)) {
+			ret = system_heap_clear_pages(pages, p, PAGE_KERNEL);
+			if (ret)
+				return ret;
+			p = 0;
+		}
+	}
+	if (p)
+		ret = system_heap_clear_pages(pages, p, PAGE_KERNEL);
+
+	return ret;
+}
+
 static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
 {
 	struct system_heap_buffer *buffer = dmabuf->priv;
 	struct sg_table *table;
 	struct scatterlist *sg;
-	int i;
+	int i, j;
+
+	/* Zero the buffer pages before adding back to the pool */
+	system_heap_zero_buffer(buffer);
 
 	table = &buffer->sg_table;
 	for_each_sg(table->sgl, sg, table->nents, i) {
 		struct page *page = sg_page(sg);
 
-		__free_pages(page, compound_order(page));
+		for (j = 0; j < NUM_ORDERS; j++) {
+			if (compound_order(page) == orders[j])
+				break;
+		}
+		page_pool_put_full_page(pools[j], page, false);
 	}
 	sg_free_table(table);
 	kfree(buffer);
@@ -322,8 +365,7 @@  static struct page *alloc_largest_available(unsigned long size,
 			continue;
 		if (max_order < orders[i])
 			continue;
-
-		page = alloc_pages(order_flags[i], orders[i]);
+		page = page_pool_alloc_pages(pools[i], order_flags[i]);
 		if (!page)
 			continue;
 		return page;
@@ -428,6 +470,24 @@  static const struct dma_heap_ops system_heap_ops = {
 static int system_heap_create(void)
 {
 	struct dma_heap_export_info exp_info;
+	int i;
+
+	for (i = 0; i < NUM_ORDERS; i++) {
+		struct page_pool_params pp;
+
+		memset(&pp, 0, sizeof(pp));
+		pp.order = orders[i];
+		pools[i] = page_pool_create(&pp);
+
+		if (IS_ERR(pools[i])) {
+			int j;
+
+			pr_err("%s: page pool creation failed!\n", __func__);
+			for (j = 0; j < i; j++)
+				page_pool_destroy(pools[j]);
+			return PTR_ERR(pools[i]);
+		}
+	}
 
 	exp_info.name = "system";
 	exp_info.ops = &system_heap_ops;