[net-next,v2,2/3] page_pool: support non-frag page for page_pool_alloc_frag()

Message ID 20230529092840.40413-3-linyunsheng@huawei.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series support non-frag page for page_pool_alloc_frag()

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 10 this patch: 10
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 10 this patch: 10
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 69 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yunsheng Lin May 29, 2023, 9:28 a.m. UTC
There is a performance penalty when using page frag support if the
user requests a larger frag size and a page can only serve one frag
user, see [1].

Since users may request different frag sizes depending on the MTU and
packet size, provide an option to allocate a non-frag page when a whole
page is not able to hold two frags, so that users have a unified
interface for memory allocation with the least memory utilization and
performance penalty.

1. https://lore.kernel.org/netdev/ZEU+vospFdm08IeE@localhost.localdomain/
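
A minimal sketch of the intended caller-side usage (illustrative only;
'pool', 'buf_size' and the error label are assumptions, not part of this
patch): the driver keeps a single call site and simply gets a whole page
back, with *offset set to 0, whenever two frags of the requested size
cannot share a page.

	struct page *page;
	unsigned int offset;
	void *va;

	page = page_pool_alloc_frag(pool, &offset, buf_size, GFP_ATOMIC);
	if (unlikely(!page))
		goto err_alloc;

	va = page_address(page) + offset;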

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
---
 net/core/page_pool.c | 47 +++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 18 deletions(-)

Comments

Alexander Duyck May 30, 2023, 3:07 p.m. UTC | #1
On Mon, 2023-05-29 at 17:28 +0800, Yunsheng Lin wrote:
> There is performance penalty with using page frag support when
> user requests a larger frag size and a page only supports one
> frag user, see [1].
> 
> It seems like user may request different frag size depending
> on the mtu and packet size, provide an option to allocate
> non-frag page when a whole page is not able to hold two frags,
> so that user has a unified interface for the memory allocation
> with least memory utilization and performance penalty.
> 
> 1. https://lore.kernel.org/netdev/ZEU+vospFdm08IeE@localhost.localdomain/
> 
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> CC: Lorenzo Bianconi <lorenzo@kernel.org>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> ---
>  net/core/page_pool.c | 47 +++++++++++++++++++++++++++-----------------
>  1 file changed, 29 insertions(+), 18 deletions(-)
> 
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 0868aa8f6323..e84ec6eabefd 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -699,14 +699,27 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>  	unsigned int max_size = PAGE_SIZE << pool->p.order;
>  	struct page *page = pool->frag_page;
>  
> -	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> -		    size > max_size))
> +	if (unlikely(size > max_size))
>  		return NULL;
>  
> +	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
> +		*offset = 0;
> +		return page_pool_alloc_pages(pool, gfp);
> +	}
> +

This is a recipe for pain. Rather than doing this I would say we should
stick with our existing behavior and not allow page pool fragments to
be used when the DMA address is consuming the region. Otherwise we are
going to make things very confusing.

If we have to have both versions I would much rather just have some
inline calls in the header wrapped in one #ifdef for
PAGE_POOL_DMA_USE_PP_FRAG_COUNT that are basically a wrapper for
page_pool pages treated as pp_frag.

>  	size = ALIGN(size, dma_get_cache_alignment());
> -	*offset = pool->frag_offset;
>  

If we are going to be allocating mono-frag pages they should be
allocated here based on the size check. That way we aren't disrupting
the performance for the smaller fragments and the code below could
function undisturbed.
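
A rough sketch of the ordering suggested here (an interpretation of the
comment above, not code from the patch): do the two-frags-per-page check
right after aligning the size and before pool->frag_page is touched, so
an oversized request never evicts an in-flight fragmented page.

	size = ALIGN(size, dma_get_cache_alignment());

	/* too big for two frags per page: hand out a whole page and
	 * leave pool->frag_page and the small-frag fast path alone
	 */
	if (unlikely(size << 1 > max_size)) {
		*offset = 0;
		return page_pool_alloc_pages(pool, gfp);
	}

	/* ... existing fragment handling continues below ... */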

> -	if (page && *offset + size > max_size) {
> +	if (page) {
> +		*offset = pool->frag_offset;
> +
> +		if (*offset + size <= max_size) {
> +			pool->frag_users++;
> +			pool->frag_offset = *offset + size;
> +			alloc_stat_inc(pool, fast);
> +			return page;
> +		}
> +
> +		pool->frag_page = NULL;
>  		page = page_pool_drain_frag(pool, page);
>  		if (page) {
>  			alloc_stat_inc(pool, fast);
> @@ -714,26 +727,24 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>  		}
>  	}
>  
> -	if (!page) {
> -		page = page_pool_alloc_pages(pool, gfp);
> -		if (unlikely(!page)) {
> -			pool->frag_page = NULL;
> -			return NULL;
> -		}
> -
> -		pool->frag_page = page;
> +	page = page_pool_alloc_pages(pool, gfp);
> +	if (unlikely(!page))
> +		return NULL;
>  
>  frag_reset:
> -		pool->frag_users = 1;
> +	/* return page as non-frag page if a page is not able to
> +	 * hold two frags for the current requested size.
> +	 */

This statement isn't exactly true since you make all page pool pages
into fragmented pages.


> +	if (unlikely(size << 1 > max_size)) {

This should happen much sooner so you aren't mixing these allocations
with the smaller ones and forcing the fragmented page to be evicted.

>  		*offset = 0;
> -		pool->frag_offset = size;
> -		page_pool_fragment_page(page, BIAS_MAX);
>  		return page;
>  	}
>  


> -	pool->frag_users++;
> -	pool->frag_offset = *offset + size;
> -	alloc_stat_inc(pool, fast);
> +	pool->frag_page = page;
> +	pool->frag_users = 1;
> +	*offset = 0;
> +	pool->frag_offset = size;
> +	page_pool_fragment_page(page, BIAS_MAX);
>  	return page;
>  }
>  EXPORT_SYMBOL(page_pool_alloc_frag);
Yunsheng Lin May 31, 2023, 12:19 p.m. UTC | #2
On 2023/5/30 23:07, Alexander H Duyck wrote:
...

>> +	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
>> +		*offset = 0;
>> +		return page_pool_alloc_pages(pool, gfp);
>> +	}
>> +
> 
> This is a recipe for pain. Rather than doing this I would say we should
> stick with our existing behavior and not allow page pool fragments to
> be used when the DMA address is consuming the region. Otherwise we are
> going to make things very confusing.

Are there any other concerns besides the confusion? We could add a
big comment to make it clear.

The point of adding that is to avoid the driver having to handle
PAGE_POOL_DMA_USE_PP_FRAG_COUNT itself when using page_pool_alloc_frag(),
with something like below:

if (!PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
	page = page_pool_alloc_frag()
else
	page = XXXXX;

Or do you prefer the driver handling it? Why?

> 
> If we have to have both version I would much rather just have some
> inline calls in the header wrapped in one #ifdef for
> PAGE_POOL_DMA_USE_PP_FRAG_COUNT that basically are a wrapper for
> page_pool pages treated as pp_frag.

Do you have a good name in mind for that wrapper?
In addition to the naming, which API should I use as a driver
author wanting to add page pool support?

> 
>>  	size = ALIGN(size, dma_get_cache_alignment());
>> -	*offset = pool->frag_offset;
>>  
> 
> If we are going to be allocating mono-frag pages they should be
> allocated here based on the size check. That way we aren't discrupting
> the performance for the smaller fragments and the code below could
> function undisturbed.

It is to allow a possible optimization, see below.

> 
>> -	if (page && *offset + size > max_size) {
>> +	if (page) {
>> +		*offset = pool->frag_offset;
>> +
>> +		if (*offset + size <= max_size) {
>> +			pool->frag_users++;
>> +			pool->frag_offset = *offset + size;
>> +			alloc_stat_inc(pool, fast);
>> +			return page;

Note that we still allow a frag page here even when '(size << 1 > max_size)'.

>> +		}
>> +
>> +		pool->frag_page = NULL;
>>  		page = page_pool_drain_frag(pool, page);
>>  		if (page) {
>>  			alloc_stat_inc(pool, fast);
>> @@ -714,26 +727,24 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>>  		}
>>  	}
>>  
>> -	if (!page) {
>> -		page = page_pool_alloc_pages(pool, gfp);
>> -		if (unlikely(!page)) {
>> -			pool->frag_page = NULL;
>> -			return NULL;
>> -		}
>> -
>> -		pool->frag_page = page;
>> +	page = page_pool_alloc_pages(pool, gfp);
>> +	if (unlikely(!page))
>> +		return NULL;
>>  
>>  frag_reset:
>> -		pool->frag_users = 1;
>> +	/* return page as non-frag page if a page is not able to
>> +	 * hold two frags for the current requested size.
>> +	 */
> 
> This statement ins't exactly true since you make all page pool pages
> into fragmented pages.

Any suggestion for describing it more accurately?
I wrote it thinking of a frag_count of one as a non-frag page.

> 
> 
>> +	if (unlikely(size << 1 > max_size)) {
> 
> This should happen much sooner so you aren't mixing these allocations
> with the smaller ones and forcing the fragmented page to be evicted.

As mentioned above, it is to allow a possible optimization.
Yunsheng Lin June 1, 2023, 12:21 p.m. UTC | #3
On 2023/5/31 20:19, Yunsheng Lin wrote:
> On 2023/5/30 23:07, Alexander H Duyck wrote:

Hi, Alexander
    Any more comments or concerns?
    I feel like we are circling back to v1 on whether it is better
to add a new wrapper/API or not and where to do the "(size << 1 > max_size)"
checking. I would really like to continue the discussion here instead of
starting over in a new thread when I post v3, thanks.

> ...
> 
>>> +	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
>>> +		*offset = 0;
>>> +		return page_pool_alloc_pages(pool, gfp);
>>> +	}
>>> +
>>
>> This is a recipe for pain. Rather than doing this I would say we should
>> stick with our existing behavior and not allow page pool fragments to
>> be used when the DMA address is consuming the region. Otherwise we are
>> going to make things very confusing.
> 
> Are there any other concern other than confusing? we could add a
> big comment to make it clear.
> 
> The point of adding that is to avoid the driver handling the
> PAGE_POOL_DMA_USE_PP_FRAG_COUNT when using page_pool_alloc_frag()
> like something like below:
> 
> if (!PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
> 	page = page_pool_alloc_frag()
> else
> 	page = XXXXX;
> 
> Or do you perfer the driver handling it? why?
> 
>>
>> If we have to have both version I would much rather just have some
>> inline calls in the header wrapped in one #ifdef for
>> PAGE_POOL_DMA_USE_PP_FRAG_COUNT that basically are a wrapper for
>> page_pool pages treated as pp_frag.
> 
> Do you have a good name in mind for that wrapper.
> In addition to the naming, which API should I use when I am a driver
> author wanting to add page pool support?
> 
>>
>>>  	size = ALIGN(size, dma_get_cache_alignment());
>>> -	*offset = pool->frag_offset;
>>>  
>>
>> If we are going to be allocating mono-frag pages they should be
>> allocated here based on the size check. That way we aren't discrupting
>> the performance for the smaller fragments and the code below could
>> function undisturbed.
> 
> It is to allow possible optimization as below.
> 
>>
>>> -	if (page && *offset + size > max_size) {
>>> +	if (page) {
>>> +		*offset = pool->frag_offset;
>>> +
>>> +		if (*offset + size <= max_size) {
>>> +			pool->frag_users++;
>>> +			pool->frag_offset = *offset + size;
>>> +			alloc_stat_inc(pool, fast);
>>> +			return page;
> 
> Note that we still allow frag page here when '(size << 1 > max_size)'.
> 
>>> +		}
>>> +
>>> +		pool->frag_page = NULL;
>>>  		page = page_pool_drain_frag(pool, page);
>>>  		if (page) {
>>>  			alloc_stat_inc(pool, fast);
>>> @@ -714,26 +727,24 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>>>  		}
>>>  	}
>>>  
>>> -	if (!page) {
>>> -		page = page_pool_alloc_pages(pool, gfp);
>>> -		if (unlikely(!page)) {
>>> -			pool->frag_page = NULL;
>>> -			return NULL;
>>> -		}
>>> -
>>> -		pool->frag_page = page;
>>> +	page = page_pool_alloc_pages(pool, gfp);
>>> +	if (unlikely(!page))
>>> +		return NULL;
>>>  
>>>  frag_reset:
>>> -		pool->frag_users = 1;
>>> +	/* return page as non-frag page if a page is not able to
>>> +	 * hold two frags for the current requested size.
>>> +	 */
>>
>> This statement ins't exactly true since you make all page pool pages
>> into fragmented pages.
> 
> Any suggestion to describe it more accurately?
> I wrote that thinking frag_count being one as non-frag page.
> 
>>
>>
>>> +	if (unlikely(size << 1 > max_size)) {
>>
>> This should happen much sooner so you aren't mixing these allocations
>> with the smaller ones and forcing the fragmented page to be evicted.
> 
> As mentioned above, it is to allow a possible optimization
> 
> 
> .
>
Alexander Duyck June 1, 2023, 6:14 p.m. UTC | #4
On Wed, May 31, 2023 at 5:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/5/30 23:07, Alexander H Duyck wrote:
> ...
>
> >> +    if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
> >> +            *offset = 0;
> >> +            return page_pool_alloc_pages(pool, gfp);
> >> +    }
> >> +
> >
> > This is a recipe for pain. Rather than doing this I would say we should
> > stick with our existing behavior and not allow page pool fragments to
> > be used when the DMA address is consuming the region. Otherwise we are
> > going to make things very confusing.
>
> Are there any other concern other than confusing? we could add a
> big comment to make it clear.
>
> The point of adding that is to avoid the driver handling the
> PAGE_POOL_DMA_USE_PP_FRAG_COUNT when using page_pool_alloc_frag()
> like something like below:
>
> if (!PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
>         page = page_pool_alloc_frag()
> else
>         page = XXXXX;
>
> Or do you perfer the driver handling it? why?
>
> >
> > If we have to have both version I would much rather just have some
> > inline calls in the header wrapped in one #ifdef for
> > PAGE_POOL_DMA_USE_PP_FRAG_COUNT that basically are a wrapper for
> > page_pool pages treated as pp_frag.
>
> Do you have a good name in mind for that wrapper.
> In addition to the naming, which API should I use when I am a driver
> author wanting to add page pool support?

When I have to deal with these sorts of things I usually just rename
the original with a leading underscore or two and then just name the
inline the same as the original function.
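
Combining this with the #ifdef idea from the first reply, a minimal
sketch of the header-side wrapper (the double-underscore name is just
the naming convention above applied here, not an existing symbol; since
PAGE_POOL_DMA_USE_PP_FRAG_COUNT is a compile-time constant, a plain if
folds away much like an #ifdef would):

	/* in include/net/page_pool.h (sketch) */
	struct page *__page_pool_alloc_frag(struct page_pool *pool,
					    unsigned int *offset,
					    unsigned int size, gfp_t gfp);

	static inline struct page *page_pool_alloc_frag(struct page_pool *pool,
							unsigned int *offset,
							unsigned int size,
							gfp_t gfp)
	{
		if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
			/* pp_frag_count is unavailable: whole pages only */
			*offset = 0;
			return page_pool_alloc_pages(pool, gfp);
		}

		return __page_pool_alloc_frag(pool, offset, size, gfp);
	}

The driver-facing name stays the same either way, which is the point of
the underscore rename.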

> >
> >>      size = ALIGN(size, dma_get_cache_alignment());
> >> -    *offset = pool->frag_offset;
> >>
> >
> > If we are going to be allocating mono-frag pages they should be
> > allocated here based on the size check. That way we aren't discrupting
> > the performance for the smaller fragments and the code below could
> > function undisturbed.
>
> It is to allow possible optimization as below.

What optimization? From what I can tell you are taking extra steps for
non-page pool pages.

> >
> >> -    if (page && *offset + size > max_size) {
> >> +    if (page) {
> >> +            *offset = pool->frag_offset;
> >> +
> >> +            if (*offset + size <= max_size) {
> >> +                    pool->frag_users++;
> >> +                    pool->frag_offset = *offset + size;
> >> +                    alloc_stat_inc(pool, fast);
> >> +                    return page;
>
> Note that we still allow frag page here when '(size << 1 > max_size)'.

You are creating what I call a mono-frag. I am not a huge fan.

> >> +            }
> >> +
> >> +            pool->frag_page = NULL;
> >>              page = page_pool_drain_frag(pool, page);
> >>              if (page) {
> >>                      alloc_stat_inc(pool, fast);
> >> @@ -714,26 +727,24 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
> >>              }
> >>      }
> >>
> >> -    if (!page) {
> >> -            page = page_pool_alloc_pages(pool, gfp);
> >> -            if (unlikely(!page)) {
> >> -                    pool->frag_page = NULL;
> >> -                    return NULL;
> >> -            }
> >> -
> >> -            pool->frag_page = page;
> >> +    page = page_pool_alloc_pages(pool, gfp);
> >> +    if (unlikely(!page))
> >> +            return NULL;
> >>
> >>  frag_reset:
> >> -            pool->frag_users = 1;
> >> +    /* return page as non-frag page if a page is not able to
> >> +     * hold two frags for the current requested size.
> >> +     */
> >
> > This statement ins't exactly true since you make all page pool pages
> > into fragmented pages.
>
> Any suggestion to describe it more accurately?
> I wrote that thinking frag_count being one as non-frag page.

I wouldn't consider that to be the case. The problem is that if frag
count == 1 then you have a fragmented page. It is no different from a
page where you have already freed the earlier instances.

> >
> >
> >> +    if (unlikely(size << 1 > max_size)) {
> >
> > This should happen much sooner so you aren't mixing these allocations
> > with the smaller ones and forcing the fragmented page to be evicted.
>
> As mentioned above, it is to allow a possible optimization

Maybe you should point out exactly what you think the optimization is.
I don't see it as such. If you are going to evict anything that has a
size that is over half your max_size then you might as well just skip
using this entirely and just output a non-fragmented/mono frag page
rather than evicting the previously fragmented page.
Yunsheng Lin June 2, 2023, 12:23 p.m. UTC | #5
On 2023/6/2 2:14, Alexander Duyck wrote:
> On Wed, May 31, 2023 at 5:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

...

>>
>>>
>>> If we have to have both version I would much rather just have some
>>> inline calls in the header wrapped in one #ifdef for
>>> PAGE_POOL_DMA_USE_PP_FRAG_COUNT that basically are a wrapper for
>>> page_pool pages treated as pp_frag.
>>
>> Do you have a good name in mind for that wrapper.
>> In addition to the naming, which API should I use when I am a driver
>> author wanting to add page pool support?
> 
> When I usually have to deal with these sort of things I just rename
> the original with a leading underscore or two and then just name the
> inline the same as the original function.

Ok, will follow the pattern if it is really necessary.

> 
>>>
>>>>      size = ALIGN(size, dma_get_cache_alignment());
>>>> -    *offset = pool->frag_offset;
>>>>
>>>
>>> If we are going to be allocating mono-frag pages they should be
>>> allocated here based on the size check. That way we aren't discrupting
>>> the performance for the smaller fragments and the code below could
>>> function undisturbed.
>>
>> It is to allow possible optimization as below.
> 
> What optimization? From what I can tell you are taking extra steps for
> non-page pool pages.

I will talk about the optimization later.

According to my definition in this patchset:
frag page: page allocated from page_pool_alloc_frag() with page->pp_frag_count
           being greater than one.
non-frag page: page allocated and returned from either page_pool_alloc_frag()
               or page_pool_alloc_pages() with page->pp_frag_count being one.

I assume the above 'non-page pool pages' refers to what I call a 'non-frag
page' allocated and returned from page_pool_alloc_frag(), right? And is it
still about doing the '(size << 1 > max_size)' checking at the beginning
instead of in the middle as it is now, to avoid extra steps for the
'non-frag page' case?

> 
>>>
>>>> -    if (page && *offset + size > max_size) {
>>>> +    if (page) {
>>>> +            *offset = pool->frag_offset;
>>>> +
>>>> +            if (*offset + size <= max_size) {
>>>> +                    pool->frag_users++;
>>>> +                    pool->frag_offset = *offset + size;
>>>> +                    alloc_stat_inc(pool, fast);
>>>> +                    return page;
>>
>> Note that we still allow frag page here when '(size << 1 > max_size)'.

This is the optimization I was talking about: suppose we start
from a clean state with a 64K page size. If page_pool_alloc_frag()
is called with size being 2K and then 34K, we only need one page
to satisfy the caller's need, as we do the '*offset + size > max_size'
checking before the '(size << 1 > max_size)' checking.

As you mentioned below, it is at the cost of evicting the previously
fragmented page. I thought about keeping it when implementing, but I
am not sure evicting it really matters if the previously fragmented
page does not pass the '*offset + size > max_size' test.

Or maybe we should adjust the code a little bit, as below, to allow the
optimization I mentioned without the cost of evicting the previously
fragmented page?

struct page *page_pool_alloc_frag(struct page_pool *pool,
                                  unsigned int *offset,
                                  unsigned int size, gfp_t gfp)
{
        unsigned int max_size = PAGE_SIZE << pool->p.order;
        struct page *page = pool->frag_page;

        if (unlikely(size > max_size))
                return NULL;

        if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
                goto alloc_non_frag;

        size = ALIGN(size, dma_get_cache_alignment());

        if (page && pool->frag_offset + size <= max_size) {
                *offset = pool->frag_offset;
                pool->frag_users++;
                pool->frag_offset += size;
                alloc_stat_inc(pool, fast);
                return page;
        }

        if (unlikely((size << 1) > max_size))
                goto alloc_non_frag;

        if (page) {
                page = page_pool_drain_frag(pool, page);
                if (page) {
                        alloc_stat_inc(pool, fast);
                        goto frag_reset;
                }
        }

        page = page_pool_alloc_pages(pool, gfp);
        if (unlikely(!page))
                return NULL;

        pool->frag_page = page;
frag_reset:
        pool->frag_users = 1;
        *offset = 0;
        pool->frag_offset = size;
        page_pool_fragment_page(page, BIAS_MAX);
        return page;

alloc_non_frag:
        *offset = 0;
        return page_pool_alloc_pages(pool, gfp);
}

> 
> You are creating what I call a mono-frag. I am not a huge fan.

Do you mean by 'mono-frag' the unifying of frag and non-frag
page handling by assuming all pages in the page pool have one
frag user initially, as in patch 1?
If yes, let's continue the discussion in patch 1 so that
we don't have to restart it again.

Or is there some other obvious concern about 'mono-frag' that I missed?

> 
>>>> +            }
>>>> +
>>>> +            pool->frag_page = NULL;
>>>>              page = page_pool_drain_frag(pool, page);
>>>>              if (page) {
>>>>                      alloc_stat_inc(pool, fast);
>>>> @@ -714,26 +727,24 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>>>>              }
>>>>      }
>>>>
>>>> -    if (!page) {
>>>> -            page = page_pool_alloc_pages(pool, gfp);
>>>> -            if (unlikely(!page)) {
>>>> -                    pool->frag_page = NULL;
>>>> -                    return NULL;
>>>> -            }
>>>> -
>>>> -            pool->frag_page = page;
>>>> +    page = page_pool_alloc_pages(pool, gfp);
>>>> +    if (unlikely(!page))
>>>> +            return NULL;
>>>>
>>>>  frag_reset:
>>>> -            pool->frag_users = 1;
>>>> +    /* return page as non-frag page if a page is not able to
>>>> +     * hold two frags for the current requested size.
>>>> +     */
>>>
>>> This statement ins't exactly true since you make all page pool pages
>>> into fragmented pages.
>>
>> Any suggestion to describe it more accurately?
>> I wrote that thinking frag_count being one as non-frag page.
> 
> I wouldn't consider that to be the case. The problem is if frag count
> == 1 then you have a fragmented page. It is no different from a page
> where you had either freed earlier instances.

It seems you still have some concerns about the unifying in patch 1;
please go and comment there on why it is not OK to treat
'frag count == 1' as a non-frag page.

> 
>>>
>>>
>>>> +    if (unlikely(size << 1 > max_size)) {
>>>
>>> This should happen much sooner so you aren't mixing these allocations
>>> with the smaller ones and forcing the fragmented page to be evicted.
>>
>> As mentioned above, it is to allow a possible optimization
> 
> Maybe you should point out exactly what you think the optimization is.
> I don't see it as such. If you are going to evict anything that has a
> size that is over half your max_size then you might as well just skip
> using this entirely and just output a non-fragmented/mono frag page
> rather than evicting the previously fragmented page.

As explained above.

> .
>
Alexander Duyck June 2, 2023, 3:57 p.m. UTC | #6
On Fri, Jun 2, 2023 at 5:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/6/2 2:14, Alexander Duyck wrote:
> > On Wed, May 31, 2023 at 5:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> ...
>
> >>
> >>>
> >>> If we have to have both version I would much rather just have some
> >>> inline calls in the header wrapped in one #ifdef for
> >>> PAGE_POOL_DMA_USE_PP_FRAG_COUNT that basically are a wrapper for
> >>> page_pool pages treated as pp_frag.
> >>
> >> Do you have a good name in mind for that wrapper.
> >> In addition to the naming, which API should I use when I am a driver
> >> author wanting to add page pool support?
> >
> > When I usually have to deal with these sort of things I just rename
> > the original with a leading underscore or two and then just name the
> > inline the same as the original function.
>
> Ok, will follow the pattern if it is really necessary.
>
> >
> >>>
> >>>>      size = ALIGN(size, dma_get_cache_alignment());
> >>>> -    *offset = pool->frag_offset;
> >>>>
> >>>
> >>> If we are going to be allocating mono-frag pages they should be
> >>> allocated here based on the size check. That way we aren't discrupting
> >>> the performance for the smaller fragments and the code below could
> >>> function undisturbed.
> >>
> >> It is to allow possible optimization as below.
> >
> > What optimization? From what I can tell you are taking extra steps for
> > non-page pool pages.
>
> I will talk about the optimization later.
>
> According to my defination in this patchset:
> frag page: page alloced from page_pool_alloc_frag() with page->pp_frag_count
>            being greater than one.
> non-frag page:page alloced return from both page_pool_alloc_frag() and
>               page_pool_alloc_pages() with page->pp_frag_count being one.
>
> I assume the above 'non-page pool pages' refer to what I call as 'non-frag
> page' alloced return from both page_pool_alloc_frag(), right? And it is
> still about doing the (size << 1 > max_size)' checking at the begin instead
> of at the middle right now to avoid extra steps for 'non-frag page' case?

Yeah, the non-page pool pages I was referring to were your mono-frag pages.

> >
> >>>
> >>>> -    if (page && *offset + size > max_size) {
> >>>> +    if (page) {
> >>>> +            *offset = pool->frag_offset;
> >>>> +
> >>>> +            if (*offset + size <= max_size) {
> >>>> +                    pool->frag_users++;
> >>>> +                    pool->frag_offset = *offset + size;
> >>>> +                    alloc_stat_inc(pool, fast);
> >>>> +                    return page;
> >>
> >> Note that we still allow frag page here when '(size << 1 > max_size)'.
>
> This is the optimization I was taking about: suppose we start
> from a clean state with 64K page size, if page_pool_alloc_frag()
> is called with size being 2K and then 34K, we only need one page
> to satisfy caller's need as we do the '*offset + size > max_size'
> checking before the '(size << 1 > max_size)' checking.

The issue is the unaccounted-for waste. We are supposed to know the
general size of the frags being used so we can compute truesize. If,
for example, you are using an order 3 page and you are splitting it
between a 2K and a 17K fragment, the 2K fragments will have a massive
truesize underestimate that can lead to memory issues if those smaller
fragments end up holding onto the pages.

As such we should try to keep the small fragments away from anything
larger than half of the page.

> As you mentioned below, it is at the cost of evicting the previously
> fragmented page, I thought about keeping it when implementing, but I
> am not sure evicting it is really matter if the previously fragmented
> page does not pass the testing by '*offset + size > max_size'?
>
> Or maybe we should adjust the code a litte bit as below to allow the
> optimization I mentioned without the cost of evicting the previously
> fragmented page?

Really we should just not be mixing smaller fragments w/ larger ones
for the above truesize reasons.

> struct page *page_pool_alloc_frag(struct page_pool *pool,
>                                   unsigned int *offset,
>                                   unsigned int size, gfp_t gfp)
> {
>         unsigned int max_size = PAGE_SIZE << pool->p.order;
>         struct page *page = pool->frag_page;
>
>         if (unlikely(size > max_size))
>                 return NULL;
>
>         if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
>                 goto alloc_non_frag;
>
>         size = ALIGN(size, dma_get_cache_alignment());
>
>         if (page && pool->frag_offset <= max_size) {
>                 *offset = pool->frag_offset;
>                 pool->frag_users++;
>                 pool->frag_offset += size;
>                 alloc_stat_inc(pool, fast);
>                 return page;
>         }
>
>         if (unlikely((size << 1) > max_size))
>                 goto alloc_non_frag;
>
>         if (page) {
>                 page = page_pool_drain_frag(pool, page);
>                 if (page) {
>                         alloc_stat_inc(pool, fast);
>                         goto frag_reset;
>                 }
>         }
>
>         page = page_pool_alloc_pages(pool, gfp);
>         if (unlikely(!page))
>                 return NULL;
>
>         pool->frag_page = page;
> frag_reset:
>         pool->frag_users = 1;
>         *offset = 0;
>         pool->frag_offset = size;
>         page_pool_fragment_page(page, BIAS_MAX);
>         return page;
>
> alloc_non_frag:
>         *offset = 0;
>         return page_pool_alloc_pages(pool, gfp);
> }
>
> >
> > You are creating what I call a mono-frag. I am not a huge fan.
>
> Do you mean 'mono-frag' as the unifying of frag and non-frag
> page handling by assuming all pages in page pool having one
> frag user initially in patch 1?
> If yes, please let's continue the discussion in pacth 1 so that
> we don't have to restart the discussion again.
>
> Or is there some other obvious concern about 'mono-frag' I missed?

The main concern I have is having mono-frags and truly fragmented
pages mixing too much, which makes a bit of a mess of things.

If we are going to use these fragments we really need to be able to
tell the truesize of these mono-frag pages. What I want to avoid is
seeing drivers abuse this to allocate huge swaths of memory without
accounting for it in the skb truesize.
Yunsheng Lin June 3, 2023, 4:20 a.m. UTC | #7
On 2023/6/2 23:57, Alexander Duyck wrote:
> On Fri, Jun 2, 2023 at 5:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

...

>>
>> According to my defination in this patchset:
>> frag page: page alloced from page_pool_alloc_frag() with page->pp_frag_count
>>            being greater than one.
>> non-frag page:page alloced return from both page_pool_alloc_frag() and
>>               page_pool_alloc_pages() with page->pp_frag_count being one.
>>
>> I assume the above 'non-page pool pages' refer to what I call as 'non-frag
>> page' alloced return from both page_pool_alloc_frag(), right? And it is
>> still about doing the (size << 1 > max_size)' checking at the begin instead
>> of at the middle right now to avoid extra steps for 'non-frag page' case?
> 
> Yeah, the non-page I was referring to were you mono-frag pages.

I was using 'frag page' and 'non-frag page' per the definition above,
and you were using 'mono-frag' mostly and 'non-page' sometimes.
I am really confused by them, as I felt like I got what they meant and
then I was lost when you used them in the next comment. I really hope
that you could describe in more detail what you mean by
'mono-frag pages' and 'non-page', so that we can choose the right
naming to continue the discussion without further misunderstanding
and confusion.

> 
>>>
>>>>>
>>>>>> -    if (page && *offset + size > max_size) {
>>>>>> +    if (page) {
>>>>>> +            *offset = pool->frag_offset;
>>>>>> +
>>>>>> +            if (*offset + size <= max_size) {
>>>>>> +                    pool->frag_users++;
>>>>>> +                    pool->frag_offset = *offset + size;
>>>>>> +                    alloc_stat_inc(pool, fast);
>>>>>> +                    return page;
>>>>
>>>> Note that we still allow frag page here when '(size << 1 > max_size)'.
>>
>> This is the optimization I was taking about: suppose we start
>> from a clean state with 64K page size, if page_pool_alloc_frag()
>> is called with size being 2K and then 34K, we only need one page
>> to satisfy caller's need as we do the '*offset + size > max_size'
>> checking before the '(size << 1 > max_size)' checking.
> 
> The issue is the unaccounted for waste. We are supposed to know the
> general size of the frags being used so we can compute truesize. If

Note that, for the case of veth and virtio_net, the driver may only know
the current frag size when calling page_pool_alloc_frag(); it does not
know what size the next frags will be, so how exactly are we going to
compute the truesize for cases with different frag sizes? As far as I
can tell, we can only do something like virtio_net does with 'page_frag'
for the last frag, as below; for the other frags, the truesize may need
to take the alignment requirement into account:
https://elixir.bootlin.com/linux/v6.3.5/source/drivers/net/virtio_net.c#L1638

> for example you are using an order 3 page and you are splitting it
> between a 2K and a 17K fragment the 2K fragments will have a massive
> truesize underestimate that can lead to memory issues if those smaller
> fragments end up holding onto the pages.
> 
> As such we should try to keep the small fragments away from anything
> larger than half of the page.

IMHO, doing the above only alleviates the problem. How is the above
splitting different from splitting the page evenly into 16 2K fragments?
When 15 frags are released, we still have the last 2K fragment holding
onto 32K of memory; doesn't that also cause a massive truesize
underestimate? Not to mention systems with a 64K page size.

In the RFC patch below, 'page_pool_frag' is used to report the truesize,
but I was thinking that both 'page_frag' and 'page_frag_cache' have a
similar problem, so I dropped it in V1 and left that as a future
improvement.

I can pick it up again if 'truesize' is really the concern, but we have
to align on how to compute the truesize here first.

https://patchwork.kernel.org/project/netdevbpf/patch/20230516124801.2465-4-linyunsheng@huawei.com/
Alexander Duyck June 5, 2023, 3:08 p.m. UTC | #8
On Fri, Jun 2, 2023 at 9:20 PM Yunsheng Lin <yunshenglin0825@gmail.com> wrote:
>
> On 2023/6/2 23:57, Alexander Duyck wrote:
> > On Fri, Jun 2, 2023 at 5:23 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> ...
>
> >>
> >> According to my defination in this patchset:
> >> frag page: page alloced from page_pool_alloc_frag() with page->pp_frag_count
> >>            being greater than one.
> >> non-frag page:page alloced return from both page_pool_alloc_frag() and
> >>               page_pool_alloc_pages() with page->pp_frag_count being one.
> >>
> >> I assume the above 'non-page pool pages' refer to what I call as 'non-frag
> >> page' alloced return from both page_pool_alloc_frag(), right? And it is
> >> still about doing the (size << 1 > max_size)' checking at the begin instead
> >> of at the middle right now to avoid extra steps for 'non-frag page' case?
> >
> > Yeah, the non-page I was referring to were you mono-frag pages.
>
> I was using 'frag page' and 'non-frag page' per the defination above,
> and you were using 'mono-frag' mostly and 'non-page' sometimes.
> I am really confused by them as I felt like I got what they meant and
> then I was lost when you used them in the next comment. I really hope
> that you could describe what do you mean in more detailed by using
> 'mono-frag pages' and 'non-page', so that we can choose the right
> naming to continue the discussion without further misunderstanding
> and confusion.

I will try to be consistent about this going forward:
non-fragmented - legacy page pool w/o page frags
mono-frag - after this patch, page pool w/o frags
fragmented - before/after this patch w/ frags

> >
> >>>
> >>>>>
> >>>>>> -    if (page && *offset + size > max_size) {
> >>>>>> +    if (page) {
> >>>>>> +            *offset = pool->frag_offset;
> >>>>>> +
> >>>>>> +            if (*offset + size <= max_size) {
> >>>>>> +                    pool->frag_users++;
> >>>>>> +                    pool->frag_offset = *offset + size;
> >>>>>> +                    alloc_stat_inc(pool, fast);
> >>>>>> +                    return page;
> >>>>
> >>>> Note that we still allow frag page here when '(size << 1 > max_size)'.
> >>
> >> This is the optimization I was taking about: suppose we start
> >> from a clean state with 64K page size, if page_pool_alloc_frag()
> >> is called with size being 2K and then 34K, we only need one page
> >> to satisfy caller's need as we do the '*offset + size > max_size'
> >> checking before the '(size << 1 > max_size)' checking.
> >
> > The issue is the unaccounted for waste. We are supposed to know the
> > general size of the frags being used so we can compute truesize. If
>
> Note, for case of veth and virtio_net, the driver may only know the
> current frag size when calling page_pool_alloc_frag(), it does not
> konw what is the size of the frags will be used next time, how exactly
> are we going to compute the truesize for cases with different frag
> size?  As far as I can tell, we may only do something like virtio_net
> is doing with 'page_frag' for the last frag as below, for other frags,
> the truesize may need to take accounting to the aligning requirement:
> https://elixir.bootlin.com/linux/v6.3.5/source/drivers/net/virtio_net.c#L1638

Yeah, that is more-or-less what I am getting at. This is why drivers
will tend to want to allocate a mono-frag themselves and then take
care of adding additional fragmentation as needed. If you are
abstracting that away from the driver then it makes it much harder to
track that truesize.
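
A hedged sketch of that driver-side pattern (illustrative only; it
assumes a pool created with PP_FLAG_PAGE_FRAG, and 'rx_buf_size' is a
made-up driver parameter): the driver takes a whole page, declares up
front how many references it will hand out, and keeps the offset and
truesize bookkeeping itself.

	struct page *page = page_pool_alloc_pages(pool, GFP_ATOMIC);
	unsigned int nr_bufs = (PAGE_SIZE << pool->p.order) / rx_buf_size;

	if (unlikely(!page))
		return NULL;

	/* the driver owns the split, so buffer i sits at offset
	 * i * rx_buf_size and can report a truthful truesize
	 */
	page_pool_fragment_page(page, nr_bufs);

Each buffer is then released individually (for example via
page_pool_put_page()), and the page goes back to the pool once the last
reference is dropped.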

> > for example you are using an order 3 page and you are splitting it
> > between a 2K and a 17K fragment the 2K fragments will have a massive
> > truesize underestimate that can lead to memory issues if those smaller
> > fragments end up holding onto the pages.
> >
> > As such we should try to keep the small fragments away from anything
> > larger than half of the page.
>
> IMHO, doing the above only alleviate the problem. How is above splitting
> different from splitting it evently with 16 2K fragments, and when 15 frag
> is released, we still have the last 2K fragment holding onto 32K memory,
> doesn't that also cause massive truesize underestimate? Not to mention that
> for system with 64K page size.

Yes, that is a known issue. That is why I don't want us to further
exacerbate it.

> In RFC patch below, 'page_pool_frag' is used to report the truesize, but
> I was thinking both 'page_frag' and 'page_frag_cache' both have a similiar
> problem, so I dropped it in V1 and left that as a future improvement.
>
> I can pick it up again if 'truesize' is really the concern, but we have to
> align on how to compute the truesize here first.
>
> https://patchwork.kernel.org/project/netdevbpf/patch/20230516124801.2465-4-linyunsheng@huawei.com/

I am assuming this is the same one you mentioned in the other patch.
As I said, the problem is that the remainder is being ignored. The
truesize logic should be pushed to the drivers; that is one of the
reasons the expectation is that either the driver will use something
like a fixed constant size if it is using the raw page pool fragments,
or, if it is going to do random-sized chunks, it will track the size of
the chunks of the page it is using and assign the remainders to the
last fragment used in a given page.
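
As a hedged illustration of that bookkeeping (the structure and helper
below are made up for this sketch, not an existing API): the driver
tracks how much of the current page it has consumed and charges
whatever is left over to the frag that turns out to be the last one cut
from that page.

	struct my_rx_page_state {
		unsigned int used;	/* bytes already carved from the page */
		unsigned int max;	/* PAGE_SIZE << pool->p.order */
	};

	static unsigned int my_rx_frag_truesize(struct my_rx_page_state *st,
						unsigned int size)
	{
		unsigned int truesize = size;

		st->used += size;

		/* no room left for another frag of this size, so this
		 * frag is the last user and absorbs the remainder
		 */
		if (st->used + size > st->max)
			truesize += st->max - st->used;

		return truesize;
	}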

Patch

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 0868aa8f6323..e84ec6eabefd 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -699,14 +699,27 @@  struct page *page_pool_alloc_frag(struct page_pool *pool,
 	unsigned int max_size = PAGE_SIZE << pool->p.order;
 	struct page *page = pool->frag_page;
 
-	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-		    size > max_size))
+	if (unlikely(size > max_size))
 		return NULL;
 
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
+		*offset = 0;
+		return page_pool_alloc_pages(pool, gfp);
+	}
+
 	size = ALIGN(size, dma_get_cache_alignment());
-	*offset = pool->frag_offset;
 
-	if (page && *offset + size > max_size) {
+	if (page) {
+		*offset = pool->frag_offset;
+
+		if (*offset + size <= max_size) {
+			pool->frag_users++;
+			pool->frag_offset = *offset + size;
+			alloc_stat_inc(pool, fast);
+			return page;
+		}
+
+		pool->frag_page = NULL;
 		page = page_pool_drain_frag(pool, page);
 		if (page) {
 			alloc_stat_inc(pool, fast);
@@ -714,26 +727,24 @@  struct page *page_pool_alloc_frag(struct page_pool *pool,
 		}
 	}
 
-	if (!page) {
-		page = page_pool_alloc_pages(pool, gfp);
-		if (unlikely(!page)) {
-			pool->frag_page = NULL;
-			return NULL;
-		}
-
-		pool->frag_page = page;
+	page = page_pool_alloc_pages(pool, gfp);
+	if (unlikely(!page))
+		return NULL;
 
 frag_reset:
-		pool->frag_users = 1;
+	/* return page as non-frag page if a page is not able to
+	 * hold two frags for the current requested size.
+	 */
+	if (unlikely(size << 1 > max_size)) {
 		*offset = 0;
-		pool->frag_offset = size;
-		page_pool_fragment_page(page, BIAS_MAX);
 		return page;
 	}
 
-	pool->frag_users++;
-	pool->frag_offset = *offset + size;
-	alloc_stat_inc(pool, fast);
+	pool->frag_page = page;
+	pool->frag_users = 1;
+	*offset = 0;
+	pool->frag_offset = size;
+	page_pool_fragment_page(page, BIAS_MAX);
 	return page;
 }
 EXPORT_SYMBOL(page_pool_alloc_frag);