diff mbox series

[net-next,v2,05/15] mm: page_frag: use initial zero offset for page_frag_alloc_align()

Message ID 20240415131941.51153-6-linyunsheng@huawei.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series First try to replace page_frag with page_frag_cache | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 929 this patch: 929
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 2 of 2 maintainers
netdev/build_clang success Errors and warnings before: 939 this patch: 939
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 940 this patch: 940
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 69 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yunsheng Lin April 15, 2024, 1:19 p.m. UTC
We are above to use page_frag_alloc_*() API to not just
allocate memory for skb->data, but also use them to do
the memory allocation for skb frag too. Currently the
implementation of page_frag in mm subsystem is running
the offset as a countdown rather than count-up value,
there may have several advantages to that as mentioned
in [1], but it may have some disadvantages, for example,
it may disable skb frag coaleasing and more correct cache
prefetching

We have a trade-off to make in order to have a unified
implementation and API for page_frag, so use a initial zero
offset in this patch, and the following patch will try to
make some optimization to aovid the disadvantages as much
as possible.

1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/page_frag_cache.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

Comments

Alexander Duyck April 15, 2024, 11:55 p.m. UTC | #1
On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
> We are above to use page_frag_alloc_*() API to not just
> allocate memory for skb->data, but also use them to do
> the memory allocation for skb frag too. Currently the
> implementation of page_frag in mm subsystem is running
> the offset as a countdown rather than count-up value,
> there may have several advantages to that as mentioned
> in [1], but it may have some disadvantages, for example,
> it may disable skb frag coaleasing and more correct cache
> prefetching
> 
> We have a trade-off to make in order to have a unified
> implementation and API for page_frag, so use a initial zero
> offset in this patch, and the following patch will try to
> make some optimization to aovid the disadvantages as much
> as possible.
> 
> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  mm/page_frag_cache.c | 31 ++++++++++++++-----------------
>  1 file changed, 14 insertions(+), 17 deletions(-)
> 
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 64993b5d1243..dc864ee09536 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -65,9 +65,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  			      unsigned int fragsz, gfp_t gfp_mask,
>  			      unsigned int align_mask)
>  {
> -	unsigned int size = PAGE_SIZE;
> +	unsigned int size, offset;
>  	struct page *page;
> -	int offset;
>  
>  	if (unlikely(!nc->va)) {
>  refill:
> @@ -75,10 +74,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  		if (!page)
>  			return NULL;
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -		/* if size can vary use size else just use PAGE_SIZE */
> -		size = nc->size;
> -#endif
>  		/* Even if we own the page, we do not use atomic_set().
>  		 * This would break get_page_unless_zero() users.
>  		 */
> @@ -87,11 +82,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  		/* reset page count bias and offset to start of new frag */
>  		nc->pfmemalloc = page_is_pfmemalloc(page);
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->offset = size;
> +		nc->offset = 0;
>  	}
>  
> -	offset = nc->offset - fragsz;
> -	if (unlikely(offset < 0)) {
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	/* if size can vary use size else just use PAGE_SIZE */
> +	size = nc->size;
> +#else
> +	size = PAGE_SIZE;
> +#endif
> +
> +	offset = ALIGN(nc->offset, -align_mask);

I am not sure if using -align_mask here with the ALIGN macro is really
to your benefit. I would be curious what the compiler is generating.

Again, I think you would be much better off with:
	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);

That will save you a number of conversions as the use of the ALIGN
macro gives you:
	offset = (nc->offset + (-align_mask - 1)) & ~(-align_mask -
1);

whereas what I am suggesting gives you:
	offset = (nc->offset + ~align_mask) & ~(~align_mask));

My main concern is that I am not sure the compiler will optimize around
the combination of bit operations and arithmetic operations. It seems
much cleaner to me to stick to the bitwise operations for the alignment
than to force this into the vhost approach which requires a power of 2
aligned mask.

Also the old code was aligning on the combination of offset AND fragsz.
This new logic is aligning on offset only. Do we run the risk of
overwriting blocks of neighbouring fragments if two users of
napi_alloc_frag_align end up passing arguments that have different
alignment values?

> +	if (unlikely(offset + fragsz > size)) {
>  		page = virt_to_page(nc->va);
>  
>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> @@ -102,17 +104,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  			goto refill;
>  		}
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -		/* if size can vary use size else just use PAGE_SIZE */
> -		size = nc->size;
> -#endif
>  		/* OK, page count is 0, we can safely set it */
>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>  
>  		/* reset page count bias and offset to start of new frag */
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		offset = size - fragsz;
> -		if (unlikely(offset < 0)) {
> +		offset = 0;
> +		if (unlikely(fragsz > size)) {

This check can probably be moved now. It was placed here to optimize
things as a check of offset < 0 was a single jump command based on the
signed flag being set as a result of the offset calculation.

It might make sense to pull this out of here and instead place it at
the start of this block after the initial check with offset + fragsz >
size since that would shorten the need to carry the size variable.

>  			/*
>  			 * The caller is trying to allocate a fragment
>  			 * with fragsz > PAGE_SIZE but the cache isn't big
> @@ -127,8 +125,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  	}
>  
>  	nc->pagecnt_bias--;
> -	offset &= align_mask;
> -	nc->offset = offset;
> +	nc->offset = offset + fragsz;
>  
>  	return nc->va + offset;
>  }
Yunsheng Lin April 16, 2024, 1:11 p.m. UTC | #2
On 2024/4/16 7:55, Alexander H Duyck wrote:
> On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
>> We are above to use page_frag_alloc_*() API to not just
>> allocate memory for skb->data, but also use them to do
>> the memory allocation for skb frag too. Currently the
>> implementation of page_frag in mm subsystem is running
>> the offset as a countdown rather than count-up value,
>> there may have several advantages to that as mentioned
>> in [1], but it may have some disadvantages, for example,
>> it may disable skb frag coaleasing and more correct cache
>> prefetching
>>
>> We have a trade-off to make in order to have a unified
>> implementation and API for page_frag, so use a initial zero
>> offset in this patch, and the following patch will try to
>> make some optimization to aovid the disadvantages as much
>> as possible.
>>
>> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  mm/page_frag_cache.c | 31 ++++++++++++++-----------------
>>  1 file changed, 14 insertions(+), 17 deletions(-)
>>
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index 64993b5d1243..dc864ee09536 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -65,9 +65,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  			      unsigned int fragsz, gfp_t gfp_mask,
>>  			      unsigned int align_mask)
>>  {
>> -	unsigned int size = PAGE_SIZE;
>> +	unsigned int size, offset;
>>  	struct page *page;
>> -	int offset;
>>  
>>  	if (unlikely(!nc->va)) {
>>  refill:
>> @@ -75,10 +74,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  		if (!page)
>>  			return NULL;
>>  
>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -		/* if size can vary use size else just use PAGE_SIZE */
>> -		size = nc->size;
>> -#endif
>>  		/* Even if we own the page, we do not use atomic_set().
>>  		 * This would break get_page_unless_zero() users.
>>  		 */
>> @@ -87,11 +82,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  		/* reset page count bias and offset to start of new frag */
>>  		nc->pfmemalloc = page_is_pfmemalloc(page);
>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -		nc->offset = size;
>> +		nc->offset = 0;
>>  	}
>>  
>> -	offset = nc->offset - fragsz;
>> -	if (unlikely(offset < 0)) {
>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> +	/* if size can vary use size else just use PAGE_SIZE */
>> +	size = nc->size;
>> +#else
>> +	size = PAGE_SIZE;
>> +#endif
>> +
>> +	offset = ALIGN(nc->offset, -align_mask);
> 
> I am not sure if using -align_mask here with the ALIGN macro is really
> to your benefit. I would be curious what the compiler is generating.
> 
> Again, I think you would be much better off with:
> 	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> 
> That will save you a number of conversions as the use of the ALIGN
> macro gives you:
> 	offset = (nc->offset + (-align_mask - 1)) & ~(-align_mask -
> 1);
> 
> whereas what I am suggesting gives you:
> 	offset = (nc->offset + ~align_mask) & ~(~align_mask));
> 
> My main concern is that I am not sure the compiler will optimize around
> the combination of bit operations and arithmetic operations. It seems
> much cleaner to me to stick to the bitwise operations for the alignment
> than to force this into the vhost approach which requires a power of 2
> aligned mask.

My argument about the above is in [1]. But since you seems to not be working
through the next patch yet, I might just do it as you suggested in the next
version so that I don't have to repeat my argument again:(

1. https://lore.kernel.org/all/df826acf-8867-7eb6-e7f0-962c106bc28b@huawei.com/

> 
> Also the old code was aligning on the combination of offset AND fragsz.
> This new logic is aligning on offset only. Do we run the risk of
> overwriting blocks of neighbouring fragments if two users of
> napi_alloc_frag_align end up passing arguments that have different
> alignment values?

I am not sure I understand the question here.
As my understanding, both the old code and new code is aligning on
the offset, and both might have space reserved before the offset
due to aligning. The memory returned to the caller is in the range
of [offset, offset + fragsz). Am I missing something obvious here?

> 
>> +	if (unlikely(offset + fragsz > size)) {
>>  		page = virt_to_page(nc->va);
>>  
>>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>> @@ -102,17 +104,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  			goto refill;
>>  		}
>>  
>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -		/* if size can vary use size else just use PAGE_SIZE */
>> -		size = nc->size;
>> -#endif
>>  		/* OK, page count is 0, we can safely set it */
>>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>  
>>  		/* reset page count bias and offset to start of new frag */
>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -		offset = size - fragsz;
>> -		if (unlikely(offset < 0)) {
>> +		offset = 0;
>> +		if (unlikely(fragsz > size)) {
> 
> This check can probably be moved now. It was placed here to optimize
> things as a check of offset < 0 was a single jump command based on the
> signed flag being set as a result of the offset calculation.
> 
> It might make sense to pull this out of here and instead place it at
> the start of this block after the initial check with offset + fragsz >
> size since that would shorten the need to carry the size variable.

Yes, that is better.

But does it make more sense to just do the 'fragsz > PAGE_SIZE' checking
alongside with the aligning checking, as we have a better chance of
succeding in allocating order 0 page than order 3 page, so it seems the
caller is not allowed to pass a fragsz being bigger than PAGE_SIZE anyway?

> 
>>  			/*
>>  			 * The caller is trying to allocate a fragment
>>  			 * with fragsz > PAGE_SIZE but the cache isn't big
>> @@ -127,8 +125,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  	}
>>  
>>  	nc->pagecnt_bias--;
>> -	offset &= align_mask;
>> -	nc->offset = offset;
>> +	nc->offset = offset + fragsz;
>>  
>>  	return nc->va + offset;
>>  }
> 
> .
>
Alexander Duyck April 16, 2024, 3:51 p.m. UTC | #3
On Tue, 2024-04-16 at 21:11 +0800, Yunsheng Lin wrote:
> On 2024/4/16 7:55, Alexander H Duyck wrote:
> > On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
> > > We are above to use page_frag_alloc_*() API to not just
> > > allocate memory for skb->data, but also use them to do
> > > the memory allocation for skb frag too. Currently the
> > > implementation of page_frag in mm subsystem is running
> > > the offset as a countdown rather than count-up value,
> > > there may have several advantages to that as mentioned
> > > in [1], but it may have some disadvantages, for example,
> > > it may disable skb frag coaleasing and more correct cache
> > > prefetching
> > > 
> > > We have a trade-off to make in order to have a unified
> > > implementation and API for page_frag, so use a initial zero
> > > offset in this patch, and the following patch will try to
> > > make some optimization to aovid the disadvantages as much
> > > as possible.
> > > 
> > > 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
> > > 
> > > CC: Alexander Duyck <alexander.duyck@gmail.com>
> > > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> > > ---
> > >  mm/page_frag_cache.c | 31 ++++++++++++++-----------------
> > >  1 file changed, 14 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> > > index 64993b5d1243..dc864ee09536 100644
> > > --- a/mm/page_frag_cache.c
> > > +++ b/mm/page_frag_cache.c
> > > @@ -65,9 +65,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> > >  			      unsigned int fragsz, gfp_t gfp_mask,
> > >  			      unsigned int align_mask)
> > >  {
> > > -	unsigned int size = PAGE_SIZE;
> > > +	unsigned int size, offset;
> > >  	struct page *page;
> > > -	int offset;
> > >  
> > >  	if (unlikely(!nc->va)) {
> > >  refill:
> > > @@ -75,10 +74,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> > >  		if (!page)
> > >  			return NULL;
> > >  
> > > -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > > -		/* if size can vary use size else just use PAGE_SIZE */
> > > -		size = nc->size;
> > > -#endif
> > >  		/* Even if we own the page, we do not use atomic_set().
> > >  		 * This would break get_page_unless_zero() users.
> > >  		 */
> > > @@ -87,11 +82,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> > >  		/* reset page count bias and offset to start of new frag */
> > >  		nc->pfmemalloc = page_is_pfmemalloc(page);
> > >  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> > > -		nc->offset = size;
> > > +		nc->offset = 0;
> > >  	}
> > >  
> > > -	offset = nc->offset - fragsz;
> > > -	if (unlikely(offset < 0)) {
> > > +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > > +	/* if size can vary use size else just use PAGE_SIZE */
> > > +	size = nc->size;
> > > +#else
> > > +	size = PAGE_SIZE;
> > > +#endif
> > > +
> > > +	offset = ALIGN(nc->offset, -align_mask);
> > 
> > I am not sure if using -align_mask here with the ALIGN macro is really
> > to your benefit. I would be curious what the compiler is generating.
> > 
> > Again, I think you would be much better off with:
> > 	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> > 
> > That will save you a number of conversions as the use of the ALIGN
> > macro gives you:
> > 	offset = (nc->offset + (-align_mask - 1)) & ~(-align_mask -
> > 1);
> > 
> > whereas what I am suggesting gives you:
> > 	offset = (nc->offset + ~align_mask) & ~(~align_mask));
> > 
> > My main concern is that I am not sure the compiler will optimize around
> > the combination of bit operations and arithmetic operations. It seems
> > much cleaner to me to stick to the bitwise operations for the alignment
> > than to force this into the vhost approach which requires a power of 2
> > aligned mask.
> 
> My argument about the above is in [1]. But since you seems to not be working
> through the next patch yet, I might just do it as you suggested in the next
> version so that I don't have to repeat my argument again:(
> 
> 1. https://lore.kernel.org/all/df826acf-8867-7eb6-e7f0-962c106bc28b@huawei.com/

Sorry, I didn't have time to go digging through the mailing list to
review all the patches from the last set. I was only Cced on a few of
them as I recall. As you know I have the fbnic patches I also have been
trying to get pushed out so that was my primary focus the last couple
weeks.

That said, this just goes into my earlier complaints. You are now
optimizing for the non-aligned paths. There are few callers that are
asking for this to provide non-aligned segments. In most cases they are
at least cache aligned. Specifically the __netdev_alloc_frag_align and
__napi_alloc_frag_align are aligning things at a minimum to
SMP_CACHE_BYTES by aligning the fragsz argument using SKB_DATA_ALIGN.
Perhaps it would be better to actually incorporate that alignment
guarantee into the calls themselves by doing an &= with the align_mask
request for those two functions to make this more transparent.

> > 
> > Also the old code was aligning on the combination of offset AND fragsz.
> > This new logic is aligning on offset only. Do we run the risk of
> > overwriting blocks of neighbouring fragments if two users of
> > napi_alloc_frag_align end up passing arguments that have different
> > alignment values?
> 
> I am not sure I understand the question here.
> As my understanding, both the old code and new code is aligning on
> the offset, and both might have space reserved before the offset
> due to aligning. The memory returned to the caller is in the range
> of [offset, offset + fragsz). Am I missing something obvious here?

My main concern is that by aligning offset - fragsz by the alignment
mask we were taking care of all our variables immediately ourselves. If
we didn't provide a correct value it was all traceable to one call as
the assumption was that fragsz would be a multiple of the alignment
value.

With your change the alignment is done in the following call. So now it
splits up the alignment of fragsz from the alignment of the offset. As
such we will probably need to add additional overhead to guarantee
fragsz is a multiple of the alignment.

> > 
> > > +	if (unlikely(offset + fragsz > size)) {
> > >  		page = virt_to_page(nc->va);
> > >  
> > >  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> > > @@ -102,17 +104,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> > >  			goto refill;
> > >  		}
> > >  
> > > -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > > -		/* if size can vary use size else just use PAGE_SIZE */
> > > -		size = nc->size;
> > > -#endif
> > >  		/* OK, page count is 0, we can safely set it */
> > >  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> > >  
> > >  		/* reset page count bias and offset to start of new frag */
> > >  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> > > -		offset = size - fragsz;
> > > -		if (unlikely(offset < 0)) {
> > > +		offset = 0;
> > > +		if (unlikely(fragsz > size)) {
> > 
> > This check can probably be moved now. It was placed here to optimize
> > things as a check of offset < 0 was a single jump command based on the
> > signed flag being set as a result of the offset calculation.
> > 
> > It might make sense to pull this out of here and instead place it at
> > the start of this block after the initial check with offset + fragsz >
> > size since that would shorten the need to carry the size variable.
> 
> Yes, that is better.
> 
> But does it make more sense to just do the 'fragsz > PAGE_SIZE' checking
> alongside with the aligning checking, as we have a better chance of
> succeding in allocating order 0 page than order 3 page, so it seems the
> caller is not allowed to pass a fragsz being bigger than PAGE_SIZE anyway?

Yeah, that should be fine.

> > 
> > >  			/*
> > >  			 * The caller is trying to allocate a fragment
> > >  			 * with fragsz > PAGE_SIZE but the cache isn't big
> > > @@ -127,8 +125,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> > >  	}
> > >  
> > >  	nc->pagecnt_bias--;
> > > -	offset &= align_mask;
> > > -	nc->offset = offset;
> > > +	nc->offset = offset + fragsz;
> > >  
> > >  	return nc->va + offset;
> > >  }
> > 
> > .
> >
Yunsheng Lin April 17, 2024, 1:17 p.m. UTC | #4
On 2024/4/16 23:51, Alexander H Duyck wrote:
> On Tue, 2024-04-16 at 21:11 +0800, Yunsheng Lin wrote:
>> On 2024/4/16 7:55, Alexander H Duyck wrote:
>>> On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
>>>> We are above to use page_frag_alloc_*() API to not just
>>>> allocate memory for skb->data, but also use them to do
>>>> the memory allocation for skb frag too. Currently the
>>>> implementation of page_frag in mm subsystem is running
>>>> the offset as a countdown rather than count-up value,
>>>> there may have several advantages to that as mentioned
>>>> in [1], but it may have some disadvantages, for example,
>>>> it may disable skb frag coaleasing and more correct cache
>>>> prefetching
>>>>
>>>> We have a trade-off to make in order to have a unified
>>>> implementation and API for page_frag, so use a initial zero
>>>> offset in this patch, and the following patch will try to
>>>> make some optimization to aovid the disadvantages as much
>>>> as possible.
>>>>
>>>> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
>>>>
>>>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>>> ---
>>>>  mm/page_frag_cache.c | 31 ++++++++++++++-----------------
>>>>  1 file changed, 14 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>>>> index 64993b5d1243..dc864ee09536 100644
>>>> --- a/mm/page_frag_cache.c
>>>> +++ b/mm/page_frag_cache.c
>>>> @@ -65,9 +65,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>  			      unsigned int fragsz, gfp_t gfp_mask,
>>>>  			      unsigned int align_mask)
>>>>  {
>>>> -	unsigned int size = PAGE_SIZE;
>>>> +	unsigned int size, offset;
>>>>  	struct page *page;
>>>> -	int offset;
>>>>  
>>>>  	if (unlikely(!nc->va)) {
>>>>  refill:
>>>> @@ -75,10 +74,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>  		if (!page)
>>>>  			return NULL;
>>>>  
>>>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>> -		/* if size can vary use size else just use PAGE_SIZE */
>>>> -		size = nc->size;
>>>> -#endif
>>>>  		/* Even if we own the page, we do not use atomic_set().
>>>>  		 * This would break get_page_unless_zero() users.
>>>>  		 */
>>>> @@ -87,11 +82,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>  		/* reset page count bias and offset to start of new frag */
>>>>  		nc->pfmemalloc = page_is_pfmemalloc(page);
>>>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> -		nc->offset = size;
>>>> +		nc->offset = 0;
>>>>  	}
>>>>  
>>>> -	offset = nc->offset - fragsz;
>>>> -	if (unlikely(offset < 0)) {
>>>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>> +	/* if size can vary use size else just use PAGE_SIZE */
>>>> +	size = nc->size;
>>>> +#else
>>>> +	size = PAGE_SIZE;
>>>> +#endif
>>>> +
>>>> +	offset = ALIGN(nc->offset, -align_mask);
>>>
>>> I am not sure if using -align_mask here with the ALIGN macro is really
>>> to your benefit. I would be curious what the compiler is generating.
>>>
>>> Again, I think you would be much better off with:
>>> 	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
>>>
>>> That will save you a number of conversions as the use of the ALIGN
>>> macro gives you:
>>> 	offset = (nc->offset + (-align_mask - 1)) & ~(-align_mask -
>>> 1);
>>>
>>> whereas what I am suggesting gives you:
>>> 	offset = (nc->offset + ~align_mask) & ~(~align_mask));
>>>
>>> My main concern is that I am not sure the compiler will optimize around
>>> the combination of bit operations and arithmetic operations. It seems
>>> much cleaner to me to stick to the bitwise operations for the alignment
>>> than to force this into the vhost approach which requires a power of 2
>>> aligned mask.
>>
>> My argument about the above is in [1]. But since you seems to not be working
>> through the next patch yet, I might just do it as you suggested in the next
>> version so that I don't have to repeat my argument again:(
>>
>> 1. https://lore.kernel.org/all/df826acf-8867-7eb6-e7f0-962c106bc28b@huawei.com/
> 
> Sorry, I didn't have time to go digging through the mailing list to
> review all the patches from the last set. I was only Cced on a few of

I thought adding 'CC: Alexander Duyck <alexander.duyck@gmail.com>' in
the cover letter would enable the git sendmail to send all the patches
to a specific email, apparently it did not. And I seems to only add that
in rfc and v1, but forgot to add it in the newest v2 version:(

> them as I recall. As you know I have the fbnic patches I also have been
> trying to get pushed out so that was my primary focus the last couple
> weeks.

Understood.

> 
> That said, this just goes into my earlier complaints. You are now
> optimizing for the non-aligned paths. There are few callers that are
> asking for this to provide non-aligned segments. In most cases they are

I suppose that 'optimizing for the non-aligned paths' is referring to
doing the data alignment in a inline helper for aligned API caller and
avoid doing the data alignment for non-aligned API caller in the patch 6?

For the existing user, it seems there are more callers for the non-aligned
API than callers for aligned API:

Referenced in 13 files:
https://elixir.bootlin.com/linux/v6.7-rc8/A/ident/napi_alloc_frag

Referenced in 2 files:
https://elixir.bootlin.com/linux/v6.7-rc8/A/ident/napi_alloc_frag_align

Referenced in 15 files:
https://elixir.bootlin.com/linux/v6.7-rc8/A/ident/netdev_alloc_frag

No references found in the database
https://elixir.bootlin.com/linux/v6.7-rc8/A/ident/netdev_alloc_frag_align

Referenced in 6 files:
https://elixir.bootlin.com/linux/v6.7-rc8/A/ident/page_frag_alloc

Referenced in 3 files:
https://elixir.bootlin.com/linux/v6.7-rc8/A/ident/page_frag_alloc_align

And we are adding new users mostly asking for non-aligned segments in
patch 13.

I would argue that it is not about optimizing for the non-aligned paths,
it is about avoid doing the data alignment operation for non-aligned API.

> at least cache aligned. Specifically the __netdev_alloc_frag_align and
> __napi_alloc_frag_align are aligning things at a minimum to
> SMP_CACHE_BYTES by aligning the fragsz argument using SKB_DATA_ALIGN.

It seems the above is doing the aligning operation for fragsz, most of
callers are calling __netdev_alloc_frag_align() and __napi_alloc_frag_align()
with align_mask being ~0u.

> Perhaps it would be better to actually incorporate that alignment
> guarantee into the calls themselves by doing an &= with the align_mask
> request for those two functions to make this more transparent.

Did you means doing something like below for fragsz too?
fragsz = __ALIGN_KERNEL_MASK(fragsz, ~align_mask);

> 
>>>
>>> Also the old code was aligning on the combination of offset AND fragsz.
>>> This new logic is aligning on offset only. Do we run the risk of
>>> overwriting blocks of neighbouring fragments if two users of
>>> napi_alloc_frag_align end up passing arguments that have different
>>> alignment values?
>>
>> I am not sure I understand the question here.
>> As my understanding, both the old code and new code is aligning on
>> the offset, and both might have space reserved before the offset
>> due to aligning. The memory returned to the caller is in the range
>> of [offset, offset + fragsz). Am I missing something obvious here?
> 
> My main concern is that by aligning offset - fragsz by the alignment
> mask we were taking care of all our variables immediately ourselves. If
> we didn't provide a correct value it was all traceable to one call as
> the assumption was that fragsz would be a multiple of the alignment
> value.
> 
> With your change the alignment is done in the following call. So now it
> splits up the alignment of fragsz from the alignment of the offset. As
> such we will probably need to add additional overhead to guarantee
> fragsz is a multiple of the alignment.

I am not thinking it through how the above will affect the API caller yet
if different caller is passing different alignment for the same
'page_frag_cache' instance, does it cause some cache bouncing or dma
issue if used for dma?

I am supposing it depends on what alignment semantics are we providing
here:
1. Ensure alignment for both offset and fragsz.
2. Ensure alignment for offset only.
3. Ensure alignment for fragsz only.

It seems you are in favour of option 1?
I am supposing it is a balance between performance and API flexibility
here? If it is possible to enforce the caller to use the same alignment
for the same 'page_frag_cache' instance, and give a warning if it is
not using the same alignment? So that we only need to ensure alignment
for offset or fragsz, but not both of them.

I am not sure if there is a strong use case to support both alignment
for offset and fragsz, we might create a new API for it if it is a
strong use case?
diff mbox series

Patch

diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 64993b5d1243..dc864ee09536 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -65,9 +65,8 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			      unsigned int fragsz, gfp_t gfp_mask,
 			      unsigned int align_mask)
 {
-	unsigned int size = PAGE_SIZE;
+	unsigned int size, offset;
 	struct page *page;
-	int offset;
 
 	if (unlikely(!nc->va)) {
 refill:
@@ -75,10 +74,6 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		if (!page)
 			return NULL;
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
@@ -87,11 +82,18 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
+		nc->offset = 0;
 	}
 
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	/* if size can vary use size else just use PAGE_SIZE */
+	size = nc->size;
+#else
+	size = PAGE_SIZE;
+#endif
+
+	offset = ALIGN(nc->offset, -align_mask);
+	if (unlikely(offset + fragsz > size)) {
 		page = virt_to_page(nc->va);
 
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
@@ -102,17 +104,13 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			goto refill;
 		}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* OK, page count is 0, we can safely set it */
 		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
+		offset = 0;
+		if (unlikely(fragsz > size)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -127,8 +125,7 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 	}
 
 	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
+	nc->offset = offset + fragsz;
 
 	return nc->va + offset;
 }