diff mbox series

[net-next,v1,02/12] mm: page_frag: use initial zero offset for page_frag_alloc_align()

Message ID 20240407130850.19625-3-linyunsheng@huawei.com (mailing list archive)
State New
Headers show
Series [net-next,v1,01/12] mm: Move the page fragment allocator from page_alloc into its own file | expand

Commit Message

Yunsheng Lin April 7, 2024, 1:08 p.m. UTC
We are above to use page_frag_alloc_*() API to not just
allocate memory for skb->data, but also use them to do
the memory allocation for skb frag too. Currently the
implementation of page_frag in mm subsystem is running
the offset as a countdown rather than count-up value,
there may have several advantages to that as mentioned
in [1], but it may have some disadvantages, for example,
it may disable skb frag coaleasing and more correct cache
prefetching

We have a trade-off to make in order to have a unified
implementation and API for page_frag, so use a initial zero
offset in this patch, and the following patch will try to
make some optimization to aovid the disadvantages as much
as possible.

1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/page_frag_cache.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

Comments

Alexander Duyck April 7, 2024, 5:52 p.m. UTC | #1
On Sun, 2024-04-07 at 21:08 +0800, Yunsheng Lin wrote:
> We are above to use page_frag_alloc_*() API to not just
> allocate memory for skb->data, but also use them to do
> the memory allocation for skb frag too. Currently the
> implementation of page_frag in mm subsystem is running
> the offset as a countdown rather than count-up value,
> there may have several advantages to that as mentioned
> in [1], but it may have some disadvantages, for example,
> it may disable skb frag coaleasing and more correct cache
> prefetching
> 
> We have a trade-off to make in order to have a unified
> implementation and API for page_frag, so use a initial zero
> offset in this patch, and the following patch will try to
> make some optimization to aovid the disadvantages as much
> as possible.
> 
> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  mm/page_frag_cache.c | 31 ++++++++++++++-----------------
>  1 file changed, 14 insertions(+), 17 deletions(-)
> 
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index a0f90ba25200..3e3e88d9af90 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -67,9 +67,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  			      unsigned int fragsz, gfp_t gfp_mask,
>  			      unsigned int align_mask)
>  {
> -	unsigned int size = PAGE_SIZE;
> +	unsigned int size, offset;
>  	struct page *page;
> -	int offset;
>  
>  	if (unlikely(!nc->va)) {
>  refill:
> @@ -77,10 +76,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  		if (!page)
>  			return NULL;
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -		/* if size can vary use size else just use PAGE_SIZE */
> -		size = nc->size;
> -#endif
>  		/* Even if we own the page, we do not use atomic_set().
>  		 * This would break get_page_unless_zero() users.
>  		 */
> @@ -89,11 +84,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  		/* reset page count bias and offset to start of new frag */
>  		nc->pfmemalloc = page_is_pfmemalloc(page);
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->offset = size;
> +		nc->offset = 0;
>  	}
>  
> -	offset = nc->offset - fragsz;
> -	if (unlikely(offset < 0)) {
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	/* if size can vary use size else just use PAGE_SIZE */
> +	size = nc->size;
> +#else
> +	size = PAGE_SIZE;
> +#endif
> +
> +	offset = ALIGN(nc->offset, -align_mask);
> +	if (unlikely(offset + fragsz > size)) {

Rather than using "ALIGN" with a negative value it would probably make
more sense to use __ALIGN_KERNEL_MASK with ~align_mask. I am not sure
how well the compiler sorts out the use of negatives to flip values
that are then converted to masks with the "(a) - 1".

>  		page = virt_to_page(nc->va);
>  
>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> @@ -104,17 +106,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  			goto refill;
>  		}
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -		/* if size can vary use size else just use PAGE_SIZE */
> -		size = nc->size;
> -#endif
>  		/* OK, page count is 0, we can safely set it */
>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>  
>  		/* reset page count bias and offset to start of new frag */
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		offset = size - fragsz;
> -		if (unlikely(offset < 0)) {
> +		offset = 0;
> +		if (unlikely(fragsz > size)) {
>  			/*
>  			 * The caller is trying to allocate a fragment
>  			 * with fragsz > PAGE_SIZE but the cache isn't big
> @@ -129,8 +127,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>  	}
>  
>  	nc->pagecnt_bias--;
> -	offset &= align_mask;
> -	nc->offset = offset;
> +	nc->offset = offset + fragsz;
>  
>  	return nc->va + offset;
>  }
Yunsheng Lin April 8, 2024, 1:39 p.m. UTC | #2
On 2024/4/8 1:52, Alexander H Duyck wrote:
> On Sun, 2024-04-07 at 21:08 +0800, Yunsheng Lin wrote:
>> We are above to use page_frag_alloc_*() API to not just
>> allocate memory for skb->data, but also use them to do
>> the memory allocation for skb frag too. Currently the
>> implementation of page_frag in mm subsystem is running
>> the offset as a countdown rather than count-up value,
>> there may have several advantages to that as mentioned
>> in [1], but it may have some disadvantages, for example,
>> it may disable skb frag coaleasing and more correct cache
>> prefetching
>>
>> We have a trade-off to make in order to have a unified
>> implementation and API for page_frag, so use a initial zero
>> offset in this patch, and the following patch will try to
>> make some optimization to aovid the disadvantages as much
>> as possible.
>>
>> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  mm/page_frag_cache.c | 31 ++++++++++++++-----------------
>>  1 file changed, 14 insertions(+), 17 deletions(-)
>>
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index a0f90ba25200..3e3e88d9af90 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -67,9 +67,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  			      unsigned int fragsz, gfp_t gfp_mask,
>>  			      unsigned int align_mask)
>>  {
>> -	unsigned int size = PAGE_SIZE;
>> +	unsigned int size, offset;
>>  	struct page *page;
>> -	int offset;
>>  
>>  	if (unlikely(!nc->va)) {
>>  refill:
>> @@ -77,10 +76,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  		if (!page)
>>  			return NULL;
>>  
>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -		/* if size can vary use size else just use PAGE_SIZE */
>> -		size = nc->size;
>> -#endif
>>  		/* Even if we own the page, we do not use atomic_set().
>>  		 * This would break get_page_unless_zero() users.
>>  		 */
>> @@ -89,11 +84,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>  		/* reset page count bias and offset to start of new frag */
>>  		nc->pfmemalloc = page_is_pfmemalloc(page);
>>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -		nc->offset = size;
>> +		nc->offset = 0;
>>  	}
>>  
>> -	offset = nc->offset - fragsz;
>> -	if (unlikely(offset < 0)) {
>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> +	/* if size can vary use size else just use PAGE_SIZE */
>> +	size = nc->size;
>> +#else
>> +	size = PAGE_SIZE;
>> +#endif
>> +
>> +	offset = ALIGN(nc->offset, -align_mask);
>> +	if (unlikely(offset + fragsz > size)) {
> 
> Rather than using "ALIGN" with a negative value it would probably make
> more sense to use __ALIGN_KERNEL_MASK with ~align_mask. I am not sure
> how well the compiler sorts out the use of negatives to flip values
> that are then converted to masks with the "(a) - 1".

The next patch will remove the '-' in '-align_mask' as the 'ALIGN' operation
is done in the inline helper. I am not sure that matter much to use
__ALIGN_KERNEL_MASK with ~align_mask?

>
Alexander Duyck April 8, 2024, 4:11 p.m. UTC | #3
On Mon, Apr 8, 2024 at 6:39 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/8 1:52, Alexander H Duyck wrote:
> > On Sun, 2024-04-07 at 21:08 +0800, Yunsheng Lin wrote:
> >> We are above to use page_frag_alloc_*() API to not just
> >> allocate memory for skb->data, but also use them to do
> >> the memory allocation for skb frag too. Currently the
> >> implementation of page_frag in mm subsystem is running
> >> the offset as a countdown rather than count-up value,
> >> there may have several advantages to that as mentioned
> >> in [1], but it may have some disadvantages, for example,
> >> it may disable skb frag coaleasing and more correct cache
> >> prefetching
> >>
> >> We have a trade-off to make in order to have a unified
> >> implementation and API for page_frag, so use a initial zero
> >> offset in this patch, and the following patch will try to
> >> make some optimization to aovid the disadvantages as much
> >> as possible.
> >>
> >> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
> >>
> >> CC: Alexander Duyck <alexander.duyck@gmail.com>
> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> >> ---
> >>  mm/page_frag_cache.c | 31 ++++++++++++++-----------------
> >>  1 file changed, 14 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> >> index a0f90ba25200..3e3e88d9af90 100644
> >> --- a/mm/page_frag_cache.c
> >> +++ b/mm/page_frag_cache.c
> >> @@ -67,9 +67,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>                            unsigned int fragsz, gfp_t gfp_mask,
> >>                            unsigned int align_mask)
> >>  {
> >> -    unsigned int size = PAGE_SIZE;
> >> +    unsigned int size, offset;
> >>      struct page *page;
> >> -    int offset;
> >>
> >>      if (unlikely(!nc->va)) {
> >>  refill:
> >> @@ -77,10 +76,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>              if (!page)
> >>                      return NULL;
> >>
> >> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >> -            /* if size can vary use size else just use PAGE_SIZE */
> >> -            size = nc->size;
> >> -#endif
> >>              /* Even if we own the page, we do not use atomic_set().
> >>               * This would break get_page_unless_zero() users.
> >>               */
> >> @@ -89,11 +84,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
> >>              /* reset page count bias and offset to start of new frag */
> >>              nc->pfmemalloc = page_is_pfmemalloc(page);
> >>              nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >> -            nc->offset = size;
> >> +            nc->offset = 0;
> >>      }
> >>
> >> -    offset = nc->offset - fragsz;
> >> -    if (unlikely(offset < 0)) {
> >> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >> +    /* if size can vary use size else just use PAGE_SIZE */
> >> +    size = nc->size;
> >> +#else
> >> +    size = PAGE_SIZE;
> >> +#endif
> >> +
> >> +    offset = ALIGN(nc->offset, -align_mask);
> >> +    if (unlikely(offset + fragsz > size)) {
> >
> > Rather than using "ALIGN" with a negative value it would probably make
> > more sense to use __ALIGN_KERNEL_MASK with ~align_mask. I am not sure
> > how well the compiler sorts out the use of negatives to flip values
> > that are then converted to masks with the "(a) - 1".
>
> The next patch will remove the '-' in '-align_mask' as the 'ALIGN' operation
> is done in the inline helper. I am not sure that matter much to use
> __ALIGN_KERNEL_MASK with ~align_mask?

It is a matter of making the negations more obvious. Basically you
could achieve the same alignment by doing:
  (offset + (~align_mask)) & ~(~align_mask)
rather than:
  (offset + ((-align_mask) - 1)) & ~((-align_mask) - 1)

I'm not sure the compiler will pick up on the fact that the two are
identical and can save a number of operations. Also my suggested
approach is closer to how it used to work. Technically the one you are
using only works if align_mask is a negative power of 2.
Yunsheng Lin April 9, 2024, 7:59 a.m. UTC | #4
On 2024/4/9 0:11, Alexander Duyck wrote:
> On Mon, Apr 8, 2024 at 6:39 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/4/8 1:52, Alexander H Duyck wrote:
>>> On Sun, 2024-04-07 at 21:08 +0800, Yunsheng Lin wrote:
>>>> We are above to use page_frag_alloc_*() API to not just
>>>> allocate memory for skb->data, but also use them to do
>>>> the memory allocation for skb frag too. Currently the
>>>> implementation of page_frag in mm subsystem is running
>>>> the offset as a countdown rather than count-up value,
>>>> there may have several advantages to that as mentioned
>>>> in [1], but it may have some disadvantages, for example,
>>>> it may disable skb frag coaleasing and more correct cache
>>>> prefetching
>>>>
>>>> We have a trade-off to make in order to have a unified
>>>> implementation and API for page_frag, so use a initial zero
>>>> offset in this patch, and the following patch will try to
>>>> make some optimization to aovid the disadvantages as much
>>>> as possible.
>>>>
>>>> 1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
>>>>
>>>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>>> ---
>>>>  mm/page_frag_cache.c | 31 ++++++++++++++-----------------
>>>>  1 file changed, 14 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>>>> index a0f90ba25200..3e3e88d9af90 100644
>>>> --- a/mm/page_frag_cache.c
>>>> +++ b/mm/page_frag_cache.c
>>>> @@ -67,9 +67,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>                            unsigned int fragsz, gfp_t gfp_mask,
>>>>                            unsigned int align_mask)
>>>>  {
>>>> -    unsigned int size = PAGE_SIZE;
>>>> +    unsigned int size, offset;
>>>>      struct page *page;
>>>> -    int offset;
>>>>
>>>>      if (unlikely(!nc->va)) {
>>>>  refill:
>>>> @@ -77,10 +76,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>              if (!page)
>>>>                      return NULL;
>>>>
>>>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>> -            /* if size can vary use size else just use PAGE_SIZE */
>>>> -            size = nc->size;
>>>> -#endif
>>>>              /* Even if we own the page, we do not use atomic_set().
>>>>               * This would break get_page_unless_zero() users.
>>>>               */
>>>> @@ -89,11 +84,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
>>>>              /* reset page count bias and offset to start of new frag */
>>>>              nc->pfmemalloc = page_is_pfmemalloc(page);
>>>>              nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> -            nc->offset = size;
>>>> +            nc->offset = 0;
>>>>      }
>>>>
>>>> -    offset = nc->offset - fragsz;
>>>> -    if (unlikely(offset < 0)) {
>>>> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>> +    /* if size can vary use size else just use PAGE_SIZE */
>>>> +    size = nc->size;
>>>> +#else
>>>> +    size = PAGE_SIZE;
>>>> +#endif
>>>> +
>>>> +    offset = ALIGN(nc->offset, -align_mask);
>>>> +    if (unlikely(offset + fragsz > size)) {
>>>
>>> Rather than using "ALIGN" with a negative value it would probably make
>>> more sense to use __ALIGN_KERNEL_MASK with ~align_mask. I am not sure
>>> how well the compiler sorts out the use of negatives to flip values
>>> that are then converted to masks with the "(a) - 1".
>>
>> The next patch will remove the '-' in '-align_mask' as the 'ALIGN' operation
>> is done in the inline helper. I am not sure that matter much to use
>> __ALIGN_KERNEL_MASK with ~align_mask?
> 
> It is a matter of making the negations more obvious. Basically you
> could achieve the same alignment by doing:
>   (offset + (~align_mask)) & ~(~align_mask)
> rather than:
>   (offset + ((-align_mask) - 1)) & ~((-align_mask) - 1)
> 
> I'm not sure the compiler will pick up on the fact that the two are
> identical and can save a number of operations. Also my suggested
> approach is closer to how it used to work. Technically the one you are
> using only works if align_mask is a negative power of 2.

In patch 3, we have below, so the above trick is not really needed after
patch 3:


@@ -94,7 +93,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 	size = PAGE_SIZE;
 #endif

-	offset = ALIGN(nc->offset, -align_mask);
+	offset = nc->offset;
 	if (unlikely(offset + fragsz > size)) {
 		page = virt_to_page(nc->va);

@@ -131,7 +130,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,

 	return nc->va + offset;
 }
-EXPORT_SYMBOL(__page_frag_alloc_align);
+EXPORT_SYMBOL(page_frag_alloc);

...

+static inline void *__page_frag_alloc_align(struct page_frag_cache *nc,
+					    unsigned int fragsz, gfp_t gfp_mask,
+					    unsigned int align)
+{
+	nc->offset = ALIGN(nc->offset, align);
+
+	return page_frag_alloc(nc, fragsz, gfp_mask);
+}

 static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
 					  unsigned int fragsz, gfp_t gfp_mask,
 					  unsigned int align)
 {
 	WARN_ON_ONCE(!is_power_of_2(align));
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
-}



> .
>
diff mbox series

Patch

diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index a0f90ba25200..3e3e88d9af90 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -67,9 +67,8 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			      unsigned int fragsz, gfp_t gfp_mask,
 			      unsigned int align_mask)
 {
-	unsigned int size = PAGE_SIZE;
+	unsigned int size, offset;
 	struct page *page;
-	int offset;
 
 	if (unlikely(!nc->va)) {
 refill:
@@ -77,10 +76,6 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		if (!page)
 			return NULL;
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
@@ -89,11 +84,18 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
+		nc->offset = 0;
 	}
 
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	/* if size can vary use size else just use PAGE_SIZE */
+	size = nc->size;
+#else
+	size = PAGE_SIZE;
+#endif
+
+	offset = ALIGN(nc->offset, -align_mask);
+	if (unlikely(offset + fragsz > size)) {
 		page = virt_to_page(nc->va);
 
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
@@ -104,17 +106,13 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			goto refill;
 		}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* OK, page count is 0, we can safely set it */
 		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
+		offset = 0;
+		if (unlikely(fragsz > size)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -129,8 +127,7 @@  void *__page_frag_alloc_align(struct page_frag_cache *nc,
 	}
 
 	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
+	nc->offset = offset + fragsz;
 
 	return nc->va + offset;
 }