diff mbox series

[net-next,v2,09/15] mm: page_frag: reuse MSB of 'size' field for pfmemalloc

Message ID 20240415131941.51153-10-linyunsheng@huawei.com (mailing list archive)
State New
Headers show
Series [net-next,v2,01/15] mm: page_frag: add a test module for page_frag | expand

Commit Message

Yunsheng Lin April 15, 2024, 1:19 p.m. UTC
The '(PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)' case is for the
system with page size less than 32KB, which is 0x8000 bytes
requiring 16 bits of space; change 'size' to 'size_mask' to avoid
using the MSB, and change the 'pfmemalloc' field to reuse that
MSB, so that we remove the original space needed by 'pfmemalloc'.

For another case, the MSB of 'offset' is reused for 'pfmemalloc'.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h | 13 ++++++++-----
 mm/page_frag_cache.c            |  5 +++--
 2 files changed, 11 insertions(+), 7 deletions(-)

Comments

Alexander Duyck April 16, 2024, 4:22 p.m. UTC | #1
On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
> The '(PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)' case is for the
> system with page size less than 32KB, which is 0x8000 bytes
> requiring 16 bits space, change 'size' to 'size_mask' to avoid
> using the MSB, and change 'pfmemalloc' field to reuse the that
> MSB, so that we remove the orginal space needed by 'pfmemalloc'.
> 
> For another case, the MSB of 'offset' is reused for 'pfmemalloc'.
> 
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h | 13 ++++++++-----
>  mm/page_frag_cache.c            |  5 +++--
>  2 files changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index fe5faa80b6c3..40a7d6da9ef0 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -12,15 +12,16 @@ struct page_frag_cache {
>  	void *va;
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>  	__u16 offset;
> -	__u16 size;
> +	__u16 size_mask:15;
> +	__u16 pfmemalloc:1;
>  #else
> -	__u32 offset;
> +	__u32 offset:31;
> +	__u32 pfmemalloc:1;
>  #endif

This seems like a really bad idea. Using a bit-field like this seems
like a waste as it means that all the accesses now have to add
additional operations to access either offset or size. It wasn't as if
this is an oversized struct, or one that we are allocating a ton of. As
such I am not sure why we need to optimize for size like this.

>  	/* we maintain a pagecount bias, so that we dont dirty cache line
>  	 * containing page->_refcount every time we allocate a fragment.
>  	 */
>  	unsigned int		pagecnt_bias;
> -	bool pfmemalloc;
>  };
>  
>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
> @@ -43,7 +44,9 @@ static inline void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  					       gfp_t gfp_mask,
>  					       unsigned int align)
>  {
> -	nc->offset = ALIGN(nc->offset, align);
> +	unsigned int offset = nc->offset;
> +
> +	nc->offset = ALIGN(offset, align);
>  
>  	return page_frag_alloc_va(nc, fragsz, gfp_mask);
>  }
> @@ -53,7 +56,7 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
>  					     gfp_t gfp_mask,
>  					     unsigned int align)
>  {
> -	WARN_ON_ONCE(!is_power_of_2(align));
> +	WARN_ON_ONCE(!is_power_of_2(align) || align >= PAGE_SIZE);

The "align >= PAGE_SIZE" fix should probably go with your change that
reversed the direction.

>  
>  	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, align);
>  }
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 50511d8522d0..8d93029116e1 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -32,7 +32,8 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>  	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>  				PAGE_FRAG_CACHE_MAX_ORDER);
> -	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
> +	nc->size_mask = page ? PAGE_FRAG_CACHE_MAX_SIZE - 1 : PAGE_SIZE - 1;
> +	VM_BUG_ON(page && nc->size_mask != PAGE_FRAG_CACHE_MAX_SIZE - 1);
>  #endif
>  	if (unlikely(!page))
>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> @@ -86,7 +87,7 @@ void *page_frag_alloc_va(struct page_frag_cache *nc, unsigned int fragsz,
>  
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>  	/* if size can vary use size else just use PAGE_SIZE */
> -	size = nc->size;
> +	size = nc->size_mask + 1;
>  #else
>  	size = PAGE_SIZE;
>  #endif

So now we are having to add arithmetic operations to the size in
addition having to mask in order to read the values. That just seems
like that much more overhead.
Yunsheng Lin April 17, 2024, 1:19 p.m. UTC | #2
On 2024/4/17 0:22, Alexander H Duyck wrote:
> On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
>> The '(PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)' case is for the
>> system with page size less than 32KB, which is 0x8000 bytes
>> requiring 16 bits space, change 'size' to 'size_mask' to avoid
>> using the MSB, and change 'pfmemalloc' field to reuse the that
>> MSB, so that we remove the orginal space needed by 'pfmemalloc'.
>>
>> For another case, the MSB of 'offset' is reused for 'pfmemalloc'.
>>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  include/linux/page_frag_cache.h | 13 ++++++++-----
>>  mm/page_frag_cache.c            |  5 +++--
>>  2 files changed, 11 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>> index fe5faa80b6c3..40a7d6da9ef0 100644
>> --- a/include/linux/page_frag_cache.h
>> +++ b/include/linux/page_frag_cache.h
>> @@ -12,15 +12,16 @@ struct page_frag_cache {
>>  	void *va;
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>  	__u16 offset;
>> -	__u16 size;
>> +	__u16 size_mask:15;
>> +	__u16 pfmemalloc:1;
>>  #else
>> -	__u32 offset;
>> +	__u32 offset:31;
>> +	__u32 pfmemalloc:1;
>>  #endif
> 
> This seems like a really bad idea. Using a bit-field like this seems
> like a waste as it means that all the accesses now have to add
> additional operations to access either offset or size. It wasn't as if
> this is an oversized struct, or one that we are allocating a ton of. As
> such I am not sure why we need to optmize for size like this.

For the old 'struct page_frag' use case, there is one 'struct page_frag'
for every socket and task_struct; there may be tens of thousands of
them even in a 32 bit system, which might mean a lot of memory even for
a single byte saving, see patch 13.

> 
>>  	/* we maintain a pagecount bias, so that we dont dirty cache line
>>  	 * containing page->_refcount every time we allocate a fragment.
>>  	 */
>>  	unsigned int		pagecnt_bias;
>> -	bool pfmemalloc;
>>  };
Alexander Duyck April 17, 2024, 3:11 p.m. UTC | #3
On Wed, 2024-04-17 at 21:19 +0800, Yunsheng Lin wrote:
> On 2024/4/17 0:22, Alexander H Duyck wrote:
> > On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
> > > The '(PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)' case is for the
> > > system with page size less than 32KB, which is 0x8000 bytes
> > > requiring 16 bits space, change 'size' to 'size_mask' to avoid
> > > using the MSB, and change 'pfmemalloc' field to reuse the that
> > > MSB, so that we remove the orginal space needed by 'pfmemalloc'.
> > > 
> > > For another case, the MSB of 'offset' is reused for 'pfmemalloc'.
> > > 
> > > Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> > > ---
> > >  include/linux/page_frag_cache.h | 13 ++++++++-----
> > >  mm/page_frag_cache.c            |  5 +++--
> > >  2 files changed, 11 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> > > index fe5faa80b6c3..40a7d6da9ef0 100644
> > > --- a/include/linux/page_frag_cache.h
> > > +++ b/include/linux/page_frag_cache.h
> > > @@ -12,15 +12,16 @@ struct page_frag_cache {
> > >  	void *va;
> > >  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > >  	__u16 offset;
> > > -	__u16 size;
> > > +	__u16 size_mask:15;
> > > +	__u16 pfmemalloc:1;
> > >  #else
> > > -	__u32 offset;
> > > +	__u32 offset:31;
> > > +	__u32 pfmemalloc:1;
> > >  #endif
> > 
> > This seems like a really bad idea. Using a bit-field like this seems
> > like a waste as it means that all the accesses now have to add
> > additional operations to access either offset or size. It wasn't as if
> > this is an oversized struct, or one that we are allocating a ton of. As
> > such I am not sure why we need to optmize for size like this.
> 
> For the old 'struct page_frag' use case, there is one 'struct page_frag'
> for every socket and task_struct, there may be tens of thousands of
> them even in a 32 bit sysmem, which might mean a lof of memory even for
> a single byte saving, see patch 13.
> 

Yeah, I finally had time to finish getting through the patch set last
night. Sorry for quick firing reviews but lately I haven't had much
time to work on upstream work, and as you mentioned last time I only
got to 3 patches so I was trying to speed through.

I get that you are trying to reduce the size but in the next patch you
actually end up overshooting that on x86_64 systems. I am assuming that
is to try to account for the 32b use case? On 64b I am pretty sure you
don't get any gain since a long followed by two u16s and an int will
still be 16B. What we probably need to watch out for is the
optimization for size versus having to add instructions to extract and
insert the data back into the struct.

Anyway as far as this layout I am not sure it is the best way to go.
You are combining pfmemalloc with either size *OR* offset, and then
combining the pagecnt_bias with the va. I'm wondering if it wouldn't
make more sense to look at putting together the structure something
like:

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
typedef u16 page_frag_bias_t;
#else
typedef u32 page_frag_bias_t;
#endif

struct page_frag_cache {
	/* page address and offset */
	void *va;
	page_frag_bias_t pagecnt_bias;
	u8 pfmemalloc;
	u8 page_frag_order;
}

The basic idea would be that we would be able to replace the size mask
with just a shift value representing the page order of the page being
fragmented. With that we can reduce the size to just a single byte. In
addition we could probably leave it there regardless of build as the
order should be initialized to 0 when this is allocated to it would be
correct even in the case where it isn't used (and there isn't much we
can do about the hole anyway).

In addition by combining the virtual address with the offset we can
just use the combined result for what we need. The only item that has
to be worked out is how to deal with the end of a page in the count up
case. However the combination seems the most logical one since they are
meant to be combined ultimately anyway. It does put limits on when we
can align things as we don't want to align ourselves into the next
page, but I think it makes more sense then the other limits that had to
be put on allocations and such in order to allow us to squeeze
pagecnt_bias into the virtual address.

Anyway I pulled in your patches and plan to do a bit of testing, after
I figure out what the nvme disk ID regression is I am seeing. My main
concern can be summed up as the NIC driver use case
(netdev/napi_alloc_frag callers) versus the socket/vhost use case. The
main thing in the case of the NIC driver callers is that we have a need
for isolation and guarantees that we won't lose cache line alignment. I
think those are the callers you are missing in your benchmarks, but
arguably that might be something you cannot test as I don't know what
NICs you have access to and if you have any that are using those calls.
Yunsheng Lin April 18, 2024, 9:39 a.m. UTC | #4
On 2024/4/17 23:11, Alexander H Duyck wrote:
> On Wed, 2024-04-17 at 21:19 +0800, Yunsheng Lin wrote:
>> On 2024/4/17 0:22, Alexander H Duyck wrote:
>>> On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
>>>> The '(PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)' case is for the
>>>> system with page size less than 32KB, which is 0x8000 bytes
>>>> requiring 16 bits space, change 'size' to 'size_mask' to avoid
>>>> using the MSB, and change 'pfmemalloc' field to reuse the that
>>>> MSB, so that we remove the orginal space needed by 'pfmemalloc'.
>>>>
>>>> For another case, the MSB of 'offset' is reused for 'pfmemalloc'.
>>>>
>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>>> ---
>>>>  include/linux/page_frag_cache.h | 13 ++++++++-----
>>>>  mm/page_frag_cache.c            |  5 +++--
>>>>  2 files changed, 11 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>>>> index fe5faa80b6c3..40a7d6da9ef0 100644
>>>> --- a/include/linux/page_frag_cache.h
>>>> +++ b/include/linux/page_frag_cache.h
>>>> @@ -12,15 +12,16 @@ struct page_frag_cache {
>>>>  	void *va;
>>>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>>  	__u16 offset;
>>>> -	__u16 size;
>>>> +	__u16 size_mask:15;
>>>> +	__u16 pfmemalloc:1;
>>>>  #else
>>>> -	__u32 offset;
>>>> +	__u32 offset:31;
>>>> +	__u32 pfmemalloc:1;
>>>>  #endif
>>>
>>> This seems like a really bad idea. Using a bit-field like this seems
>>> like a waste as it means that all the accesses now have to add
>>> additional operations to access either offset or size. It wasn't as if
>>> this is an oversized struct, or one that we are allocating a ton of. As
>>> such I am not sure why we need to optmize for size like this.
>>
>> For the old 'struct page_frag' use case, there is one 'struct page_frag'
>> for every socket and task_struct, there may be tens of thousands of
>> them even in a 32 bit sysmem, which might mean a lof of memory even for
>> a single byte saving, see patch 13.
>>
> 
> Yeah, I finally had time to finish getting through the patch set last
> night. Sorry for quick firing reviews but lately I haven't had much
> time to work on upstream work, and as you mentioned last time I only
> got to 3 patches so I was trying to speed through.
> 
> I get that you are trying to reduce the size but in the next patch you
> actually end up overshooting that on x86_64 systems. I am assuming that
> is to try to account for the 32b use case? On 64b I am pretty sure you
> don't get any gain since a long followed by two u16s and an int will
> still be 16B. What we probably need to watch out for is the
> optimization for size versus having to add instructions to extract and
> insert the data back into the struct.
> 
> Anyway as far as this layout I am not sure it is the best way to go.
> You are combining pfmemalloc with either size *OR* offset, and then

Does it really matter if pfmemalloc is combined with size or offset,
as we are using a bitfield for pfmemalloc?

> combining the pagecnt_bias with the va. I'm wondering if it wouldn't
> make more sense to look at putting together the structure something
> like:
> 
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> typedef u16 page_frag_bias_t;
> #else
> typedef u32 page_frag_bias_t;
> #endif
> 
> struct page_frag_cache {
> 	/* page address and offset */
> 	void *va;

Generally I am agreed with combining the virtual address with the
offset for the reason you mentioned below.

> 	page_frag_bias_t pagecnt_bias;
> 	u8 pfmemalloc;
> 	u8 page_frag_order;
> }

The issue with the 'page_frag_order' I see is that we might need to do
a 'PAGE << page_frag_order' to get actual size, and we might also need
to do 'size - 1' to get the size_mask if we want to mask out the offset
from the 'va'.

For page_frag_order, we need to:
size = PAGE << page_frag_order
size_mask = size - 1

For size_mask, it seem we only need to do:
size = size_mask + 1

And as PAGE_FRAG_CACHE_MAX_SIZE = 32K, which can be fitted into 15 bits
if we use size_mask instead of size.

Does it make sense to use below, so that we only need to use bitfield
for SIZE < PAGE_FRAG_CACHE_MAX_SIZE in 32 bits system? And 'struct
page_frag' is using a similar '(BITS_PER_LONG > 32)' checking trick.

struct page_frag_cache {
	/* page address and offset */
	void *va;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
	u16 pagecnt_bias;
	u16 size_mask:15;
	u16 pfmemalloc:1;
#else
	u32 pagecnt_bias;
	u16 size_mask;
	u16 pfmemalloc;
#endif
};

> 
> The basic idea would be that we would be able to replace the size mask
> with just a shift value representing the page order of the page being
> fragmented. With that we can reduce the size to just a single byte. In
> addition we could probably leave it there regardless of build as the
> order should be initialized to 0 when this is allocated to it would be
> correct even in the case where it isn't used (and there isn't much we
> can do about the hole anyway).
> 
> In addition by combining the virtual address with the offset we can
> just use the combined result for what we need. The only item that has
> to be worked out is how to deal with the end of a page in the count up
> case. However the combination seems the most logical one since they are
> meant to be combined ultimately anyway. It does put limits on when we
> can align things as we don't want to align ourselves into the next

I guess it means we need to mask the offset out of 'va' before doing the
aligning operation and the 'offset + fragsz > size' check, right?

> page, but I think it makes more sense then the other limits that had to
> be put on allocations and such in order to allow us to squeeze
> pagecnt_bias into the virtual address.
> 
> Anyway I pulled in your patches and plan to do a bit of testing, after
> I figure out what the nvme disk ID regression is I am seeing. My main
> concern can be summed up as the NIC driver use case
> (netdev/napi_alloc_frag callers) versus the socket/vhost use case. The
> main thing in the case of the NIC driver callers is that we have a need
> for isolation and guarantees that we won't lose cache line alignment. I
> think those are the callers you are missing in your benchmarks, but
> arguably that might be something you cannot test as I don't know what
> NICs you have access to and if you have any that are using those calls.

I guess we just need to replace the API used by socket/vhost with the one
used by netdev/napi_alloc_frag callers in mm/page_frag_test.c in patch 1,
which is introduced to test performance of page_frag implementation, see:

https://lore.kernel.org/all/20240415131941.51153-2-linyunsheng@huawei.com/

> .
>
Yunsheng Lin April 26, 2024, 9:38 a.m. UTC | #5
On 2024/4/18 17:39, Yunsheng Lin wrote:

...

> 
>> combining the pagecnt_bias with the va. I'm wondering if it wouldn't
>> make more sense to look at putting together the structure something
>> like:
>>
>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> typedef u16 page_frag_bias_t;
>> #else
>> typedef u32 page_frag_bias_t;
>> #endif
>>
>> struct page_frag_cache {
>> 	/* page address and offset */
>> 	void *va;
> 
> Generally I am agreed with combining the virtual address with the
> offset for the reason you mentioned below.
> 
>> 	page_frag_bias_t pagecnt_bias;
>> 	u8 pfmemalloc;
>> 	u8 page_frag_order;
>> }
> 
> The issue with the 'page_frag_order' I see is that we might need to do
> a 'PAGE << page_frag_order' to get actual size, and we might also need
> to do 'size - 1' to get the size_mask if we want to mask out the offset
> from the 'va'.
> 
> For page_frag_order, we need to:
> size = PAGE << page_frag_order
> size_mask = size - 1
> 
> For size_mask, it seem we only need to do:
> size = size_mask + 1
> 
> And as PAGE_FRAG_CACHE_MAX_SIZE = 32K, which can be fitted into 15 bits
> if we use size_mask instead of size.
> 
> Does it make sense to use below, so that we only need to use bitfield
> for SIZE < PAGE_FRAG_CACHE_MAX_SIZE in 32 bits system? And 'struct
> page_frag' is using a similar '(BITS_PER_LONG > 32)' checking trick.
> 
> struct page_frag_cache {
> 	/* page address and offset */
> 	void *va;
> 
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
> 	u16 pagecnt_bias;
> 	u16 size_mask:15;
> 	u16 pfmemalloc:1;
> #else
> 	u32 pagecnt_bias;
> 	u16 size_mask;
> 	u16 pfmemalloc;
> #endif
> };
> 

After considering a few different layouts for 'struct page_frag_cache',
it seems the below is more optimized:

struct page_frag_cache {
	/* page address & pfmemalloc & order */
	void *va;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
	u16 pagecnt_bias;
	u16 size;
#else
	u32 pagecnt_bias;
	u32 size;
#endif
}

The lower bits of 'va' is or'ed with the page order & pfmemalloc instead
of offset or pagecnt_bias, so that we don't have to add more checking
for handling the problem of not having enough space for offset or
pagecnt_bias for PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE and 32 bits system.
And page address & pfmemalloc & order is unchanged for the same page
in the same 'page_frag_cache' instance, it makes sense to fit them
together.

Also, it seems it is better to replace 'offset' with 'size', which indicates
the remaining size for the cache in a 'page_frag_cache' instance, and we
might be able to do a single 'size >= fragsz' checking for the case of cache
being enough, which should be the fast path if we ensure size is zero when
'va' == NULL.

Something like below:

#define PAGE_FRAG_CACHE_ORDER_MASK	GENMASK(1, 0)
#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT	BIT(2)

struct page_frag_cache {
	/* page address & pfmemalloc & order */
	void *va;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
	u16 pagecnt_bias;
	u16 size;
#else
	u32 pagecnt_bias;
	u32 size;
#endif
};


static void *__page_frag_cache_refill(struct page_frag_cache *nc,
				      unsigned int fragsz, gfp_t gfp_mask,
				      unsigned int align_mask)
{
	gfp_t gfp = gfp_mask;
	struct page *page;
	void *va;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	/* Ensure free_unref_page() can be used to free the page fragment */
	BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_ALLOC_COSTLY_ORDER);

	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
				PAGE_FRAG_CACHE_MAX_ORDER);
	if (likely(page)) {
		nc->size = PAGE_FRAG_CACHE_MAX_SIZE - fragsz;
		va = page_address(page);
		nc->va = (void *)((unsigned long)va |
				  PAGE_FRAG_CACHE_MAX_ORDER |
				  (page_is_pfmemalloc(page) ?
				   PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0));
		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE;
		return va;
	}
#endif
	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (likely(page)) {
		nc->size = PAGE_SIZE - fragsz;
		va = page_address(page);
		nc->va = (void *)((unsigned long)va |
				  (page_is_pfmemalloc(page) ?
				   PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0));
		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE;
		return va;
	}

	nc->va = NULL;
	nc->size = 0;
	return NULL;
}

void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
				 unsigned int fragsz, gfp_t gfp_mask,
				 unsigned int align_mask)
{
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	unsigned long page_order;
#endif
	unsigned long page_size;
	unsigned long size;
	struct page *page;
	void *va;

	size = nc->size & align_mask;
	va = nc->va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	page_order = (unsigned long)va & PAGE_FRAG_CACHE_ORDER_MASK;
	page_size = PAGE_SIZE << page_order;
#else
	page_size = PAGE_SIZE;
#endif

	if (unlikely(fragsz > size)) {
		if (unlikely(!va))
			return __page_frag_cache_refill(nc, fragsz, gfp_mask,
							align_mask);

		/* fragsz is not supposed to be bigger than PAGE_SIZE as we are
		 * allowing order 3 page allocation to fail easily under low
		 * memory condition.
		 */
		if (WARN_ON_ONCE(fragsz > PAGE_SIZE))
			return NULL;

		page = virt_to_page(va);
		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
			return __page_frag_cache_refill(nc, fragsz, gfp_mask,
							align_mask);

		if (unlikely((unsigned long)va &
			     PAGE_FRAG_CACHE_PFMEMALLOC_BIT)) {
			free_unref_page(page, compound_order(page));
			return __page_frag_cache_refill(nc, fragsz, gfp_mask,
							align_mask);
		}

		/* OK, page count is 0, we can safely set it */
		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);

		/* reset page count bias and offset to start of new frag */
		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
		size = page_size;
	}

	va = (void *)((unsigned long)va & PAGE_MASK);
	va = va + (page_size - size);
	nc->size = size - fragsz;
	nc->pagecnt_bias--;

	return va;
}
EXPORT_SYMBOL(__page_frag_alloc_va_align);
Alexander Duyck April 29, 2024, 2:49 p.m. UTC | #6
On Fri, Apr 26, 2024 at 2:38 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/18 17:39, Yunsheng Lin wrote:
>
> ...
>
> >
> >> combining the pagecnt_bias with the va. I'm wondering if it wouldn't
> >> make more sense to look at putting together the structure something
> >> like:
> >>
> >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >> typedef u16 page_frag_bias_t;
> >> #else
> >> typedef u32 page_frag_bias_t;
> >> #endif
> >>
> >> struct page_frag_cache {
> >>      /* page address and offset */
> >>      void *va;
> >
> > Generally I am agreed with combining the virtual address with the
> > offset for the reason you mentioned below.
> >
> >>      page_frag_bias_t pagecnt_bias;
> >>      u8 pfmemalloc;
> >>      u8 page_frag_order;
> >> }
> >
> > The issue with the 'page_frag_order' I see is that we might need to do
> > a 'PAGE << page_frag_order' to get actual size, and we might also need
> > to do 'size - 1' to get the size_mask if we want to mask out the offset
> > from the 'va'.
> >
> > For page_frag_order, we need to:
> > size = PAGE << page_frag_order
> > size_mask = size - 1
> >
> > For size_mask, it seem we only need to do:
> > size = size_mask + 1
> >
> > And as PAGE_FRAG_CACHE_MAX_SIZE = 32K, which can be fitted into 15 bits
> > if we use size_mask instead of size.
> >
> > Does it make sense to use below, so that we only need to use bitfield
> > for SIZE < PAGE_FRAG_CACHE_MAX_SIZE in 32 bits system? And 'struct
> > page_frag' is using a similar '(BITS_PER_LONG > 32)' checking trick.
> >
> > struct page_frag_cache {
> >       /* page address and offset */
> >       void *va;
> >
> > #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
> >       u16 pagecnt_bias;
> >       u16 size_mask:15;
> >       u16 pfmemalloc:1;
> > #else
> >       u32 pagecnt_bias;
> >       u16 size_mask;
> >       u16 pfmemalloc;
> > #endif
> > };
> >
>
> After considering a few different layouts for 'struct page_frag_cache',
> it seems the below is more optimized:
>
> struct page_frag_cache {
>         /* page address & pfmemalloc & order */
>         void *va;

I see. So basically just pack the much smaller bitfields in here.

>
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>         u16 pagecnt_bias;
>         u16 size;
> #else
>         u32 pagecnt_bias;
>         u32 size;
> #endif
> }
>
> The lower bits of 'va' is or'ed with the page order & pfmemalloc instead
> of offset or pagecnt_bias, so that we don't have to add more checking
> for handling the problem of not having enough space for offset or
> pagecnt_bias for PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE and 32 bits system.
> And page address & pfmemalloc & order is unchanged for the same page
> in the same 'page_frag_cache' instance, it makes sense to fit them
> together.
>
> Also, it seems it is better to replace 'offset' with 'size', which indicates
> the remaining size for the cache in a 'page_frag_cache' instance, and we
> might be able to do a single 'size >= fragsz' checking for the case of cache
> being enough, which should be the fast path if we ensure size is zoro when
> 'va' == NULL.

I'm not sure the rename to size is called for as it is going to be
confusing. Maybe something like "remaining"?

> Something like below:
>
> #define PAGE_FRAG_CACHE_ORDER_MASK      GENMASK(1, 0)
> #define PAGE_FRAG_CACHE_PFMEMALLOC_BIT  BIT(2)

The only downside is that it is ossifying things so that we can only
ever do order 3 as the maximum cache size. It might be better to do a
full 8 bytes as on x86 it would just mean accessing the low end of a
16b register. Then you can have pfmemalloc at bit 8.

> struct page_frag_cache {
>         /* page address & pfmemalloc & order */
>         void *va;
>

When you start combining things like this we normally would convert va
to an unsigned long.

> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>         u16 pagecnt_bias;
>         u16 size;
> #else
>         u32 pagecnt_bias;
>         u32 size;
> #endif
> };
>
>
> static void *__page_frag_cache_refill(struct page_frag_cache *nc,
>                                       unsigned int fragsz, gfp_t gfp_mask,
>                                       unsigned int align_mask)
> {
>         gfp_t gfp = gfp_mask;
>         struct page *page;
>         void *va;
>
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>         /* Ensure free_unref_page() can be used to free the page fragment */
>         BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_ALLOC_COSTLY_ORDER);
>
>         gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>                    __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>                                 PAGE_FRAG_CACHE_MAX_ORDER);
>         if (likely(page)) {
>                 nc->size = PAGE_FRAG_CACHE_MAX_SIZE - fragsz;

I wouldn't pass fragsz here. Ideally we keep this from having to get
pulled directly into the allocator and can instead treat this as a
pristine page. We can do the subtraction further down in the actual
frag alloc call.

>                 va = page_address(page);
>                 nc->va = (void *)((unsigned long)va |
>                                   PAGE_FRAG_CACHE_MAX_ORDER |
>                                   (page_is_pfmemalloc(page) ?
>                                    PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0));
>                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE;
>                 return va;
>         }
> #endif
>         page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>         if (likely(page)) {
>                 nc->size = PAGE_SIZE - fragsz;
>                 va = page_address(page);
>                 nc->va = (void *)((unsigned long)va |
>                                   (page_is_pfmemalloc(page) ?
>                                    PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0));
>                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE;
>                 return va;
>         }
>
>         nc->va = NULL;
>         nc->size = 0;
>         return NULL;
> }
>
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                  unsigned int fragsz, gfp_t gfp_mask,
>                                  unsigned int align_mask)
> {
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>         unsigned long page_order;
> #endif
>         unsigned long page_size;
>         unsigned long size;
>         struct page *page;
>         void *va;
>
>         size = nc->size & align_mask;
>         va = nc->va;
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>         page_order = (unsigned long)va & PAGE_FRAG_CACHE_ORDER_MASK;
>         page_size = PAGE_SIZE << page_order;
> #else
>         page_size = PAGE_SIZE;
> #endif

So I notice you got rid of the loops within the function. One of the
reasons for structuring it the way it was is to enable better code
caching. By unfolding the loops you are increasing the number of
instructions that have to be fetched and processed in order to
allocate the buffers.

>
>         if (unlikely(fragsz > size)) {
>                 if (unlikely(!va))
>                         return __page_frag_cache_refill(nc, fragsz, gfp_mask,
>                                                         align_mask);
>
>                 /* fragsz is not supposed to be bigger than PAGE_SIZE as we are
>                  * allowing order 3 page allocation to fail easily under low
>                  * memory condition.
>                  */
>                 if (WARN_ON_ONCE(fragsz > PAGE_SIZE))
>                         return NULL;
>
>                 page = virt_to_page(va);
>                 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>                         return __page_frag_cache_refill(nc, fragsz, gfp_mask,
>                                                         align_mask);
>
>                 if (unlikely((unsigned long)va &
>                              PAGE_FRAG_CACHE_PFMEMALLOC_BIT)) {
>                         free_unref_page(page, compound_order(page));
>                         return __page_frag_cache_refill(nc, fragsz, gfp_mask,
>                                                         align_mask);
>                 }
>
>                 /* OK, page count is 0, we can safely set it */
>                 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>
>                 /* reset page count bias and offset to start of new frag */
>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>                 size = page_size;
>         }
>
>         va = (void *)((unsigned long)va & PAGE_MASK);
>         va = va + (page_size - size);
>         nc->size = size - fragsz;
>         nc->pagecnt_bias--;
>
>         return va;
> }
> EXPORT_SYMBOL(__page_frag_alloc_va_align);
>
Yunsheng Lin April 30, 2024, 12:05 p.m. UTC | #7
On 2024/4/29 22:49, Alexander Duyck wrote:

...

>>>
>>
>> After considering a few different layouts for 'struct page_frag_cache',
>> it seems the below is more optimized:
>>
>> struct page_frag_cache {
>>         /* page address & pfmemalloc & order */
>>         void *va;
> 
> I see. So basically just pack the much smaller bitfields in here.
> 
>>
>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>>         u16 pagecnt_bias;
>>         u16 size;
>> #else
>>         u32 pagecnt_bias;
>>         u32 size;
>> #endif
>> }
>>
>> The lower bits of 'va' is or'ed with the page order & pfmemalloc instead
>> of offset or pagecnt_bias, so that we don't have to add more checking
>> for handling the problem of not having enough space for offset or
>> pagecnt_bias for PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE and 32 bits system.
>> And page address & pfmemalloc & order is unchanged for the same page
>> in the same 'page_frag_cache' instance, it makes sense to fit them
>> together.
>>
>> Also, it seems it is better to replace 'offset' with 'size', which indicates
>> the remaining size for the cache in a 'page_frag_cache' instance, and we
>> might be able to do a single 'size >= fragsz' checking for the case of cache
>> being enough, which should be the fast path if we ensure size is zoro when
>> 'va' == NULL.
> 
> I'm not sure the rename to size is called for as it is going to be
> confusing. Maybe something like "remaining"?

Yes, using 'size' for that is a bit confusing.
Besides the above 'remaining', by googling, it seems we may have the below
options too:
'residual', 'unused', 'surplus'

> 
>> Something like below:
>>
>> #define PAGE_FRAG_CACHE_ORDER_MASK      GENMASK(1, 0)
>> #define PAGE_FRAG_CACHE_PFMEMALLOC_BIT  BIT(2)
> 
> The only downside is that it is ossifying things so that we can only

There are 12 bits that are always usable; we can always extend ORDER_MASK
to more bits if a larger order number is needed.

> ever do order 3 as the maximum cache size. It might be better to do a
> full 8 bytes as on x86 it would just mean accessing the low end of a
> 16b register. Then you can have pfmemalloc at bit 8.

I am not sure I understand the above, as it seems we have the below options:
1. Use somthing like the above ORDER_MASK and PFMEMALLOC_BIT.
2. Use bitfield as something like below:

unsigned long va:20;---or 52 for 64bit system
unsigned long pfmemalloc:1
unsigned long order:11;

Or is there a better idea in your mind?

> 
>> struct page_frag_cache {
>>         /* page address & pfmemalloc & order */
>>         void *va;
>>
> 
> When you start combining things like this we normally would convert va
> to an unsigned long.

Ack.

> 
>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>>         u16 pagecnt_bias;
>>         u16 size;
>> #else
>>         u32 pagecnt_bias;
>>         u32 size;
>> #endif
>> };
>>
>>
>> static void *__page_frag_cache_refill(struct page_frag_cache *nc,
>>                                       unsigned int fragsz, gfp_t gfp_mask,
>>                                       unsigned int align_mask)
>> {
>>         gfp_t gfp = gfp_mask;
>>         struct page *page;
>>         void *va;
>>
>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>         /* Ensure free_unref_page() can be used to free the page fragment */
>>         BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_ALLOC_COSTLY_ORDER);
>>
>>         gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>>                    __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>>                                 PAGE_FRAG_CACHE_MAX_ORDER);
>>         if (likely(page)) {
>>                 nc->size = PAGE_FRAG_CACHE_MAX_SIZE - fragsz;
> 
> I wouldn't pass fragsz here. Ideally we keep this from having to get
> pulled directly into the allocator and can instead treat this as a
> pristine page. We can do the subtraction further down in the actual
> frag alloc call.

Yes, from the maintainability point of view.
But for performance point of view, doesn't it make sense to do the
subtraction here, as doing the subtraction in the actual frag alloc
call may involve more load/store operation to do the subtraction?

> 
>>                 va = page_address(page);
>>                 nc->va = (void *)((unsigned long)va |
>>                                   PAGE_FRAG_CACHE_MAX_ORDER |
>>                                   (page_is_pfmemalloc(page) ?
>>                                    PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0));
>>                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE;
>>                 return va;
>>         }
>> #endif
>>         page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>>         if (likely(page)) {
>>                 nc->size = PAGE_SIZE - fragsz;
>>                 va = page_address(page);
>>                 nc->va = (void *)((unsigned long)va |
>>                                   (page_is_pfmemalloc(page) ?
>>                                    PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0));
>>                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE;
>>                 return va;
>>         }
>>
>>         nc->va = NULL;
>>         nc->size = 0;
>>         return NULL;
>> }
>>
>> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>>                                  unsigned int fragsz, gfp_t gfp_mask,
>>                                  unsigned int align_mask)
>> {
>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>         unsigned long page_order;
>> #endif
>>         unsigned long page_size;
>>         unsigned long size;
>>         struct page *page;
>>         void *va;
>>
>>         size = nc->size & align_mask;
>>         va = nc->va;
>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>         page_order = (unsigned long)va & PAGE_FRAG_CACHE_ORDER_MASK;
>>         page_size = PAGE_SIZE << page_order;
>> #else
>>         page_size = PAGE_SIZE;
>> #endif
> 
> So I notice you got rid of the loops within the function. One of the
> reasons for structuring it the way it was is to enable better code
> caching. By unfolding the loops you are increasing the number of
> instructions that have to be fetched and processed in order to
> allocate the buffers.

I am not sure I understand what does 'the loops' means here, as there
is not 'while' or 'for' here. I suppose you are referring to the 'goto'
here?

>
Alexander Duyck April 30, 2024, 2:54 p.m. UTC | #8
On Tue, Apr 30, 2024 at 5:06 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/29 22:49, Alexander Duyck wrote:
>
> ...
>
> >>>
> >>
> >> After considering a few different layouts for 'struct page_frag_cache',
> >> it seems the below is more optimized:
> >>
> >> struct page_frag_cache {
> >>         /* page address & pfmemalloc & order */
> >>         void *va;
> >
> > I see. So basically just pack the much smaller bitfields in here.
> >
> >>
> >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
> >>         u16 pagecnt_bias;
> >>         u16 size;
> >> #else
> >>         u32 pagecnt_bias;
> >>         u32 size;
> >> #endif
> >> }
> >>
> >> The lower bits of 'va' is or'ed with the page order & pfmemalloc instead
> >> of offset or pagecnt_bias, so that we don't have to add more checking
> >> for handling the problem of not having enough space for offset or
> >> pagecnt_bias for PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE and 32 bits system.
> >> And page address & pfmemalloc & order is unchanged for the same page
> >> in the same 'page_frag_cache' instance, it makes sense to fit them
> >> together.
> >>
> >> Also, it seems it is better to replace 'offset' with 'size', which indicates
> >> the remaining size for the cache in a 'page_frag_cache' instance, and we
> >> might be able to do a single 'size >= fragsz' checking for the case of cache
> >> being enough, which should be the fast path if we ensure size is zero when
> >> 'va' == NULL.
> >
> > I'm not sure the rename to size is called for as it is going to be
> > confusing. Maybe something like "remaining"?
>
> Yes, using 'size' for that is a bit confusing.
> Beside the above 'remaining', by googling, it seems we may have below
> options too:
> 'residual','unused', 'surplus'
>
> >
> >> Something like below:
> >>
> >> #define PAGE_FRAG_CACHE_ORDER_MASK      GENMASK(1, 0)
> >> #define PAGE_FRAG_CACHE_PFMEMALLOC_BIT  BIT(2)
> >
> > The only downside is that it is ossifying things so that we can only
>
> There are 12 bits that are always useful, we can always extend ORDER_MASK
> to more bits if a larger order number is needed.
>
> > ever do order 3 as the maximum cache size. It might be better to do a
> > full 8 bytes as on x86 it would just mean accessing the low end of a
> > 16b register. Then you can have pfmemalloc at bit 8.
>
> I am not sure I understand the above as it seems we may have below option:
> 1. Use something like the above ORDER_MASK and PFMEMALLOC_BIT.
> 2. Use bitfield as something like below:
>
> unsigned long va:20;---or 52 for 64bit system
> unsigned long pfmemalloc:1
> unsigned long order:11;
>
> Or is there a better idea in your mind?

All I was suggesting was to make the ORDER_MASK 8 bits. Doing that the
compiler can just store VA in a register such as RCX and instead of
having to bother with a mask it could then just use CL to access the
order. As far as the flag bits such as pfmemalloc the 4 bits starting
at 8 would make the most sense since it doesn't naturally align to
anything and would have to be masked anyway.

Using a bitfield like you suggest would be problematic as the compiler
would assume a shift is needed so you would have to add a shift to
your code to offset it for accessing VA.

> >
> >> struct page_frag_cache {
> >>         /* page address & pfmemalloc & order */
> >>         void *va;
> >>
> >
> > When you start combining things like this we normally would convert va
> > to an unsigned long.
>
> Ack.
>
> >
> >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
> >>         u16 pagecnt_bias;
> >>         u16 size;
> >> #else
> >>         u32 pagecnt_bias;
> >>         u32 size;
> >> #endif
> >> };
> >>
> >>
> >> static void *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>                                       unsigned int fragsz, gfp_t gfp_mask,
> >>                                       unsigned int align_mask)
> >> {
> >>         gfp_t gfp = gfp_mask;
> >>         struct page *page;
> >>         void *va;
> >>
> >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >>         /* Ensure free_unref_page() can be used to free the page fragment */
> >>         BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_ALLOC_COSTLY_ORDER);
> >>
> >>         gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
> >>                    __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> >>         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
> >>                                 PAGE_FRAG_CACHE_MAX_ORDER);
> >>         if (likely(page)) {
> >>                 nc->size = PAGE_FRAG_CACHE_MAX_SIZE - fragsz;
> >
> > I wouldn't pass fragsz here. Ideally we keep this from having to get
> > pulled directly into the allocator and can instead treat this as a
> > pristine page. We can do the subtraction further down in the actual
> > frag alloc call.
>
> Yes, from the maintainability point of view.
> But for performance point of view, doesn't it make sense to do the
> subtraction here, as doing the subtraction in the actual frag alloc
> call may involve more load/store operation to do the subtraction?

It just means more code paths doing different things. It doesn't add
much here since what you are doing is juggling more variables in this
function as a result of this.

> >
> >>                 va = page_address(page);
> >>                 nc->va = (void *)((unsigned long)va |
> >>                                   PAGE_FRAG_CACHE_MAX_ORDER |
> >>                                   (page_is_pfmemalloc(page) ?
> >>                                    PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0));
> >>                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> >>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE;
> >>                 return va;
> >>         }
> >> #endif
> >>         page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> >>         if (likely(page)) {
> >>                 nc->size = PAGE_SIZE - fragsz;
> >>                 va = page_address(page);
> >>                 nc->va = (void *)((unsigned long)va |
> >>                                   (page_is_pfmemalloc(page) ?
> >>                                    PAGE_FRAG_CACHE_PFMEMALLOC_BIT : 0));
> >>                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> >>                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE;
> >>                 return va;
> >>         }
> >>
> >>         nc->va = NULL;
> >>         nc->size = 0;
> >>         return NULL;
> >> }
> >>
> >> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
> >>                                  unsigned int fragsz, gfp_t gfp_mask,
> >>                                  unsigned int align_mask)
> >> {
> >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >>         unsigned long page_order;
> >> #endif
> >>         unsigned long page_size;
> >>         unsigned long size;
> >>         struct page *page;
> >>         void *va;
> >>
> >>         size = nc->size & align_mask;
> >>         va = nc->va;
> >> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >>         page_order = (unsigned long)va & PAGE_FRAG_CACHE_ORDER_MASK;
> >>         page_size = PAGE_SIZE << page_order;
> >> #else
> >>         page_size = PAGE_SIZE;
> >> #endif
> >
> > So I notice you got rid of the loops within the function. One of the
> > reasons for structuring it the way it was is to enable better code
> > caching. By unfolding the loops you are increasing the number of
> > instructions that have to be fetched and processed in order to
> > allocate the buffers.
>
> I am not sure I understand what does 'the loops' means here, as there
> is not 'while' or 'for' here. I suppose you are referring to the 'goto'
> here?

So there was logic before that would jump to a label back at the start
of the function. It seems like you got rid of that logic and just
flattened everything out. This is likely resulting in some duplication
in the code and overall an increase in the number of instructions that
need to be fetched to allocate the fragment. As I recall one of the
reasons things were folded up the way they were was to allow it to use
a short jump instruction instead of a longer one. I suspect we may be
losing that with these changes.
Yunsheng Lin May 6, 2024, 12:33 p.m. UTC | #9
On 2024/4/30 22:54, Alexander Duyck wrote:
> On Tue, Apr 30, 2024 at 5:06 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/4/29 22:49, Alexander Duyck wrote:
>>
>> ...
>>
>>>>>
>>>>
>>>> After considering a few different layouts for 'struct page_frag_cache',
>>>> it seems the below is more optimized:
>>>>
>>>> struct page_frag_cache {
>>>>         /* page address & pfmemalloc & order */
>>>>         void *va;
>>>
>>> I see. So basically just pack the much smaller bitfields in here.
>>>
>>>>
>>>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>>>>         u16 pagecnt_bias;
>>>>         u16 size;
>>>> #else
>>>>         u32 pagecnt_bias;
>>>>         u32 size;
>>>> #endif
>>>> }
>>>>
>>>> The lower bits of 'va' is or'ed with the page order & pfmemalloc instead
>>>> of offset or pagecnt_bias, so that we don't have to add more checking
>>>> for handling the problem of not having enough space for offset or
>>>> pagecnt_bias for PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE and 32 bits system.
>>>> And page address & pfmemalloc & order is unchanged for the same page
>>>> in the same 'page_frag_cache' instance, it makes sense to fit them
>>>> together.
>>>>
>>>> Also, it seems it is better to replace 'offset' with 'size', which indicates
>>>> the remaining size for the cache in a 'page_frag_cache' instance, and we
>>>> might be able to do a single 'size >= fragsz' checking for the case of cache
>>>> being enough, which should be the fast path if we ensure size is zero when
>>>> 'va' == NULL.
>>>
>>> I'm not sure the rename to size is called for as it is going to be
>>> confusing. Maybe something like "remaining"?
>>
>> Yes, using 'size' for that is a bit confusing.
>> Beside the above 'remaining', by googling, it seems we may have below
>> options too:
>> 'residual','unused', 'surplus'
>>
>>>
>>>> Something like below:
>>>>
>>>> #define PAGE_FRAG_CACHE_ORDER_MASK      GENMASK(1, 0)
>>>> #define PAGE_FRAG_CACHE_PFMEMALLOC_BIT  BIT(2)
>>>
>>> The only downside is that it is ossifying things so that we can only
>>
>> There are 12 bits that are always useful, we can always extend ORDER_MASK
>> to more bits if a larger order number is needed.
>>
>>> ever do order 3 as the maximum cache size. It might be better to do a
>>> full 8 bytes as on x86 it would just mean accessing the low end of a
>>> 16b register. Then you can have pfmemalloc at bit 8.
>>
>> I am not sure I understand the above as it seems we may have below option:
>> 1. Use something like the above ORDER_MASK and PFMEMALLOC_BIT.
>> 2. Use bitfield as something like below:
>>
>> unsigned long va:20;---or 52 for 64bit system
>> unsigned long pfmemalloc:1
>> unsigned long order:11;
>>
>> Or is there a better idea in your mind?
> 
> All I was suggesting was to make the ORDER_MASK 8 bits. Doing that the
> compiler can just store VA in a register such as RCX and instead of
> having to bother with a mask it could then just use CL to access the
> order. As far as the flag bits such as pfmemalloc the 4 bits starting
> at 8 would make the most sense since it doesn't naturally align to
> anything and would have to be masked anyway.

Ok.

> 
> Using a bitfield like you suggest would be problematic as the compiler
> would assume a shift is needed so you would have to add a shift to
> your code to offset it for accessing VA.
> 
>>>
>>>> struct page_frag_cache {
>>>>         /* page address & pfmemalloc & order */
>>>>         void *va;
>>>>
>>>
>>> When you start combining things like this we normally would convert va
>>> to an unsigned long.
>>
>> Ack.

It seems it makes more sense to convert va to something like 'struct encoded_va'
mirroring 'struct encoded_page' in below:

https://elixir.bootlin.com/linux/v6.7-rc8/source/include/linux/mm_types.h#L222

>>
>>>
>>>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
>>>>         u16 pagecnt_bias;
>>>>         u16 size;
>>>> #else
>>>>         u32 pagecnt_bias;
>>>>         u32 size;
>>>> #endif
>>>> };
>>>>
>>>>
diff mbox series

Patch

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index fe5faa80b6c3..40a7d6da9ef0 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -12,15 +12,16 @@  struct page_frag_cache {
 	void *va;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	__u16 offset;
-	__u16 size;
+	__u16 size_mask:15;
+	__u16 pfmemalloc:1;
 #else
-	__u32 offset;
+	__u32 offset:31;
+	__u32 pfmemalloc:1;
 #endif
 	/* we maintain a pagecount bias, so that we dont dirty cache line
 	 * containing page->_refcount every time we allocate a fragment.
 	 */
 	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
 };
 
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
@@ -43,7 +44,9 @@  static inline void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 					       gfp_t gfp_mask,
 					       unsigned int align)
 {
-	nc->offset = ALIGN(nc->offset, align);
+	unsigned int offset = nc->offset;
+
+	nc->offset = ALIGN(offset, align);
 
 	return page_frag_alloc_va(nc, fragsz, gfp_mask);
 }
@@ -53,7 +56,7 @@  static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
 					     gfp_t gfp_mask,
 					     unsigned int align)
 {
-	WARN_ON_ONCE(!is_power_of_2(align));
+	WARN_ON_ONCE(!is_power_of_2(align) || align >= PAGE_SIZE);
 
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, align);
 }
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 50511d8522d0..8d93029116e1 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -32,7 +32,8 @@  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
 	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
 				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+	nc->size_mask = page ? PAGE_FRAG_CACHE_MAX_SIZE - 1 : PAGE_SIZE - 1;
+	VM_BUG_ON(page && nc->size_mask != PAGE_FRAG_CACHE_MAX_SIZE - 1);
 #endif
 	if (unlikely(!page))
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
@@ -86,7 +87,7 @@  void *page_frag_alloc_va(struct page_frag_cache *nc, unsigned int fragsz,
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	/* if size can vary use size else just use PAGE_SIZE */
-	size = nc->size;
+	size = nc->size_mask + 1;
 #else
 	size = PAGE_SIZE;
 #endif