
[RFC,v11,08/14] mm: page_frag: some minor refactoring before adding new API

Message ID 20240719093338.55117-9-linyunsheng@huawei.com (mailing list archive)
State Superseded
Series Replace page_frag with page_frag_cache for sk_page_frag()

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 712 this patch: 712
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 3 of 3 maintainers
netdev/build_clang success Errors and warnings before: 774 this patch: 774
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5739 this patch: 5739
netdev/checkpatch warning WARNING: Do not crash the kernel unless it is absolutely unavoidable--use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yunsheng Lin July 19, 2024, 9:33 a.m. UTC
Refactor the common code from __page_frag_alloc_va_align()
into __page_frag_cache_refill(), so that the new API can
make use of it.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h |  2 +-
 mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
 2 files changed, 49 insertions(+), 46 deletions(-)

Comments

Alexander Duyck July 21, 2024, 11:40 p.m. UTC | #1
On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
> Refactor common codes from __page_frag_alloc_va_align()
> to __page_frag_cache_refill(), so that the new API can
> make use of them.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h |  2 +-
>  mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
>  2 files changed, 49 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 12a16f8e8ad0..5aa45de7a9a5 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>  
>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>  {
> -	nc->encoded_va = 0;
> +	memset(nc, 0, sizeof(*nc));
>  }
>  

I do not like requiring the entire structure to be reset as a part of
init. If encoded_va is 0 then we have reset the page and the flags.
There shouldn't be anything else we need to reset as remaining and bias
will be reset when we reallocate.

>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 7928e5d50711..d9c9cad17af7 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -19,6 +19,28 @@
>  #include <linux/page_frag_cache.h>
>  #include "internal.h"
>  
> +static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
> +{
> +	unsigned long encoded_va = nc->encoded_va;
> +	struct page *page;
> +
> +	page = virt_to_page((void *)encoded_va);
> +	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> +		return NULL;
> +
> +	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> +		VM_BUG_ON(compound_order(page) !=
> +			  encoded_page_order(encoded_va));
> +		free_unref_page(page, encoded_page_order(encoded_va));
> +		return NULL;
> +	}
> +
> +	/* OK, page count is 0, we can safely set it */
> +	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> +
> +	return page;
> +}
> +
>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  					     gfp_t gfp_mask)
>  {
> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	struct page *page = NULL;
>  	gfp_t gfp = gfp_mask;
>  
> +	if (likely(nc->encoded_va)) {
> +		page = __page_frag_cache_recharge(nc);
> +		if (page) {
> +			order = encoded_page_order(nc->encoded_va);
> +			goto out;
> +		}
> +	}
> +

This code has no business here. This is refill, you just dropped
recharge in here which will make a complete mess of the ordering and be
confusing to say the least.

The expectation was that if we are calling this function it is going to
overwrite the virtual address to NULL on failure so we discard the old
page if there is one present. This changes that behaviour. What you
effectively did is make __page_frag_cache_refill into the recharge
function.

>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> @@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	if (unlikely(!page)) {
>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>  		if (unlikely(!page)) {
> -			nc->encoded_va = 0;
> +			memset(nc, 0, sizeof(*nc));
>  			return NULL;
>  		}
>  

The memset will take a few more instructions than the existing code
did. I would prefer to keep this as is if at all possible.
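
For illustration only, here is a small stand-alone sketch contrasting the two
reset styles under discussion; the layout below is a hypothetical stand-in for
struct page_frag_cache, not the actual definition:

#include <string.h>

/* Hypothetical layout for illustration; the real struct page_frag_cache in
 * this series carries encoded_va plus remaining/pagecnt_bias fields and may
 * be laid out differently.
 */
struct pfc_example {
	unsigned long encoded_va;	/* encoded page address, order, pfmemalloc bit */
	unsigned int remaining;		/* bytes left in the current page */
	unsigned int pagecnt_bias;	/* references held on behalf of the cache */
};

/* Reset only the field the fast path checks; remaining and pagecnt_bias are
 * re-initialized on the next refill.  Typically a single 8-byte store.
 */
static inline void pfc_reset_minimal(struct pfc_example *nc)
{
	nc->encoded_va = 0;
}

/* Clear the whole structure so later paths need no "is this initialized"
 * check.  On a 16-byte struct this is roughly two 8-byte stores on x86-64,
 * or a single stp on arm64.
 */
static inline void pfc_reset_full(struct pfc_example *nc)
{
	memset(nc, 0, sizeof(*nc));
}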

> @@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	nc->encoded_va = encode_aligned_va(page_address(page), order,
>  					   page_is_pfmemalloc(page));
>  
> +	/* Even if we own the page, we do not use atomic_set().
> +	 * This would break get_page_unless_zero() users.
> +	 */
> +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> +
> +out:
> +	/* reset page count bias and remaining to start of new frag */
> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +	nc->remaining = PAGE_SIZE << order;
> +
>  	return page;
>  }
>  

Why bother returning a page at all? It doesn't seem like you use it
anymore. It looks like the use cases you have for it in patches 11/12
all appear to be broken from what I can tell, as you are adding page as
a variable when we don't need to be passing internal details to the
callers of the function when just a simple error return code would do.

> @@ -55,7 +95,7 @@ void page_frag_cache_drain(struct page_frag_cache *nc)
>  
>  	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
>  				nc->pagecnt_bias);
> -	nc->encoded_va = 0;
> +	memset(nc, 0, sizeof(*nc));
>  }
>  EXPORT_SYMBOL(page_frag_cache_drain);
>  
> @@ -72,31 +112,9 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int fragsz, gfp_t gfp_mask,
>  				 unsigned int align_mask)
>  {
> -	unsigned long encoded_va = nc->encoded_va;
> -	unsigned int size, remaining;
> -	struct page *page;
> -
> -	if (unlikely(!encoded_va)) {
> -refill:
> -		page = __page_frag_cache_refill(nc, gfp_mask);
> -		if (!page)
> -			return NULL;
> -
> -		encoded_va = nc->encoded_va;
> -		size = page_frag_cache_page_size(encoded_va);
> +	unsigned int size = page_frag_cache_page_size(nc->encoded_va);
> +	unsigned int remaining = nc->remaining & align_mask;
>  
> -		/* Even if we own the page, we do not use atomic_set().
> -		 * This would break get_page_unless_zero() users.
> -		 */
> -		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> -
> -		/* reset page count bias and remaining to start of new frag */
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->remaining = size;
> -	}
> -
> -	size = page_frag_cache_page_size(encoded_va);
> -	remaining = nc->remaining & align_mask;
>  	if (unlikely(remaining < fragsz)) {

I am not a fan of adding a dependency on remaining being set *before*
encoded_va. The fact is it relies on the size to set it. In addition
this is creating a big blob of code for the conditional paths to have
to jump over.

I think it is much better to first validate encoded_va, and then
validate remaining. Otherwise just checking remaining seems problematic
and like a recipe for NULL pointer accesses.

>  		if (unlikely(fragsz > PAGE_SIZE)) {
>  			/*
> @@ -111,32 +129,17 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  			return NULL;
>  		}
>  
> -		page = virt_to_page((void *)encoded_va);
> -
> -		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> -			goto refill;
> -
> -		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> -			VM_BUG_ON(compound_order(page) !=
> -				  encoded_page_order(encoded_va));
> -			free_unref_page(page, encoded_page_order(encoded_va));
> -			goto refill;
> -		}
> -
> -		/* OK, page count is 0, we can safely set it */
> -		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> -
> -		/* reset page count bias and remaining to start of new frag */
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->remaining = size;
> +		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> +			return NULL;
>  
> +		size = page_frag_cache_page_size(nc->encoded_va);

So this is adding yet another setting/reading of size to the recharge
path now. Previously the recharge path could just reuse the existing
size.

>  		remaining = size;
>  	}
>  
>  	nc->pagecnt_bias--;
>  	nc->remaining = remaining - fragsz;
>  
> -	return encoded_page_address(encoded_va) + (size - remaining);
> +	return encoded_page_address(nc->encoded_va) + (size - remaining);
>  }
>  EXPORT_SYMBOL(__page_frag_alloc_va_align);
>
Yunsheng Lin July 22, 2024, 12:55 p.m. UTC | #2
On 2024/7/22 7:40, Alexander H Duyck wrote:
> On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
>> Refactor common codes from __page_frag_alloc_va_align()
>> to __page_frag_cache_refill(), so that the new API can
>> make use of them.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  include/linux/page_frag_cache.h |  2 +-
>>  mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
>>  2 files changed, 49 insertions(+), 46 deletions(-)
>>
>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>> index 12a16f8e8ad0..5aa45de7a9a5 100644
>> --- a/include/linux/page_frag_cache.h
>> +++ b/include/linux/page_frag_cache.h
>> @@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>>  
>>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>>  {
>> -	nc->encoded_va = 0;
>> +	memset(nc, 0, sizeof(*nc));
>>  }
>>  
> 
> I do not like requiring the entire structure to be reset as a part of
> init. If encoded_va is 0 then we have reset the page and the flags.
> There shouldn't be anything else we need to reset as remaining and bias
> will be reset when we reallocate.

The argument is about avoiding one check in the fast path by doing the
memset in the slow path, which you might already know according to your
comment on the previous version.

It is just sometimes hard to understand your preference for maintainability
over performance here, as sometimes your comments seem to prefer performance
over maintainability, like the LEA trick you mentioned and the offset
count-down before this patchset. It would be good to be more consistent about
this, otherwise it is sometimes confusing when doing the refactoring.

> 
>>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index 7928e5d50711..d9c9cad17af7 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -19,6 +19,28 @@
>>  #include <linux/page_frag_cache.h>
>>  #include "internal.h"
>>  
>> +static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
>> +{
>> +	unsigned long encoded_va = nc->encoded_va;
>> +	struct page *page;
>> +
>> +	page = virt_to_page((void *)encoded_va);
>> +	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>> +		return NULL;
>> +
>> +	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>> +		VM_BUG_ON(compound_order(page) !=
>> +			  encoded_page_order(encoded_va));
>> +		free_unref_page(page, encoded_page_order(encoded_va));
>> +		return NULL;
>> +	}
>> +
>> +	/* OK, page count is 0, we can safely set it */
>> +	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>> +
>> +	return page;
>> +}
>> +
>>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  					     gfp_t gfp_mask)
>>  {
>> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  	struct page *page = NULL;
>>  	gfp_t gfp = gfp_mask;
>>  
>> +	if (likely(nc->encoded_va)) {
>> +		page = __page_frag_cache_recharge(nc);
>> +		if (page) {
>> +			order = encoded_page_order(nc->encoded_va);
>> +			goto out;
>> +		}
>> +	}
>> +
> 
> This code has no business here. This is refill, you just dropped
> recharge in here which will make a complete mess of the ordering and be
> confusing to say the least.
> 
> The expectation was that if we are calling this function it is going to
> overwrite the virtual address to NULL on failure so we discard the old
> page if there is one present. This changes that behaviour. What you
> effectively did is made __page_frag_cache_refill into the recharge
> function.

The idea is to reuse the below for both __page_frag_cache_refill() and
__page_frag_cache_recharge(), which seems to be about maintainability,
avoiding duplicated code. If there is a better idea to avoid that
duplicated code while keeping the old behaviour, I am happy to change
it.

	/* reset page count bias and remaining to start of new frag */
	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
	nc->remaining = PAGE_SIZE << order;

> 
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>> @@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  	if (unlikely(!page)) {
>>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>>  		if (unlikely(!page)) {
>> -			nc->encoded_va = 0;
>> +			memset(nc, 0, sizeof(*nc));
>>  			return NULL;
>>  		}
>>  
> 
> The memset will take a few more instructions than the existing code
> did. I would prefer to keep this as is if at all possible.

It will not take more instructions for arm64 as it has the 'stp' instruction
when __HAVE_ARCH_MEMSET is set.
Is there something similar for x64?

> 
>> @@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>  	nc->encoded_va = encode_aligned_va(page_address(page), order,
>>  					   page_is_pfmemalloc(page));
>>  
>> +	/* Even if we own the page, we do not use atomic_set().
>> +	 * This would break get_page_unless_zero() users.
>> +	 */
>> +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>> +
>> +out:
>> +	/* reset page count bias and remaining to start of new frag */
>> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> +	nc->remaining = PAGE_SIZE << order;
>> +
>>  	return page;
>>  }
>>  
> 
> Why bother returning a page at all? It doesn't seem like you don't use
> it anymore. It looks like the use cases you have for it in patch 11/12
> all appear to be broken from what I can tell as you are adding page as
> a variable when we don't need to be passing internal details to the
> callers of the function when just a simple error return code would do.

It would be good to be more specific about the 'broken' part here.
Alexander Duyck July 22, 2024, 3:32 p.m. UTC | #3
On Mon, Jul 22, 2024 at 5:55 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/7/22 7:40, Alexander H Duyck wrote:
> > On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
> >> Refactor common codes from __page_frag_alloc_va_align()
> >> to __page_frag_cache_refill(), so that the new API can
> >> make use of them.
> >>
> >> CC: Alexander Duyck <alexander.duyck@gmail.com>
> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> >> ---
> >>  include/linux/page_frag_cache.h |  2 +-
> >>  mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
> >>  2 files changed, 49 insertions(+), 46 deletions(-)
> >>
> >> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> >> index 12a16f8e8ad0..5aa45de7a9a5 100644
> >> --- a/include/linux/page_frag_cache.h
> >> +++ b/include/linux/page_frag_cache.h
> >> @@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
> >>
> >>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
> >>  {
> >> -    nc->encoded_va = 0;
> >> +    memset(nc, 0, sizeof(*nc));
> >>  }
> >>
> >
> > I do not like requiring the entire structure to be reset as a part of
> > init. If encoded_va is 0 then we have reset the page and the flags.
> > There shouldn't be anything else we need to reset as remaining and bias
> > will be reset when we reallocate.
>
> The argument is about aoviding one checking for fast path by doing the
> memset in the slow path, which you might already know accroding to your
> comment in previous version.
>
> It is just sometimes hard to understand your preference for maintainability
> over performance here as sometimes your comment seems to perfer performance
> over maintainability, like the LEA trick you mentioned and offset count-down
> before this patchset. It would be good to be more consistent about this,
> otherwise it is sometimes confusing when doing the refactoring.

The use of a negative offset is arguably more maintainable in my mind
rather than being a performance trick. Essentially if you use the
negative value you can just mask off the upper bits and it is the
offset in the page. As such it is actually easier for me to read
versus "remaining" which is an offset from the end of the page.
Assuming you read the offset in hex anyway.
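
For what it's worth, a tiny stand-alone demo of one reading of the
negative-offset idea; the numbers are made up and this is not the kernel code:

#include <stdio.h>

int main(void)
{
	unsigned int size = 0x8000;	/* power-of-two cache size, e.g. PAGE_FRAG_CACHE_MAX_SIZE */
	unsigned int remaining = 0x300;	/* bytes still free in the page */

	/* Keeping the count-down value in negative form means the offset
	 * within the page falls out by masking off the upper bits:
	 * (-remaining) & (size - 1) == size - remaining for any remaining
	 * in (0, size].
	 */
	unsigned int offset = (0u - remaining) & (size - 1);

	printf("offset = %#x, size - remaining = %#x\n",
	       offset, size - remaining);
	return 0;
}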

> >
> >>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
> >> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> >> index 7928e5d50711..d9c9cad17af7 100644
> >> --- a/mm/page_frag_cache.c
> >> +++ b/mm/page_frag_cache.c
> >> @@ -19,6 +19,28 @@
> >>  #include <linux/page_frag_cache.h>
> >>  #include "internal.h"
> >>
> >> +static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
> >> +{
> >> +    unsigned long encoded_va = nc->encoded_va;
> >> +    struct page *page;
> >> +
> >> +    page = virt_to_page((void *)encoded_va);
> >> +    if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> >> +            return NULL;
> >> +
> >> +    if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> >> +            VM_BUG_ON(compound_order(page) !=
> >> +                      encoded_page_order(encoded_va));
> >> +            free_unref_page(page, encoded_page_order(encoded_va));
> >> +            return NULL;
> >> +    }
> >> +
> >> +    /* OK, page count is 0, we can safely set it */
> >> +    set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> >> +
> >> +    return page;
> >> +}
> >> +
> >>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>                                           gfp_t gfp_mask)
> >>  {
> >> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>      struct page *page = NULL;
> >>      gfp_t gfp = gfp_mask;
> >>
> >> +    if (likely(nc->encoded_va)) {
> >> +            page = __page_frag_cache_recharge(nc);
> >> +            if (page) {
> >> +                    order = encoded_page_order(nc->encoded_va);
> >> +                    goto out;
> >> +            }
> >> +    }
> >> +
> >
> > This code has no business here. This is refill, you just dropped
> > recharge in here which will make a complete mess of the ordering and be
> > confusing to say the least.
> >
> > The expectation was that if we are calling this function it is going to
> > overwrite the virtual address to NULL on failure so we discard the old
> > page if there is one present. This changes that behaviour. What you
> > effectively did is made __page_frag_cache_refill into the recharge
> > function.
>
> The idea is to reuse the below for both __page_frag_cache_refill() and
> __page_frag_cache_recharge(), which seems to be about maintainability
> to not having duplicated code. If there is a better idea to avoid that
> duplicated code while keeping the old behaviour, I am happy to change
> it.
>
>         /* reset page count bias and remaining to start of new frag */
>         nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>         nc->remaining = PAGE_SIZE << order;
>

The only piece that is really reused here is the pagecnt_bias
assignment. What is obfuscated away is that the order is gotten
through one of two paths. Really, order isn't order here, it is size,
which should have been fetched already. What you end up doing with
this change is duplicating a bunch of code throughout the function.
You end up having to fetch size multiple times in multiple ways. Here
you are generating it with order, then you have to turn around and get
it again at the start of the function, and again after calling this
function in order to pull it back out.

> >
> >>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >>      gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
> >>                 __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> >> @@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>      if (unlikely(!page)) {
> >>              page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> >>              if (unlikely(!page)) {
> >> -                    nc->encoded_va = 0;
> >> +                    memset(nc, 0, sizeof(*nc));
> >>                      return NULL;
> >>              }
> >>
> >
> > The memset will take a few more instructions than the existing code
> > did. I would prefer to keep this as is if at all possible.
>
> It will not take more instructions for arm64 as it has 'stp' instruction for
> __HAVE_ARCH_MEMSET is set.
> There is something similar for x64?

The x64 does not, last I knew, without getting into the SSE/AVX type
stuff. This becomes two separate 8B store instructions.

> >
> >> @@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>      nc->encoded_va = encode_aligned_va(page_address(page), order,
> >>                                         page_is_pfmemalloc(page));
> >>
> >> +    /* Even if we own the page, we do not use atomic_set().
> >> +     * This would break get_page_unless_zero() users.
> >> +     */
> >> +    page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> >> +
> >> +out:
> >> +    /* reset page count bias and remaining to start of new frag */
> >> +    nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >> +    nc->remaining = PAGE_SIZE << order;
> >> +
> >>      return page;
> >>  }
> >>
> >
> > Why bother returning a page at all? It doesn't seem like you don't use
> > it anymore. It looks like the use cases you have for it in patch 11/12
> > all appear to be broken from what I can tell as you are adding page as
> > a variable when we don't need to be passing internal details to the
> > callers of the function when just a simple error return code would do.
>
> It would be good to be more specific about the 'broken' part here.

We are passing internals to the caller. Basically this is generally
frowned upon in many implementations, as the general idea is that the
internal page we are using should be a pseudo-private value.
I understand that you have one or two callers that need it for the use
cases you have in patches 11/12, but it also seems like you are just
passing it regardless. For example I noticed in a few cases you added
the page pointer in 12 to handle the return value, but then just used
it to check for NULL. My thought would be that rather than returning
the page here you would be better off just returning 0 or an error and
then doing the virt_to_page translation for all the cases where the
page is actually needed since you have to go that route for a cached
page anyway.
Yunsheng Lin July 23, 2024, 1:19 p.m. UTC | #4
On 2024/7/22 23:32, Alexander Duyck wrote:
> On Mon, Jul 22, 2024 at 5:55 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/7/22 7:40, Alexander H Duyck wrote:
>>> On Fri, 2024-07-19 at 17:33 +0800, Yunsheng Lin wrote:
>>>> Refactor common codes from __page_frag_alloc_va_align()
>>>> to __page_frag_cache_refill(), so that the new API can
>>>> make use of them.
>>>>
>>>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>>> ---
>>>>  include/linux/page_frag_cache.h |  2 +-
>>>>  mm/page_frag_cache.c            | 93 +++++++++++++++++----------------
>>>>  2 files changed, 49 insertions(+), 46 deletions(-)
>>>>
>>>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>>>> index 12a16f8e8ad0..5aa45de7a9a5 100644
>>>> --- a/include/linux/page_frag_cache.h
>>>> +++ b/include/linux/page_frag_cache.h
>>>> @@ -50,7 +50,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>>>>
>>>>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>>>>  {
>>>> -    nc->encoded_va = 0;
>>>> +    memset(nc, 0, sizeof(*nc));
>>>>  }
>>>>
>>>
>>> I do not like requiring the entire structure to be reset as a part of
>>> init. If encoded_va is 0 then we have reset the page and the flags.
>>> There shouldn't be anything else we need to reset as remaining and bias
>>> will be reset when we reallocate.
>>
>> The argument is about aoviding one checking for fast path by doing the
>> memset in the slow path, which you might already know accroding to your
>> comment in previous version.
>>
>> It is just sometimes hard to understand your preference for maintainability
>> over performance here as sometimes your comment seems to perfer performance
>> over maintainability, like the LEA trick you mentioned and offset count-down
>> before this patchset. It would be good to be more consistent about this,
>> otherwise it is sometimes confusing when doing the refactoring.
> 
> The use of a negative offset is arguably more maintainable in my mind
> rather than being a performance trick. Essentially if you use the
> negative value you can just mask off the upper bits and it is the
> offset in the page. As such it is actually easier for me to read
> versus "remaining" which is an offset from the end of the page.
> Assuming you read the offset in hex anyway.

Reading the above doesn't seem maintainable to me :(

> 
>>>
>>>>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>>>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>>>> index 7928e5d50711..d9c9cad17af7 100644
>>>> --- a/mm/page_frag_cache.c
>>>> +++ b/mm/page_frag_cache.c
>>>> @@ -19,6 +19,28 @@
>>>>  #include <linux/page_frag_cache.h>
>>>>  #include "internal.h"
>>>>
>>>> +static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
>>>> +{
>>>> +    unsigned long encoded_va = nc->encoded_va;
>>>> +    struct page *page;
>>>> +
>>>> +    page = virt_to_page((void *)encoded_va);
>>>> +    if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>>>> +            return NULL;
>>>> +
>>>> +    if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>>>> +            VM_BUG_ON(compound_order(page) !=
>>>> +                      encoded_page_order(encoded_va));
>>>> +            free_unref_page(page, encoded_page_order(encoded_va));
>>>> +            return NULL;
>>>> +    }
>>>> +
>>>> +    /* OK, page count is 0, we can safely set it */
>>>> +    set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>>>> +
>>>> +    return page;
>>>> +}
>>>> +
>>>>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>                                           gfp_t gfp_mask)
>>>>  {
>>>> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>      struct page *page = NULL;
>>>>      gfp_t gfp = gfp_mask;
>>>>
>>>> +    if (likely(nc->encoded_va)) {
>>>> +            page = __page_frag_cache_recharge(nc);
>>>> +            if (page) {
>>>> +                    order = encoded_page_order(nc->encoded_va);
>>>> +                    goto out;
>>>> +            }
>>>> +    }
>>>> +
>>>
>>> This code has no business here. This is refill, you just dropped
>>> recharge in here which will make a complete mess of the ordering and be
>>> confusing to say the least.
>>>
>>> The expectation was that if we are calling this function it is going to
>>> overwrite the virtual address to NULL on failure so we discard the old
>>> page if there is one present. This changes that behaviour. What you
>>> effectively did is made __page_frag_cache_refill into the recharge
>>> function.
>>
>> The idea is to reuse the below for both __page_frag_cache_refill() and
>> __page_frag_cache_recharge(), which seems to be about maintainability
>> to not having duplicated code. If there is a better idea to avoid that
>> duplicated code while keeping the old behaviour, I am happy to change
>> it.
>>
>>         /* reset page count bias and remaining to start of new frag */
>>         nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>         nc->remaining = PAGE_SIZE << order;
>>
> 
> The only piece that is really reused here is the pagecnt_bias
> assignment. What is obfuscated away is that the order is gotten
> through one of two paths. Really order isn't order here it is size.
> Which should have been fetched already. What you end up doing with
> this change is duplicating a bunch of code throughout the function.
> You end up having to fetch size multiple times multiple ways. here you
> are generating it with order. Then you have to turn around and get it
> again at the start of the function, and again after calling this
> function in order to pull it back out.

I am assuming you would like to preserve the old behavior as below?

	if(!encoded_va) {
refill:
		__page_frag_cache_refill()
	}


	if(remaining < fragsz) {
		if(!__page_frag_cache_recharge())
			goto refill;
	}

As we are adding new APIs, are we expecting the new APIs to also
duplicate the above pattern?

> 
>>>
>>>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>>>      gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>>>>                 __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>>> @@ -35,7 +65,7 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>      if (unlikely(!page)) {
>>>>              page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>>>>              if (unlikely(!page)) {
>>>> -                    nc->encoded_va = 0;
>>>> +                    memset(nc, 0, sizeof(*nc));
>>>>                      return NULL;
>>>>              }
>>>>
>>>
>>> The memset will take a few more instructions than the existing code
>>> did. I would prefer to keep this as is if at all possible.
>>
>> It will not take more instructions for arm64 as it has 'stp' instruction for
>> __HAVE_ARCH_MEMSET is set.
>> There is something similar for x64?
> 
> The x64 does not last I knew without getting into the SSE/AVX type
> stuff. This becomes two seperate 8B store instructions.

I can check later when I get hold of an x64 server.
But doesn't it make sense to have one extra 8B store instruction in the slow
path to avoid a check in the fast path?

> 
>>>
>>>> @@ -45,6 +75,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>      nc->encoded_va = encode_aligned_va(page_address(page), order,
>>>>                                         page_is_pfmemalloc(page));
>>>>
>>>> +    /* Even if we own the page, we do not use atomic_set().
>>>> +     * This would break get_page_unless_zero() users.
>>>> +     */
>>>> +    page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>>>> +
>>>> +out:
>>>> +    /* reset page count bias and remaining to start of new frag */
>>>> +    nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> +    nc->remaining = PAGE_SIZE << order;
>>>> +
>>>>      return page;
>>>>  }
>>>>
>>>
>>> Why bother returning a page at all? It doesn't seem like you don't use
>>> it anymore. It looks like the use cases you have for it in patch 11/12
>>> all appear to be broken from what I can tell as you are adding page as
>>> a variable when we don't need to be passing internal details to the
>>> callers of the function when just a simple error return code would do.
>>
>> It would be good to be more specific about the 'broken' part here.
> 
> We are passing internals to the caller. Basically this is generally
> frowned upon for many implementations of things as the general idea is
> that the internal page we are using should be a pseudo-private value.

It is an implementation detail and it is about avoiding calling virt_to_page()
as mentioned below. I am not sure why it is referred to as 'broken'; it would
be better to provide more documentation about why it is a bad idea here, as the
'pseudo-private' wording doesn't seem to justify the 'broken' part here.

> I understand that you have one or two callers that need it for the use
> cases you have in patches 11/12, but it also seems like you are just
> passing it regardless. For example I noticed in a few cases you added
> the page pointer in 12 to handle the return value, but then just used
> it to check for NULL. My thought would be that rather than returning
> the page here you would be better off just returning 0 or an error and
> then doing the virt_to_page translation for all the cases where the
> page is actually needed since you have to go that route for a cached
> page anyway.

Yes, it is about avoiding calling virt_to_page() as much as possible.
Yunsheng Lin July 30, 2024, 1:20 p.m. UTC | #5
On 2024/7/23 21:19, Yunsheng Lin wrote:

...

>>>>>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>>                                           gfp_t gfp_mask)
>>>>>  {
>>>>> @@ -26,6 +48,14 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>>>>>      struct page *page = NULL;
>>>>>      gfp_t gfp = gfp_mask;
>>>>>
>>>>> +    if (likely(nc->encoded_va)) {
>>>>> +            page = __page_frag_cache_recharge(nc);
>>>>> +            if (page) {
>>>>> +                    order = encoded_page_order(nc->encoded_va);
>>>>> +                    goto out;
>>>>> +            }
>>>>> +    }
>>>>> +
>>>>
>>>> This code has no business here. This is refill, you just dropped
>>>> recharge in here which will make a complete mess of the ordering and be
>>>> confusing to say the least.
>>>>
>>>> The expectation was that if we are calling this function it is going to
>>>> overwrite the virtual address to NULL on failure so we discard the old
>>>> page if there is one present. This changes that behaviour. What you
>>>> effectively did is made __page_frag_cache_refill into the recharge
>>>> function.
>>>
>>> The idea is to reuse the below for both __page_frag_cache_refill() and
>>> __page_frag_cache_recharge(), which seems to be about maintainability
>>> to not having duplicated code. If there is a better idea to avoid that
>>> duplicated code while keeping the old behaviour, I am happy to change
>>> it.
>>>
>>>         /* reset page count bias and remaining to start of new frag */
>>>         nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>         nc->remaining = PAGE_SIZE << order;
>>>
>>
>> The only piece that is really reused here is the pagecnt_bias
>> assignment. What is obfuscated away is that the order is gotten
>> through one of two paths. Really order isn't order here it is size.
>> Which should have been fetched already. What you end up doing with
>> this change is duplicating a bunch of code throughout the function.
>> You end up having to fetch size multiple times multiple ways. here you
>> are generating it with order. Then you have to turn around and get it
>> again at the start of the function, and again after calling this
>> function in order to pull it back out.
> 
> I am assuming you would like to reserve old behavior as below?
> 
> 	if(!encoded_va) {
> refill:
> 		__page_frag_cache_refill()
> 	}
> 
> 
> 	if(remaining < fragsz) {
> 		if(!__page_frag_cache_recharge())
> 			goto refill;
> 	}
> 
> As we are adding new APIs, are we expecting new APIs also duplicate
> the above pattern?
> 
>>

How about something like below? __page_frag_cache_refill() and
__page_frag_cache_reuse() do what their function names suggest
as much as possible, __page_frag_cache_reload() is added to avoid
new APIs duplicating a similar pattern, and to avoid fetching size
multiple times in multiple ways as much as possible.

static struct page *__page_frag_cache_reuse(unsigned long encoded_va,
                                            unsigned int pagecnt_bias)
{
        struct page *page;

        page = virt_to_page((void *)encoded_va);
        if (!page_ref_sub_and_test(page, pagecnt_bias))
                return NULL;

        if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
                VM_BUG_ON(compound_order(page) !=
                          encoded_page_order(encoded_va));
                free_unref_page(page, encoded_page_order(encoded_va));
                return NULL;
        }

        /* OK, page count is 0, we can safely set it */
        set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
        return page;
}

static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
                                             gfp_t gfp_mask)
{
        unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
        struct page *page = NULL;
        gfp_t gfp = gfp_mask;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
        gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
                   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
        page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
                                PAGE_FRAG_CACHE_MAX_ORDER);
#endif
        if (unlikely(!page)) {
                page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
                if (unlikely(!page)) {
                        memset(nc, 0, sizeof(*nc));
                        return NULL;
                }

                order = 0;
        }

        nc->encoded_va = encode_aligned_va(page_address(page), order,
                                           page_is_pfmemalloc(page));

        /* Even if we own the page, we do not use atomic_set().
         * This would break get_page_unless_zero() users.
         */
        page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);

        return page;
}

static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
                                             gfp_t gfp_mask)
{
        struct page *page;

        if (likely(nc->encoded_va)) {
                page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
                if (page)
                        goto out;
        }

        page = __page_frag_cache_refill(nc, gfp_mask);
        if (unlikely(!page))
                return NULL;

out:
        nc->remaining = page_frag_cache_page_size(nc->encoded_va);
        nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
        return page;
}

void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
                                 unsigned int fragsz, gfp_t gfp_mask,
                                 unsigned int align_mask)
{
        unsigned long encoded_va = nc->encoded_va;
        unsigned int remaining;

        remaining = nc->remaining & align_mask;
        if (unlikely(remaining < fragsz)) {
                if (unlikely(fragsz > PAGE_SIZE)) {
                        /*
                         * The caller is trying to allocate a fragment
                         * with fragsz > PAGE_SIZE but the cache isn't big
                         * enough to satisfy the request, this may
                         * happen in low memory conditions.
                         * We don't release the cache page because
                         * it could make memory pressure worse
                         * so we simply return NULL here.
                         */
                        return NULL;
                }

                if (unlikely(!__page_frag_cache_reload(nc, gfp_mask)))
                        return NULL;

                nc->pagecnt_bias--;
                nc->remaining -= fragsz;

                return encoded_page_address(nc->encoded_va);
        }

        nc->pagecnt_bias--;
        nc->remaining = remaining - fragsz;

        return encoded_page_address(encoded_va) +
                (page_frag_cache_page_size(encoded_va) - remaining);
}
Alexander Duyck July 30, 2024, 3:12 p.m. UTC | #6
On Tue, 2024-07-30 at 21:20 +0800, Yunsheng Lin wrote:
> On 2024/7/23 21:19, Yunsheng Lin wrote:
> > > 

...

> > > The only piece that is really reused here is the pagecnt_bias
> > > assignment. What is obfuscated away is that the order is gotten
> > > through one of two paths. Really order isn't order here it is size.
> > > Which should have been fetched already. What you end up doing with
> > > this change is duplicating a bunch of code throughout the function.
> > > You end up having to fetch size multiple times multiple ways. here you
> > > are generating it with order. Then you have to turn around and get it
> > > again at the start of the function, and again after calling this
> > > function in order to pull it back out.
> > 
> > I am assuming you would like to reserve old behavior as below?
> > 
> > 	if(!encoded_va) {
> > refill:
> > 		__page_frag_cache_refill()
> > 	}
> > 
> > 
> > 	if(remaining < fragsz) {
> > 		if(!__page_frag_cache_recharge())
> > 			goto refill;
> > 	}
> > 
> > As we are adding new APIs, are we expecting new APIs also duplicate
> > the above pattern?
> > 
> > > 
> 
> How about something like below? __page_frag_cache_refill() and
> __page_frag_cache_reuse() does what their function name suggests
> as much as possible, __page_frag_cache_reload() is added to avoid
> new APIs duplicating similar pattern as much as possible, also
> avoid fetching size multiple times multiple ways as much as possible.

This is better. Still though with the code getting so spread out we
probably need to start adding more comments to explain things.

> static struct page *__page_frag_cache_reuse(unsigned long encoded_va,
>                                             unsigned int pagecnt_bias)
> {
>         struct page *page;
> 
>         page = virt_to_page((void *)encoded_va);
>         if (!page_ref_sub_and_test(page, pagecnt_bias))
>                 return NULL;
> 
>         if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>                 VM_BUG_ON(compound_order(page) !=
>                           encoded_page_order(encoded_va));

This VM_BUG_ON here makes no sense. If we are going to have this
anywhere it might make more sense in the cache_refill case below to
verify we are setting the order to match when we are generating the
encoded_va.

>                 free_unref_page(page, encoded_page_order(encoded_va));
>                 return NULL;
>         }
> 
>         /* OK, page count is 0, we can safely set it */
>         set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>         return page;

Why are you returning page here? It isn't used by any of the callers.
We are refilling the page here anyway so any caller should already have
access to the page since it wasn't changed.

> }
> 
> static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>                                              gfp_t gfp_mask)
> {
>         unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
>         struct page *page = NULL;
>         gfp_t gfp = gfp_mask;
> 
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>         gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>                    __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>                                 PAGE_FRAG_CACHE_MAX_ORDER);

I suspect the compiler is probably already doing this, but we should
probably not be updating gfp_mask but instead gfp since that is our
local variable.

> #endif
>         if (unlikely(!page)) {
>                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>                 if (unlikely(!page)) {
>                         memset(nc, 0, sizeof(*nc));
>                         return NULL;
>                 }
> 
>                 order = 0;
>         }
> 
>         nc->encoded_va = encode_aligned_va(page_address(page), order,
>                                            page_is_pfmemalloc(page));
> 
>         /* Even if we own the page, we do not use atomic_set().
>          * This would break get_page_unless_zero() users.
>          */
>         page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> 
>         return page;

Again, returning page here doesn't make much sense. You are better off
not exposing internals as you have essentially locked the page down for
use by the frag API so you shouldn't be handing out the page directly
to callers.

> }
> 
> static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
>                                              gfp_t gfp_mask)
> {
>         struct page *page;
> 
>         if (likely(nc->encoded_va)) {
>                 page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
>                 if (page)
>                         goto out;
>         }
> 
>         page = __page_frag_cache_refill(nc, gfp_mask);
>         if (unlikely(!page))
>                 return NULL;
> 
> out:
>         nc->remaining = page_frag_cache_page_size(nc->encoded_va);
>         nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>         return page;

Your one current caller doesn't use the page value provided here. I
would recommend just not bothering with the page variable until you
actually need it.
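
As a rough sketch of that direction, the reload helper above could report
success without exposing the page at all; this is illustrative only, not part
of the posted patch, and a caller that really does need the struct page could
still derive it from nc->encoded_va via virt_to_page():

static bool __page_frag_cache_reload(struct page_frag_cache *nc,
				     gfp_t gfp_mask)
{
	if (likely(nc->encoded_va) &&
	    __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
		goto out;

	if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
		return false;

out:
	/* reset page count bias and remaining to start of new frag */
	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
	nc->remaining = page_frag_cache_page_size(nc->encoded_va);
	return true;
}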

> }
> 
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>                                  unsigned int fragsz, gfp_t gfp_mask,
>                                  unsigned int align_mask)
> {
>         unsigned long encoded_va = nc->encoded_va;
>         unsigned int remaining;
> 
>         remaining = nc->remaining & align_mask;
>         if (unlikely(remaining < fragsz)) {

You might as well swap the code paths. It would likely be much easier
to read if the remaining >= fragsz case were handled here rather than
having more if statements buried within the if statement. With that you
will have more room for the comment and such below.

>                 if (unlikely(fragsz > PAGE_SIZE)) {
>                         /*
>                          * The caller is trying to allocate a fragment
>                          * with fragsz > PAGE_SIZE but the cache isn't big
>                          * enough to satisfy the request, this may
>                          * happen in low memory conditions.
>                          * We don't release the cache page because
>                          * it could make memory pressure worse
>                          * so we simply return NULL here.
>                          */
>                         return NULL;
>                 }
> 
>                 if (unlikely(!__page_frag_cache_reload(nc, gfp_mask)))
>                         return NULL;

This is what I am talking about in the earlier comments. You go to the
trouble of returning page through all the callers just to not use it
here. So there isn't any point in passing it through the functions.

> 
>                 nc->pagecnt_bias--;
>                 nc->remaining -= fragsz;
> 
>                 return encoded_page_address(nc->encoded_va);
>         }
> 
>         nc->pagecnt_bias--;
>         nc->remaining = remaining - fragsz;
> 
>         return encoded_page_address(encoded_va) +
>                 (page_frag_cache_page_size(encoded_va) - remaining);

Parentheses here shouldn't be needed; addition and subtraction
operations can happen in any order with the result coming out the same.
Yunsheng Lin July 31, 2024, 12:35 p.m. UTC | #7
On 2024/7/30 23:12, Alexander H Duyck wrote:

...

>>         }
>>
>>         nc->pagecnt_bias--;
>>         nc->remaining = remaining - fragsz;
>>
>>         return encoded_page_address(encoded_va) +
>>                 (page_frag_cache_page_size(encoded_va) - remaining);
> 
> Parenthesis here shouldn't be needed, addition and subtractions
> operations can happen in any order with the result coming out the same.

I am playing it safe to avoid overflow here, as I am not sure if the allocator
will give us the last page. For example, '0xfffffffffffff000 + 0x1000' will
overflow.
Alexander Duyck July 31, 2024, 5:02 p.m. UTC | #8
On Wed, 2024-07-31 at 20:35 +0800, Yunsheng Lin wrote:
> On 2024/7/30 23:12, Alexander H Duyck wrote:
> 
> ...
> 
> > >         }
> > > 
> > >         nc->pagecnt_bias--;
> > >         nc->remaining = remaining - fragsz;
> > > 
> > >         return encoded_page_address(encoded_va) +
> > >                 (page_frag_cache_page_size(encoded_va) - remaining);
> > 
> > Parenthesis here shouldn't be needed, addition and subtractions
> > operations can happen in any order with the result coming out the same.
> 
> I am playing safe to avoid overflow here, as I am not sure if the allocator
> will give us the last page. For example, '0xfffffffffffff000 + 0x1000' will
> have a overflow.

So what if it does though? When you subtract remaining it will
underflow and go back to the correct value, shouldn't it?
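
A tiny stand-alone demo of that point, with made-up values and assuming a
64-bit unsigned long; unsigned arithmetic in C is modular, so the intermediate
wrap cancels out:

#include <stdio.h>

int main(void)
{
	unsigned long addr = 0xfffffffffffff000UL;	/* hypothetical page address */
	unsigned long size = 0x1000;			/* page size */
	unsigned long remaining = 0x300;		/* bytes still free */

	/* Both orderings land on the same address even though addr + size
	 * wraps around zero.
	 */
	printf("%#lx\n", addr + (size - remaining));	/* 0xfffffffffffffd00 */
	printf("%#lx\n", addr + size - remaining);	/* 0xfffffffffffffd00 */
	return 0;
}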
Yunsheng Lin Aug. 1, 2024, 12:53 p.m. UTC | #9
On 2024/8/1 1:02, Alexander H Duyck wrote:
> On Wed, 2024-07-31 at 20:35 +0800, Yunsheng Lin wrote:
>> On 2024/7/30 23:12, Alexander H Duyck wrote:
>>
>> ...
>>
>>>>         }
>>>>
>>>>         nc->pagecnt_bias--;
>>>>         nc->remaining = remaining - fragsz;
>>>>
>>>>         return encoded_page_address(encoded_va) +
>>>>                 (page_frag_cache_page_size(encoded_va) - remaining);
>>>
>>> Parenthesis here shouldn't be needed, addition and subtractions
>>> operations can happen in any order with the result coming out the same.
>>
>> I am playing safe to avoid overflow here, as I am not sure if the allocator
>> will give us the last page. For example, '0xfffffffffffff000 + 0x1000' will
>> have a overflow.
> 
> So what if it does though? When you subtract remaining it will
> underflow and go back to the correct value shouldn't it?

I guess it is true that the underflow will bring back the correct value.
But I am not sure what it hurts to have the parentheses here; doesn't having
the parentheses make it more obvious that 'size - remaining' indicates the
offset of the allocated fragment, without having to scratch my head wondering
whether there is an overflow/underflow problem? Or is there any performance
trick behind the above comment?

Patch

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 12a16f8e8ad0..5aa45de7a9a5 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -50,7 +50,7 @@  static inline void *encoded_page_address(unsigned long encoded_va)
 
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
-	nc->encoded_va = 0;
+	memset(nc, 0, sizeof(*nc));
 }
 
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 7928e5d50711..d9c9cad17af7 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -19,6 +19,28 @@ 
 #include <linux/page_frag_cache.h>
 #include "internal.h"
 
+static struct page *__page_frag_cache_recharge(struct page_frag_cache *nc)
+{
+	unsigned long encoded_va = nc->encoded_va;
+	struct page *page;
+
+	page = virt_to_page((void *)encoded_va);
+	if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+		return NULL;
+
+	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
+		VM_BUG_ON(compound_order(page) !=
+			  encoded_page_order(encoded_va));
+		free_unref_page(page, encoded_page_order(encoded_va));
+		return NULL;
+	}
+
+	/* OK, page count is 0, we can safely set it */
+	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+	return page;
+}
+
 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 					     gfp_t gfp_mask)
 {
@@ -26,6 +48,14 @@  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
 
+	if (likely(nc->encoded_va)) {
+		page = __page_frag_cache_recharge(nc);
+		if (page) {
+			order = encoded_page_order(nc->encoded_va);
+			goto out;
+		}
+	}
+
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
 		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
@@ -35,7 +65,7 @@  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 		if (unlikely(!page)) {
-			nc->encoded_va = 0;
+			memset(nc, 0, sizeof(*nc));
 			return NULL;
 		}
 
@@ -45,6 +75,16 @@  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	nc->encoded_va = encode_aligned_va(page_address(page), order,
 					   page_is_pfmemalloc(page));
 
+	/* Even if we own the page, we do not use atomic_set().
+	 * This would break get_page_unless_zero() users.
+	 */
+	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+out:
+	/* reset page count bias and remaining to start of new frag */
+	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+	nc->remaining = PAGE_SIZE << order;
+
 	return page;
 }
 
@@ -55,7 +95,7 @@  void page_frag_cache_drain(struct page_frag_cache *nc)
 
 	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
 				nc->pagecnt_bias);
-	nc->encoded_va = 0;
+	memset(nc, 0, sizeof(*nc));
 }
 EXPORT_SYMBOL(page_frag_cache_drain);
 
@@ -72,31 +112,9 @@  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask)
 {
-	unsigned long encoded_va = nc->encoded_va;
-	unsigned int size, remaining;
-	struct page *page;
-
-	if (unlikely(!encoded_va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-		encoded_va = nc->encoded_va;
-		size = page_frag_cache_page_size(encoded_va);
+	unsigned int size = page_frag_cache_page_size(nc->encoded_va);
+	unsigned int remaining = nc->remaining & align_mask;
 
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and remaining to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->remaining = size;
-	}
-
-	size = page_frag_cache_page_size(encoded_va);
-	remaining = nc->remaining & align_mask;
 	if (unlikely(remaining < fragsz)) {
 		if (unlikely(fragsz > PAGE_SIZE)) {
 			/*
@@ -111,32 +129,17 @@  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 			return NULL;
 		}
 
-		page = virt_to_page((void *)encoded_va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
-			VM_BUG_ON(compound_order(page) !=
-				  encoded_page_order(encoded_va));
-			free_unref_page(page, encoded_page_order(encoded_va));
-			goto refill;
-		}
-
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-
-		/* reset page count bias and remaining to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->remaining = size;
+		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
+			return NULL;
 
+		size = page_frag_cache_page_size(nc->encoded_va);
 		remaining = size;
 	}
 
 	nc->pagecnt_bias--;
 	nc->remaining = remaining - fragsz;
 
-	return encoded_page_address(encoded_va) + (size - remaining);
+	return encoded_page_address(nc->encoded_va) + (size - remaining);
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);