
[bpf-next,v8,6/6] bpf: Use try_alloc_pages() to allocate pages for bpf needs.

Message ID 20250213033556.9534-7-alexei.starovoitov@gmail.com (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: bpf, mm: Introduce try_alloc_pages()

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 194 this patch: 194
netdev/build_tools success Errors and warnings before: 26 (+1) this patch: 26 (+1)
netdev/cc_maintainers warning 12 maintainers not CCed: kpsingh@kernel.org daniel@iogearbox.net sdf@fomichev.me jolsa@kernel.org yonghong.song@linux.dev linux-rt-devel@lists.linux.dev song@kernel.org clrkwllms@kernel.org john.fastabend@gmail.com haoluo@google.com eddyz87@gmail.com martin.lau@linux.dev
netdev/build_clang success Errors and warnings before: 9361 this patch: 9361
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7018 this patch: 7018
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 66 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta

Commit Message

Alexei Starovoitov Feb. 13, 2025, 3:35 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Use try_alloc_pages() and free_pages_nolock() for BPF needs
when context doesn't allow using normal alloc_pages.
This is a prerequisite for further work.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  2 +-
 kernel/bpf/arena.c   |  5 ++---
 kernel/bpf/syscall.c | 23 ++++++++++++++++++++---
 3 files changed, 23 insertions(+), 7 deletions(-)

Comments

Vlastimil Babka Feb. 18, 2025, 3:36 p.m. UTC | #1
On 2/13/25 04:35, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Use try_alloc_pages() and free_pages_nolock() for BPF needs
> when context doesn't allow using normal alloc_pages.
> This is a prerequisite for further work.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf.h  |  2 +-
>  kernel/bpf/arena.c   |  5 ++---
>  kernel/bpf/syscall.c | 23 ++++++++++++++++++++---
>  3 files changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f3f50e29d639..e1838a341817 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2348,7 +2348,7 @@ int  generic_map_delete_batch(struct bpf_map *map,
>  struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
>  struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
>  
> -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
> +int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
>  			unsigned long nr_pages, struct page **page_array);
>  #ifdef CONFIG_MEMCG
>  void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 0975d7f22544..8ecc62e6b1a2 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGSEGV;
>  
>  	/* Account into memcg of the process that created bpf_arena */
> -	ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
> +	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
>  	if (ret) {
>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>  		return VM_FAULT_SIGSEGV;
> @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
>  	if (ret)
>  		goto out_free_pages;
>  
> -	ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
> -				  node_id, page_cnt, pages);
> +	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
>  	if (ret)
>  		goto out;
>  
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index c420edbfb7c8..a7af8d0185d0 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
>  }
>  #endif
>  
> -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
> +static bool can_alloc_pages(void)
> +{
> +	return preempt_count() == 0 && !irqs_disabled() &&
> +		!IS_ENABLED(CONFIG_PREEMPT_RT);
> +}
> +

I see this is new since v6 and wasn't yet discussed (or I missed it?)

I wonder how reliable these preempt/irq_disabled checks are for correctness
purposes, e.g. we don't always have CONFIG_PREEMPT_COUNT enabled? As long
as the callers of bpf_map_alloc_pages() know the context and pass gfp
accordingly, can't we use e.g. gfpflags_allow_blocking() to determine if
try_alloc_pages() should be used or not?
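
For reference, gfpflags_allow_blocking() in include/linux/gfp.h is just a
test of __GFP_DIRECT_RECLAIM, so the gfp-driven alternative suggested above
could look roughly like the sketch below (__bpf_alloc_page_gfp is a
hypothetical helper, not code from this series):

	/* Hypothetical: dispatch purely on the caller-supplied gfp flags. */
	static struct page *__bpf_alloc_page_gfp(gfp_t gfp, int nid)
	{
		/* gfpflags_allow_blocking(gfp) == !!(gfp & __GFP_DIRECT_RECLAIM) */
		if (!gfpflags_allow_blocking(gfp))
			return try_alloc_pages(nid, 0);

		return alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
	}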

> +static struct page *__bpf_alloc_page(int nid)
> +{
> +	if (!can_alloc_pages())
> +		return try_alloc_pages(nid, 0);
> +
> +	return alloc_pages_node(nid,
> +				GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
> +				| __GFP_NOWARN,
> +				0);
> +}
> +
> +int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
>  			unsigned long nr_pages, struct page **pages)
>  {
>  	unsigned long i, j;
> @@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
>  	old_memcg = set_active_memcg(memcg);
>  #endif
>  	for (i = 0; i < nr_pages; i++) {
> -		pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
> +		pg = __bpf_alloc_page(nid);
>  
>  		if (pg) {
>  			pages[i] = pg;
>  			continue;
>  		}
>  		for (j = 0; j < i; j++)
> -			__free_page(pages[j]);
> +			free_pages_nolock(pages[j], 0);
>  		ret = -ENOMEM;
>  		break;
>  	}

Alexei Starovoitov Feb. 19, 2025, 2:38 a.m. UTC | #2
On Tue, Feb 18, 2025 at 7:36 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/13/25 04:35, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Use try_alloc_pages() and free_pages_nolock() for BPF needs
> > when context doesn't allow using normal alloc_pages.
> > This is a prerequisite for further work.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/bpf.h  |  2 +-
> >  kernel/bpf/arena.c   |  5 ++---
> >  kernel/bpf/syscall.c | 23 ++++++++++++++++++++---
> >  3 files changed, 23 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index f3f50e29d639..e1838a341817 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -2348,7 +2348,7 @@ int  generic_map_delete_batch(struct bpf_map *map,
> >  struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
> >  struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
> >
> > -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
> > +int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
> >                       unsigned long nr_pages, struct page **page_array);
> >  #ifdef CONFIG_MEMCG
> >  void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
> > diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> > index 0975d7f22544..8ecc62e6b1a2 100644
> > --- a/kernel/bpf/arena.c
> > +++ b/kernel/bpf/arena.c
> > @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
> >               return VM_FAULT_SIGSEGV;
> >
> >       /* Account into memcg of the process that created bpf_arena */
> > -     ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
> > +     ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
> >       if (ret) {
> >               range_tree_set(&arena->rt, vmf->pgoff, 1);
> >               return VM_FAULT_SIGSEGV;
> > @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
> >       if (ret)
> >               goto out_free_pages;
> >
> > -     ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
> > -                               node_id, page_cnt, pages);
> > +     ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
> >       if (ret)
> >               goto out;
> >
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index c420edbfb7c8..a7af8d0185d0 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
> >  }
> >  #endif
> >
> > -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
> > +static bool can_alloc_pages(void)
> > +{
> > +     return preempt_count() == 0 && !irqs_disabled() &&
> > +             !IS_ENABLED(CONFIG_PREEMPT_RT);
> > +}
> > +
>
> I see this is new since v6 and wasn't yet discussed (or I missed it?)

It was in v1:
https://lore.kernel.org/bpf/20241116014854.55141-1-alexei.starovoitov@gmail.com/
See Peter's comments.
In this version I open-coded preemptible(), since it's more accurate,
and disabled the detection on PREEMPT_RT.
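
For reference, preemptible() in include/linux/preempt.h is:

	#ifdef CONFIG_PREEMPT_COUNT
	#define preemptible()	(preempt_count() == 0 && !irqs_disabled())
	#else
	#define preemptible()	0
	#endif

so the open-coded check differs from the macro on !CONFIG_PREEMPT_COUNT
kernels, where preemptible() is hard-wired to 0.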

> I wonder how reliable these preempt/irq_disabled checks are for correctness
> purposes, e.g. we don't always have CONFIG_PREEMPT_COUNT enabled?

I believe the above doesn't produce false positives.
It's not exhaustive and might change as we learn more and tune it.
Hence I moved it to be bpf specific, to iterate quickly instead of
living in linux/gfp.h, and also considering Sebastian's comment
that normal kernel code should know the calling context better.

> As long
> as the callers of bpf_map_alloc_pages() know the context and pass gfp
> accordingly, can't we use e.g. gfpflags_allow_blocking() to determine if
> try_alloc_pages() should be used or not?

bpf infra has only coarse knowledge of the context.
There are two categories: sleepable or not.
In sleepable context GFP_KERNEL is allowed, but that's a narrow
slice of use cases compared to non-sleepable.
try_alloc_pages() is for the latter.
netconsole has a similar problem/challenge:
it doesn't know the context where it will be called.
Currently it just does GFP_ATOMIC and prays.
This is something to fix eventually, once slab is taught about
gfpflags_allow_blocking.
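
To make the two categories concrete, here is a minimal sketch
(alloc_for_bpf_ctx is a hypothetical helper, not part of this patch)
of how they map onto the two allocators:

	static struct page *alloc_for_bpf_ctx(int nid, bool sleepable)
	{
		/* narrow case: sleepable context, GFP_KERNEL is fine */
		if (sleepable)
			return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO |
						__GFP_ACCOUNT | __GFP_NOWARN, 0);
		/* common case: unknown/atomic context, may fail instead of blocking */
		return try_alloc_pages(nid, 0);
	}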

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f3f50e29d639..e1838a341817 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2348,7 +2348,7 @@  int  generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 			unsigned long nr_pages, struct page **page_array);
 #ifdef CONFIG_MEMCG
 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 0975d7f22544..8ecc62e6b1a2 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -287,7 +287,7 @@  static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGSEGV;
 
 	/* Account into memcg of the process that created bpf_arena */
-	ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		return VM_FAULT_SIGSEGV;
@@ -465,8 +465,7 @@  static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (ret)
 		goto out_free_pages;
 
-	ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
-				  node_id, page_cnt, pages);
+	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
 	if (ret)
 		goto out;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c420edbfb7c8..a7af8d0185d0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -569,7 +569,24 @@  static void bpf_map_release_memcg(struct bpf_map *map)
 }
 #endif
 
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+static bool can_alloc_pages(void)
+{
+	return preempt_count() == 0 && !irqs_disabled() &&
+		!IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+static struct page *__bpf_alloc_page(int nid)
+{
+	if (!can_alloc_pages())
+		return try_alloc_pages(nid, 0);
+
+	return alloc_pages_node(nid,
+				GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
+				| __GFP_NOWARN,
+				0);
+}
+
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 			unsigned long nr_pages, struct page **pages)
 {
 	unsigned long i, j;
@@ -582,14 +599,14 @@  int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
 	old_memcg = set_active_memcg(memcg);
 #endif
 	for (i = 0; i < nr_pages; i++) {
-		pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+		pg = __bpf_alloc_page(nid);
 
 		if (pg) {
 			pages[i] = pg;
 			continue;
 		}
 		for (j = 0; j < i; j++)
-			__free_page(pages[j]);
+			free_pages_nolock(pages[j], 0);
 		ret = -ENOMEM;
 		break;
 	}