Message ID | 20250213033556.9534-7-alexei.starovoitov@gmail.com (mailing list archive) |
---|---|
State | New |
Series | bpf, mm: Introduce try_alloc_pages() |
On 2/13/25 04:35, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Use try_alloc_pages() and free_pages_nolock() for BPF needs
> when context doesn't allow using normal alloc_pages.
> This is a prerequisite for further work.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf.h  |  2 +-
>  kernel/bpf/arena.c   |  5 ++---
>  kernel/bpf/syscall.c | 23 ++++++++++++++++++++---
>  3 files changed, 23 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f3f50e29d639..e1838a341817 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2348,7 +2348,7 @@ int generic_map_delete_batch(struct bpf_map *map,
>  struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
>  struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
>
> -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
> +int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
>  			unsigned long nr_pages, struct page **page_array);
>  #ifdef CONFIG_MEMCG
>  void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 0975d7f22544..8ecc62e6b1a2 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGSEGV;
>
>  	/* Account into memcg of the process that created bpf_arena */
> -	ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
> +	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
>  	if (ret) {
>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>  		return VM_FAULT_SIGSEGV;
> @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
>  	if (ret)
>  		goto out_free_pages;
>
> -	ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
> -				  node_id, page_cnt, pages);
> +	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
>  	if (ret)
>  		goto out;
>
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index c420edbfb7c8..a7af8d0185d0 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
>  }
>  #endif
>
> -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
> +static bool can_alloc_pages(void)
> +{
> +	return preempt_count() == 0 && !irqs_disabled() &&
> +	       !IS_ENABLED(CONFIG_PREEMPT_RT);
> +}
> +

I see this is new since v6 and wasn't yet discussed (or I missed it?)

I wonder how reliable these preempt/irq_disabled checks are for correctness
purposes, e.g. we don't have CONFIG_PREEMPT_COUNT enabled always? As long as
the callers of bpf_map_alloc_pages() know the context and pass gfp
accordingly, can't we use e.g. gfpflags_allow_blocking() to determine if
try_alloc_pages() should be used or not?

> +static struct page *__bpf_alloc_page(int nid)
> +{
> +	if (!can_alloc_pages())
> +		return try_alloc_pages(nid, 0);
> +
> +	return alloc_pages_node(nid,
> +				GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
> +				| __GFP_NOWARN,
> +				0);
> +}
> +
> +int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
>  			unsigned long nr_pages, struct page **pages)
>  {
>  	unsigned long i, j;
> @@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
>  	old_memcg = set_active_memcg(memcg);
>  #endif
>  	for (i = 0; i < nr_pages; i++) {
> -		pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
> +		pg = __bpf_alloc_page(nid);
>
>  		if (pg) {
>  			pages[i] = pg;
>  			continue;
>  		}
>  		for (j = 0; j < i; j++)
> -			__free_page(pages[j]);
> +			free_pages_nolock(pages[j], 0);
>  		ret = -ENOMEM;
>  		break;
>  	}
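For concreteness, here is a minimal sketch of the gfp-driven alternative suggested above. It is not part of the posted series: the helper name __bpf_alloc_page_gfp() is hypothetical, and only gfpflags_allow_blocking() from linux/gfp.h plus the existing allocator entry points are assumed.

```c
/*
 * Hypothetical sketch: keep the gfp argument and let the caller's flags,
 * rather than a runtime context probe, pick the allocation path.
 */
static struct page *__bpf_alloc_page_gfp(gfp_t gfp, int nid)
{
	/* A blocking gfp (e.g. GFP_KERNEL) implies a sleepable caller. */
	if (gfpflags_allow_blocking(gfp))
		return alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);

	/* Otherwise use the non-blocking, best-effort path. */
	return try_alloc_pages(nid, 0);
}
```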
On Tue, Feb 18, 2025 at 7:36 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/13/25 04:35, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Use try_alloc_pages() and free_pages_nolock() for BPF needs
> > when context doesn't allow using normal alloc_pages.
> > This is a prerequisite for further work.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/bpf.h  |  2 +-
> >  kernel/bpf/arena.c   |  5 ++---
> >  kernel/bpf/syscall.c | 23 ++++++++++++++++++++---
> >  3 files changed, 23 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index f3f50e29d639..e1838a341817 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -2348,7 +2348,7 @@ int generic_map_delete_batch(struct bpf_map *map,
> >  struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
> >  struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
> >
> > -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
> > +int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
> >  			unsigned long nr_pages, struct page **page_array);
> >  #ifdef CONFIG_MEMCG
> >  void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
> > diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> > index 0975d7f22544..8ecc62e6b1a2 100644
> > --- a/kernel/bpf/arena.c
> > +++ b/kernel/bpf/arena.c
> > @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
> >  		return VM_FAULT_SIGSEGV;
> >
> >  	/* Account into memcg of the process that created bpf_arena */
> > -	ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
> > +	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
> >  	if (ret) {
> >  		range_tree_set(&arena->rt, vmf->pgoff, 1);
> >  		return VM_FAULT_SIGSEGV;
> > @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
> >  	if (ret)
> >  		goto out_free_pages;
> >
> > -	ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
> > -				  node_id, page_cnt, pages);
> > +	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
> >  	if (ret)
> >  		goto out;
> >
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index c420edbfb7c8..a7af8d0185d0 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
> >  }
> >  #endif
> >
> > -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
> > +static bool can_alloc_pages(void)
> > +{
> > +	return preempt_count() == 0 && !irqs_disabled() &&
> > +	       !IS_ENABLED(CONFIG_PREEMPT_RT);
> > +}
> > +
>
> I see this is new since v6 and wasn't yet discussed (or I missed it?)

It was in v1:
https://lore.kernel.org/bpf/20241116014854.55141-1-alexei.starovoitov@gmail.com/
See Peter's comments.

In this version I open-coded preemptible(), since it's more accurate,
and disabled the detection on PREEMPT_RT.

> I wonder how reliable these preempt/irq_disabled checks are for correctness
> purposes, e.g. we don't have CONFIG_PREEMPT_COUNT enabled always?

I believe the above doesn't produce false positives.
It's not exhaustive and might change as we learn more and tune it.
Hence I moved it to be bpf specific, to iterate quickly,
instead of placing it in linux/gfp.h, and also considering Sebastian's comment
that normal kernel code should better know the calling context.

> As long as the callers of bpf_map_alloc_pages() know the context and pass gfp
> accordingly, can't we use e.g. gfpflags_allow_blocking() to determine if
> try_alloc_pages() should be used or not?

bpf infra has a very coarse knowledge of the context.
There are two categories: sleepable or not.
In sleepable context GFP_KERNEL is allowed, but it's very narrow
and represents a tiny slice of use cases compared to non-sleepable.
try_alloc_pages() is for the latter.

netconsole has a similar problem/challenge. It doesn't know
the context where it will be called.
Currently it's just doing GFP_ATOMIC and praying.
This is something to fix eventually, once slab is taught
about gfpflags_allow_blocking.
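To make the two categories mentioned above concrete, here is a hedged, illustrative sketch. It is not from the patch: the prog->sleepable field and the helper name are assumptions for illustration (in older kernels the flag lives in bpf_prog_aux instead).

```c
/*
 * Illustrative only, not from the posted series: the only coarse context
 * signal BPF infrastructure has is whether a program is sleepable.
 */
static struct page *bpf_prog_alloc_page(const struct bpf_prog *prog, int nid)
{
	if (prog->sleepable)
		/* Sleepable programs run in process context; GFP_KERNEL is fine. */
		return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, 0);

	/*
	 * Non-sleepable programs may fire from tracing/IRQ context or with
	 * locks held, so only the non-blocking, best-effort path is safe.
	 */
	return try_alloc_pages(nid, 0);
}
```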
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f3f50e29d639..e1838a341817 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2348,7 +2348,7 @@ int generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 			unsigned long nr_pages, struct page **page_array);
 #ifdef CONFIG_MEMCG
 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 0975d7f22544..8ecc62e6b1a2 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGSEGV;
 
 	/* Account into memcg of the process that created bpf_arena */
-	ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		return VM_FAULT_SIGSEGV;
@@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (ret)
 		goto out_free_pages;
 
-	ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
-				  node_id, page_cnt, pages);
+	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
 	if (ret)
 		goto out;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c420edbfb7c8..a7af8d0185d0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
 }
 #endif
 
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+static bool can_alloc_pages(void)
+{
+	return preempt_count() == 0 && !irqs_disabled() &&
+	       !IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+static struct page *__bpf_alloc_page(int nid)
+{
+	if (!can_alloc_pages())
+		return try_alloc_pages(nid, 0);
+
+	return alloc_pages_node(nid,
+				GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
+				| __GFP_NOWARN,
+				0);
+}
+
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 			unsigned long nr_pages, struct page **pages)
 {
 	unsigned long i, j;
@@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
 	old_memcg = set_active_memcg(memcg);
 #endif
 	for (i = 0; i < nr_pages; i++) {
-		pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+		pg = __bpf_alloc_page(nid);
 
 		if (pg) {
 			pages[i] = pg;
 			continue;
 		}
 		for (j = 0; j < i; j++)
-			__free_page(pages[j]);
+			free_pages_nolock(pages[j], 0);
 		ret = -ENOMEM;
 		break;
 	}
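For reference, a minimal caller-side sketch of the changed interface, assuming an existing struct bpf_map pointer; it simply mirrors the updated arena_vm_fault() call site above and is not additional code from the series.

```c
/*
 * Hedged usage sketch (not part of the series): after this patch callers
 * drop the gfp argument; bpf_map_alloc_pages() decides internally whether
 * GFP_KERNEL or try_alloc_pages() is safe, and unwinds partial allocations
 * with free_pages_nolock() on failure.
 */
static int populate_one_page(struct bpf_map *map, struct page **pagep)
{
	return bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, pagep);
}
```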