Message ID:    20220826024430.84565-2-alexei.starovoitov@gmail.com (mailing list archive)
State:         Superseded
Delegated to:  BPF
Series:        bpf: BPF specific memory allocator.
On 8/26/22 4:44 AM, Alexei Starovoitov wrote:
[...]
> +
> +/* Called from BPF program or from sys_bpf syscall.
> + * In both cases migration is disabled.
> + */
> +void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
> +{
> +        int idx;
> +        void *ret;
> +
> +        if (!size)
> +                return ZERO_SIZE_PTR;
> +
> +        idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
> +        if (idx < 0)
> +                return NULL;
> +
> +        ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
> +        return !ret ? NULL : ret + LLIST_NODE_SZ;
> +}
> +
> +void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +        int idx;
> +
> +        if (!ptr)
> +                return;
> +
> +        idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ));
> +        if (idx < 0)
> +                return;
> +
> +        unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
> +}
> +
> +void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
> +{
> +        void *ret;
> +
> +        ret = unit_alloc(this_cpu_ptr(ma->cache));
> +        return !ret ? NULL : ret + LLIST_NODE_SZ;
> +}
> +
> +void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +        if (!ptr)
> +                return;
> +
> +        unit_free(this_cpu_ptr(ma->cache), ptr);
> +}

Looks like smp_processor_id() needs to be made aware that preemption might
be ok just not migration to a different CPU?

[...]
[ 593.639025] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u16:246/1946
[ 593.639026] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u16:83/1642
[ 593.639138] caller is bpf_mem_cache_free+0x14/0x40
[ 593.640971] caller is bpf_mem_cache_free+0x14/0x40
[ 593.641060] CPU: 6 PID: 1642 Comm: kworker/u16:83 Not tainted 5.19.0-gf0d7b67fb6f8 #1
[ 593.641874] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
[ 593.641874] Workqueue: events_unbound bpf_map_free_deferred
[ 593.641874] Call Trace:
[ 593.641874]  <TASK>
[ 593.641874]  dump_stack_lvl+0x69/0x9b
[ 593.641874]  check_preemption_disabled+0x10b/0x120
[ 593.641874]  bpf_mem_cache_free+0x14/0x40
[ 593.641874]  htab_map_free+0x13f/0x2a0
[ 593.641874]  process_one_work+0x28e/0x580
[ 593.641874]  worker_thread+0x1fb/0x420
[ 593.641874]  ? rcu_lock_release+0x20/0x20
[ 593.641874]  kthread+0xf1/0x110
[ 593.641874]  ? kthread_blkcg+0x40/0x40
[ 593.641874]  ret_from_fork+0x22/0x30
[ 593.641874]  </TASK>
[ 593.654117] CPU: 5 PID: 1946 Comm: kworker/u16:246 Not tainted 5.19.0-gf0d7b67fb6f8 #1
[ 593.654317] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u16:83/1642
[ 593.654560] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
[ 593.654560] Workqueue: events_unbound bpf_map_free_deferred
[ 593.654560] Call Trace:
[ 593.654560]  <TASK>
[ 593.654560]  dump_stack_lvl+0x69/0x9b
[ 593.658872] caller is bpf_mem_cache_free+0x14/0x40
[ 593.654560]  check_preemption_disabled+0x10b/0x120
[ 593.654560]  bpf_mem_cache_free+0x14/0x40
[ 593.654560]  htab_map_free+0x13f/0x2a0
[ 593.654560]  process_one_work+0x28e/0x580
[ 593.654560]  worker_thread+0x1fb/0x420
[ 593.654560]  ? rcu_lock_release+0x20/0x20
[ 593.654560]  kthread+0xf1/0x110
[ 593.654560]  ? kthread_blkcg+0x40/0x40
[ 593.654560]  ret_from_fork+0x22/0x30
[ 593.654560]  </TASK>
[...]

[ 1158.399989] test_maps invoked oom-killer: gfp_mask=0x140cca(GFP_HIGHUSER_MOVABLE|__GFP_COMP), order=0, oom_score_adj=0
[ 1158.401948] CPU: 1 PID: 4147 Comm: test_maps Not tainted 5.19.0-gf0d7b67fb6f8 #1
[ 1158.402612] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
[ 1158.402666] Call Trace:
[ 1158.402666]  <TASK>
[ 1158.402666]  dump_stack_lvl+0x69/0x9b
[ 1158.402666]  dump_header+0x4d/0x370
[ 1158.402666]  out_of_memory+0x5a2/0x5e0
[ 1158.402666]  __alloc_pages_slowpath+0xbf4/0x1140
[ 1158.402666]  __alloc_pages+0x222/0x2c0
[ 1158.402666]  folio_alloc+0x14/0x40
[ 1158.402666]  filemap_alloc_folio+0x43/0x150
[ 1158.402666]  __filemap_get_folio+0x263/0x3a0
[ 1158.402666]  filemap_fault+0x20f/0x540
[ 1158.410527]  __do_fault+0x4a/0x100
[ 1158.410527]  do_fault+0x13a/0x630
[ 1158.410527]  handle_mm_fault+0x83d/0x13d0
[ 1158.410527]  do_user_addr_fault+0x33c/0x4e0
[ 1158.410527]  exc_page_fault+0x72/0x280
[ 1158.410527]  asm_exc_page_fault+0x22/0x30
[ 1158.410527] RIP: 0033:0x7fa4d11a6ffc
[ 1158.410527] Code: Unable to access opcode bytes at RIP 0x7fa4d11a6fd2.
[ 1158.410527] RSP: 002b:00007ffd4fa8ac00 EFLAGS: 00000206
[ 1158.410527] RAX: 0000000000000240 RBX: 0000000000000075 RCX: 00007fa4d11d4610
[ 1158.410527] RDX: 0000000000000065 RSI: 00007fa4d11d42a0 RDI: 000055d328f91ac8
[ 1158.410527] RBP: 00007ffd4fa8ace0 R08: 00007fa4d1198aa0 R09: 0000000000000001
[ 1158.419624] R10: 00007ffd4fa8ace0 R11: 0000000000000246 R12: 000055d328f91ac8
[ 1158.420234] R13: 000055d328f9e4e0 R14: 0000000000000000 R15: 00007fa4d11d3000
[ 1158.420234]  </TASK>
[...]
On Mon, Aug 29, 2022 at 2:30 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 8/26/22 4:44 AM, Alexei Starovoitov wrote:
> [...]
> > +
> > +/* Called from BPF program or from sys_bpf syscall.
> > + * In both cases migration is disabled.
> > + */
> > +void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
> > +{
> > +        int idx;
> > +        void *ret;
> > +
> > +        if (!size)
> > +                return ZERO_SIZE_PTR;
> > +
> > +        idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
> > +        if (idx < 0)
> > +                return NULL;
> > +
> > +        ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
> > +        return !ret ? NULL : ret + LLIST_NODE_SZ;
> > +}
> > +
> > +void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
> > +{
> > +        int idx;
> > +
> > +        if (!ptr)
> > +                return;
> > +
> > +        idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ));
> > +        if (idx < 0)
> > +                return;
> > +
> > +        unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
> > +}
> > +
> > +void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
> > +{
> > +        void *ret;
> > +
> > +        ret = unit_alloc(this_cpu_ptr(ma->cache));
> > +        return !ret ? NULL : ret + LLIST_NODE_SZ;
> > +}
> > +
> > +void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
> > +{
> > +        if (!ptr)
> > +                return;
> > +
> > +        unit_free(this_cpu_ptr(ma->cache), ptr);
> > +}
>
> Looks like smp_processor_id() needs to be made aware that preemption might
> be ok just not migration to a different CPU?

ahh. migration is not disabled when map is freed from worker.
this_cpu_ptr above and local_irq_save shortly after need to happen on
the same cpu, so I'm thinking to add migrate_disable to htab free path.
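A minimal sketch of the direction proposed above, assuming a hypothetical htab_pop_elem() iterator and an 'ma' member holding the map's bpf_mem_alloc; both names are illustrative, not the actual follow-up patch:

/* Sketch only: bpf_map_free_deferred() runs from an unbound workqueue,
 * i.e. fully preemptible and migratable, which is why the
 * smp_processor_id() check complains.  Disabling migration (but not
 * preemption) keeps the per-cpu cache chosen by this_cpu_ptr() inside
 * bpf_mem_cache_free() on the same CPU as the local_irq_save() done
 * later in unit_free().
 */
static void htab_free_remaining_elems(struct bpf_htab *htab)
{
        void *elem;

        migrate_disable();
        while ((elem = htab_pop_elem(htab)))    /* hypothetical iterator */
                bpf_mem_cache_free(&htab->ma, elem);
        migrate_enable();
}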
On 8/26/22 4:44 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Tracing BPF programs can attach to kprobe and fentry. Hence they
> run in unknown context where calling plain kmalloc() might not be safe.
>
> Front-end kmalloc() with minimal per-cpu cache of free elements.
> Refill this cache asynchronously from irq_work.
>
> BPF programs always run with migration disabled.
> It's safe to allocate from cache of the current cpu with irqs disabled.
> Free-ing is always done into bucket of the current cpu as well.
> irq_work trims extra free elements from buckets with kfree
> and refills them with kmalloc, so global kmalloc logic takes care
> of freeing objects allocated by one cpu and freed on another.
>
> struct bpf_mem_alloc supports two modes:
> - When size != 0 create kmem_cache and bpf_mem_cache for each cpu.
>   This is typical bpf hash map use case when all elements have equal size.
> - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
>   kmalloc/kfree. Max allocation size is 4096 in this case.
>   This is bpf_dynptr and bpf_kptr use case.
>
> bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree.
> bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free.
>
> The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general.
>
> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf_mem_alloc.h |  26 ++
>  kernel/bpf/Makefile           |   2 +-
>  kernel/bpf/memalloc.c         | 476 ++++++++++++++++++++++++++++++++++
>  3 files changed, 503 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/bpf_mem_alloc.h
>  create mode 100644 kernel/bpf/memalloc.c
>
[...]
> +#define NUM_CACHES 11
> +
> +struct bpf_mem_cache {
> +        /* per-cpu list of free objects of size 'unit_size'.
> +         * All accesses are done with interrupts disabled and 'active' counter
> +         * protection with __llist_add() and __llist_del_first().
> +         */
> +        struct llist_head free_llist;
> +        local_t active;
> +
> +        /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
> +         * are sequenced by per-cpu 'active' counter. But unit_free() cannot
> +         * fail. When 'active' is busy the unit_free() will add an object to
> +         * free_llist_extra.
> +         */
> +        struct llist_head free_llist_extra;
> +
> +        /* kmem_cache != NULL when bpf_mem_alloc was created for specific
> +         * element size.
> +         */
> +        struct kmem_cache *kmem_cache;
> +        struct irq_work refill_work;
> +        struct obj_cgroup *objcg;
> +        int unit_size;
> +        /* count of objects in free_llist */
> +        int free_cnt;
> +};
> +
> +struct bpf_mem_caches {
> +        struct bpf_mem_cache cache[NUM_CACHES];
> +};
> +

Could we now also completely get rid of the current map prealloc infra (pcpu_freelist*
I mean), and replace it with above variant altogether? Would be nice to make it work
for this case, too, and then get rid of percpu_freelist.{h,c} .. it's essentially a
superset wrt functionality iiuc?

Thanks,
Daniel
On Mon, Aug 29, 2022 at 2:59 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 8/26/22 4:44 AM, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Tracing BPF programs can attach to kprobe and fentry. Hence they
> > run in unknown context where calling plain kmalloc() might not be safe.
> >
> > Front-end kmalloc() with minimal per-cpu cache of free elements.
> > Refill this cache asynchronously from irq_work.
> >
> > BPF programs always run with migration disabled.
> > It's safe to allocate from cache of the current cpu with irqs disabled.
> > Free-ing is always done into bucket of the current cpu as well.
> > irq_work trims extra free elements from buckets with kfree
> > and refills them with kmalloc, so global kmalloc logic takes care
> > of freeing objects allocated by one cpu and freed on another.
> >
> > struct bpf_mem_alloc supports two modes:
> > - When size != 0 create kmem_cache and bpf_mem_cache for each cpu.
> >   This is typical bpf hash map use case when all elements have equal size.
> > - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
> >   kmalloc/kfree. Max allocation size is 4096 in this case.
> >   This is bpf_dynptr and bpf_kptr use case.
> >
> > bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree.
> > bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free.
> >
> > The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general.
> >
> > Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/bpf_mem_alloc.h |  26 ++
> >  kernel/bpf/Makefile           |   2 +-
> >  kernel/bpf/memalloc.c         | 476 ++++++++++++++++++++++++++++++++++
> >  3 files changed, 503 insertions(+), 1 deletion(-)
> >  create mode 100644 include/linux/bpf_mem_alloc.h
> >  create mode 100644 kernel/bpf/memalloc.c
> >
> [...]
> > +#define NUM_CACHES 11
> > +
> > +struct bpf_mem_cache {
> > +        /* per-cpu list of free objects of size 'unit_size'.
> > +         * All accesses are done with interrupts disabled and 'active' counter
> > +         * protection with __llist_add() and __llist_del_first().
> > +         */
> > +        struct llist_head free_llist;
> > +        local_t active;
> > +
> > +        /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
> > +         * are sequenced by per-cpu 'active' counter. But unit_free() cannot
> > +         * fail. When 'active' is busy the unit_free() will add an object to
> > +         * free_llist_extra.
> > +         */
> > +        struct llist_head free_llist_extra;
> > +
> > +        /* kmem_cache != NULL when bpf_mem_alloc was created for specific
> > +         * element size.
> > +         */
> > +        struct kmem_cache *kmem_cache;
> > +        struct irq_work refill_work;
> > +        struct obj_cgroup *objcg;
> > +        int unit_size;
> > +        /* count of objects in free_llist */
> > +        int free_cnt;
> > +};
> > +
> > +struct bpf_mem_caches {
> > +        struct bpf_mem_cache cache[NUM_CACHES];
> > +};
> > +
>
> Could we now also completely get rid of the current map prealloc infra (pcpu_freelist*
> I mean), and replace it with above variant altogether? Would be nice to make it work
> for this case, too, and then get rid of percpu_freelist.{h,c} .. it's essentially a
> superset wrt functionality iiuc?

Eventually it would be possible to get rid of prealloc logic
completely, but not so fast. LRU map needs to be converted first.
Then a lot of production testing is necessary to gain confidence
and make sure we didn't miss any corner cases.
On Thu, Aug 25, 2022 at 07:44:16PM -0700, Alexei Starovoitov wrote:
> +/* Mostly runs from irq_work except __init phase. */
> +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
> +{
> +        struct mem_cgroup *memcg = NULL, *old_memcg;
> +        unsigned long flags;
> +        void *obj;
> +        int i;
> +
> +        memcg = get_memcg(c);
> +        old_memcg = set_active_memcg(memcg);
> +        for (i = 0; i < cnt; i++) {
> +                obj = __alloc(c, node);
> +                if (!obj)
> +                        break;
> +                if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +                        /* In RT irq_work runs in per-cpu kthread, so disable
> +                         * interrupts to avoid preemption and interrupts and
> +                         * reduce the chance of bpf prog executing on this cpu
> +                         * when active counter is busy.
> +                         */
> +                        local_irq_save(flags);
> +                if (local_inc_return(&c->active) == 1) {
Is it because it is always '== 1' here so that there is
no need to free the obj when it is '!= 1' ?

> +                        __llist_add(obj, &c->free_llist);
> +                        c->free_cnt++;
> +                }
> +                local_dec(&c->active);
> +                if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +                        local_irq_restore(flags);
> +        }
> +        set_active_memcg(old_memcg);
> +        mem_cgroup_put(memcg);
> +}
On Mon, Aug 29, 2022 at 3:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Thu, Aug 25, 2022 at 07:44:16PM -0700, Alexei Starovoitov wrote:
> > +/* Mostly runs from irq_work except __init phase. */
> > +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
> > +{
> > +        struct mem_cgroup *memcg = NULL, *old_memcg;
> > +        unsigned long flags;
> > +        void *obj;
> > +        int i;
> > +
> > +        memcg = get_memcg(c);
> > +        old_memcg = set_active_memcg(memcg);
> > +        for (i = 0; i < cnt; i++) {
> > +                obj = __alloc(c, node);
> > +                if (!obj)
> > +                        break;
> > +                if (IS_ENABLED(CONFIG_PREEMPT_RT))
> > +                        /* In RT irq_work runs in per-cpu kthread, so disable
> > +                         * interrupts to avoid preemption and interrupts and
> > +                         * reduce the chance of bpf prog executing on this cpu
> > +                         * when active counter is busy.
> > +                         */
> > +                        local_irq_save(flags);
> > +                if (local_inc_return(&c->active) == 1) {
> Is it because it is always '== 1' here so that there is
> no need to free the obj when it is '!= 1' ?

Great catch. It's a bug indeed.
On Tue, 30 Aug 2022 at 00:43, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Aug 29, 2022 at 3:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Thu, Aug 25, 2022 at 07:44:16PM -0700, Alexei Starovoitov wrote:
> > > +/* Mostly runs from irq_work except __init phase. */
> > > +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
> > > +{
> > > +        struct mem_cgroup *memcg = NULL, *old_memcg;
> > > +        unsigned long flags;
> > > +        void *obj;
> > > +        int i;
> > > +
> > > +        memcg = get_memcg(c);
> > > +        old_memcg = set_active_memcg(memcg);
> > > +        for (i = 0; i < cnt; i++) {
> > > +                obj = __alloc(c, node);
> > > +                if (!obj)
> > > +                        break;
> > > +                if (IS_ENABLED(CONFIG_PREEMPT_RT))
> > > +                        /* In RT irq_work runs in per-cpu kthread, so disable
> > > +                         * interrupts to avoid preemption and interrupts and
> > > +                         * reduce the chance of bpf prog executing on this cpu
> > > +                         * when active counter is busy.
> > > +                         */
> > > +                        local_irq_save(flags);
> > > +                if (local_inc_return(&c->active) == 1) {
> > Is it because it is always '== 1' here so that there is
> > no need to free the obj when it is '!= 1' ?
>
> Great catch. It's a bug indeed.

Is it a bug? It seems it will always be true for alloc_bulk. IIUC it
is only needed to prevent NMI's unit_alloc, unit_free touching
free_llist, so that NMI llist_adds atomically to free_llist_nmi
instead. Since this runs from irq_work, we run exclusive to other
unit_free, otherwise the __llist_add wouldn't be safe either.
unit_free already does local_irq_save.
On Mon, Aug 29, 2022 at 4:00 PM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> On Tue, 30 Aug 2022 at 00:43, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Aug 29, 2022 at 3:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > On Thu, Aug 25, 2022 at 07:44:16PM -0700, Alexei Starovoitov wrote:
> > > > +/* Mostly runs from irq_work except __init phase. */
> > > > +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
> > > > +{
> > > > +        struct mem_cgroup *memcg = NULL, *old_memcg;
> > > > +        unsigned long flags;
> > > > +        void *obj;
> > > > +        int i;
> > > > +
> > > > +        memcg = get_memcg(c);
> > > > +        old_memcg = set_active_memcg(memcg);
> > > > +        for (i = 0; i < cnt; i++) {
> > > > +                obj = __alloc(c, node);
> > > > +                if (!obj)
> > > > +                        break;
> > > > +                if (IS_ENABLED(CONFIG_PREEMPT_RT))
> > > > +                        /* In RT irq_work runs in per-cpu kthread, so disable
> > > > +                         * interrupts to avoid preemption and interrupts and
> > > > +                         * reduce the chance of bpf prog executing on this cpu
> > > > +                         * when active counter is busy.
> > > > +                         */
> > > > +                        local_irq_save(flags);
> > > > +                if (local_inc_return(&c->active) == 1) {
> > > Is it because it is always '== 1' here so that there is
> > > no need to free the obj when it is '!= 1' ?
> >
> > Great catch. It's a bug indeed.
>
> Is it a bug? It seems it will always be true for alloc_bulk. IIUC it
> is only needed to prevent NMI's unit_alloc, unit_free touching
> free_llist, so that NMI llist_adds atomically to free_llist_nmi
> instead. Since this runs from irq_work, we run exclusive to other
> unit_free, otherwise the __llist_add wouldn't be safe either.
> unit_free already does local_irq_save.

Correct. It cannot happen in practice, but the code as written
will look 'buggy' to any static analyzer. So we have to 'fix' it.
I'm thinking to do:
WARN_ON_ONCE(local_inc_return(&c->active) != 1);
and the same in free_bulk.
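A sketch of what that change could look like in the quoted alloc_bulk(); this is the quoted code with the proposed WARN_ON_ONCE() folded in, not a quote of the eventual respin:

/* Sketch only.  Because alloc_bulk() runs from irq_work (or __init) and
 * unit_alloc()/unit_free() only bump 'active' with IRQs disabled on this
 * CPU, the counter cannot be observed as anything but 1 here, so the
 * branch that would have leaked 'obj' on the impossible path becomes an
 * explicit invariant check.  free_bulk() would get the same treatment.
 */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
{
        struct mem_cgroup *memcg = NULL, *old_memcg;
        unsigned long flags;
        void *obj;
        int i;

        memcg = get_memcg(c);
        old_memcg = set_active_memcg(memcg);
        for (i = 0; i < cnt; i++) {
                obj = __alloc(c, node);
                if (!obj)
                        break;
                if (IS_ENABLED(CONFIG_PREEMPT_RT))
                        /* In RT irq_work runs in per-cpu kthread, so disable
                         * interrupts to avoid preemption and interrupts and
                         * reduce the chance of bpf prog executing on this cpu
                         * when active counter is busy.
                         */
                        local_irq_save(flags);
                /* Cannot nest inside unit_alloc()/unit_free() here, so
                 * 'active' must be 1; warn instead of silently dropping obj.
                 */
                WARN_ON_ONCE(local_inc_return(&c->active) != 1);
                __llist_add(obj, &c->free_llist);
                c->free_cnt++;
                local_dec(&c->active);
                if (IS_ENABLED(CONFIG_PREEMPT_RT))
                        local_irq_restore(flags);
        }
        set_active_memcg(old_memcg);
        mem_cgroup_put(memcg);
}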
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h new file mode 100644 index 000000000000..804733070f8d --- /dev/null +++ b/include/linux/bpf_mem_alloc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_MEM_ALLOC_H +#define _BPF_MEM_ALLOC_H +#include <linux/compiler_types.h> + +struct bpf_mem_cache; +struct bpf_mem_caches; + +struct bpf_mem_alloc { + struct bpf_mem_caches __percpu *caches; + struct bpf_mem_cache __percpu *cache; +}; + +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); + +/* kmalloc/kfree equivalent: */ +void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); +void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); + +/* kmem_cache_alloc/free equivalent: */ +void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); +void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); + +#endif /* _BPF_MEM_ALLOC_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 00e05b69a4df..341c94f208f4 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c new file mode 100644 index 000000000000..29f340016e9e --- /dev/null +++ b/kernel/bpf/memalloc.c @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include <linux/mm.h> +#include <linux/llist.h> +#include <linux/bpf.h> +#include <linux/irq_work.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/memcontrol.h> +#include <asm/local.h> + +/* Any context (including NMI) BPF specific memory allocator. + * + * Tracing BPF programs can attach to kprobe and fentry. Hence they + * run in unknown context where calling plain kmalloc() might not be safe. + * + * Front-end kmalloc() with per-cpu per-bucket cache of free elements. + * Refill this cache asynchronously from irq_work. + * + * CPU_0 buckets + * 16 32 64 96 128 196 256 512 1024 2048 4096 + * ... + * CPU_N buckets + * 16 32 64 96 128 196 256 512 1024 2048 4096 + * + * The buckets are prefilled at the start. + * BPF programs always run with migration disabled. + * It's safe to allocate from cache of the current cpu with irqs disabled. + * Free-ing is always done into bucket of the current cpu as well. + * irq_work trims extra free elements from buckets with kfree + * and refills them with kmalloc, so global kmalloc logic takes care + * of freeing objects allocated by one cpu and freed on another. + * + * Every allocated objected is padded with extra 8 bytes that contains + * struct llist_node. 
+ */ +#define LLIST_NODE_SZ sizeof(struct llist_node) + +/* similar to kmalloc, but sizeof == 8 bucket is gone */ +static u8 size_index[24] __ro_after_init = { + 3, /* 8 */ + 3, /* 16 */ + 4, /* 24 */ + 4, /* 32 */ + 5, /* 40 */ + 5, /* 48 */ + 5, /* 56 */ + 5, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 6, /* 104 */ + 6, /* 112 */ + 6, /* 120 */ + 6, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static int bpf_mem_cache_idx(size_t size) +{ + if (!size || size > 4096) + return -1; + + if (size <= 192) + return size_index[(size - 1) / 8] - 1; + + return fls(size - 1) - 1; +} + +#define NUM_CACHES 11 + +struct bpf_mem_cache { + /* per-cpu list of free objects of size 'unit_size'. + * All accesses are done with interrupts disabled and 'active' counter + * protection with __llist_add() and __llist_del_first(). + */ + struct llist_head free_llist; + local_t active; + + /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill + * are sequenced by per-cpu 'active' counter. But unit_free() cannot + * fail. When 'active' is busy the unit_free() will add an object to + * free_llist_extra. + */ + struct llist_head free_llist_extra; + + /* kmem_cache != NULL when bpf_mem_alloc was created for specific + * element size. + */ + struct kmem_cache *kmem_cache; + struct irq_work refill_work; + struct obj_cgroup *objcg; + int unit_size; + /* count of objects in free_llist */ + int free_cnt; +}; + +struct bpf_mem_caches { + struct bpf_mem_cache cache[NUM_CACHES]; +}; + +static struct llist_node notrace *__llist_del_first(struct llist_head *head) +{ + struct llist_node *entry, *next; + + entry = head->first; + if (!entry) + return NULL; + next = entry->next; + head->first = next; + return entry; +} + +#define BATCH 48 +#define LOW_WATERMARK 32 +#define HIGH_WATERMARK 96 +/* Assuming the average number of elements per bucket is 64, when all buckets + * are used the total memory will be: 64*16*32 + 64*32*32 + 64*64*32 + ... + + * 64*4096*32 ~ 20Mbyte + */ + +static void *__alloc(struct bpf_mem_cache *c, int node) +{ + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. + */ + gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; + + if (c->kmem_cache) + return kmem_cache_alloc_node(c->kmem_cache, flags, node); + + return kmalloc_node(c->unit_size, flags, node); +} + +static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) +{ +#ifdef CONFIG_MEMCG_KMEM + if (c->objcg) + return get_mem_cgroup_from_objcg(c->objcg); +#endif + +#ifdef CONFIG_MEMCG + return root_mem_cgroup; +#else + return NULL; +#endif +} + +/* Mostly runs from irq_work except __init phase. */ +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) +{ + struct mem_cgroup *memcg = NULL, *old_memcg; + unsigned long flags; + void *obj; + int i; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + for (i = 0; i < cnt; i++) { + obj = __alloc(c, node); + if (!obj) + break; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* In RT irq_work runs in per-cpu kthread, so disable + * interrupts to avoid preemption and interrupts and + * reduce the chance of bpf prog executing on this cpu + * when active counter is busy. 
+ */ + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + __llist_add(obj, &c->free_llist); + c->free_cnt++; + } + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + } + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +} + +static void free_one(struct bpf_mem_cache *c, void *obj) +{ + if (c->kmem_cache) + kmem_cache_free(c->kmem_cache, obj); + else + kfree(obj); +} + +static void free_bulk(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + unsigned long flags; + int cnt; + + do { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + else + cnt = 0; + } + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + free_one(c, llnode); + } while (cnt > (HIGH_WATERMARK + LOW_WATERMARK) / 2); + + /* and drain free_llist_extra */ + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + free_one(c, llnode); +} + +static void bpf_mem_refill(struct irq_work *work) +{ + struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); + int cnt; + + /* Racy access to free_cnt. It doesn't need to be 100% accurate */ + cnt = c->free_cnt; + if (cnt < LOW_WATERMARK) + /* irq_work runs on this cpu and kmalloc will allocate + * from the current numa node which is what we want here. + */ + alloc_bulk(c, BATCH, NUMA_NO_NODE); + else if (cnt > HIGH_WATERMARK) + free_bulk(c); +} + +static void notrace irq_work_raise(struct bpf_mem_cache *c) +{ + irq_work_queue(&c->refill_work); +} + +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ + init_irq_work(&c->refill_work, bpf_mem_refill); + /* To avoid consuming memory assume that 1st run of bpf + * prog won't be doing more than 4 map_update_elem from + * irq disabled region + */ + alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); +} + +/* When size != 0 create kmem_cache and bpf_mem_cache for each cpu. + * This is typical bpf hash map use case when all elements have equal size. + * + * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on + * kmalloc/kfree. Max allocation size is 4096 in this case. + * This is bpf_dynptr and bpf_kptr use case. 
+ */ +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size) +{ + static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + struct bpf_mem_caches *cc, __percpu *pcc; + struct bpf_mem_cache *c, __percpu *pc; + struct kmem_cache *kmem_cache; + struct obj_cgroup *objcg = NULL; + char buf[32]; + int cpu, i; + + if (size) { + pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); + if (!pc) + return -ENOMEM; + size += LLIST_NODE_SZ; /* room for llist_node */ + snprintf(buf, sizeof(buf), "bpf-%u", size); + kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL); + if (!kmem_cache) { + free_percpu(pc); + return -ENOMEM; + } +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(pc, cpu); + c->kmem_cache = kmem_cache; + c->unit_size = size; + c->objcg = objcg; + prefill_mem_cache(c, cpu); + } + ma->cache = pc; + return 0; + } + + pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->unit_size = sizes[i]; + c->objcg = objcg; + prefill_mem_cache(c, cpu); + } + } + ma->caches = pcc; + return 0; +} + +static void drain_mem_cache(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + free_one(c, llnode); +} + +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + drain_mem_cache(c); + } + /* kmem_cache and memcg are the same across cpus */ + kmem_cache_destroy(c->kmem_cache); + if (c->objcg) + obj_cgroup_put(c->objcg); + free_percpu(ma->cache); + ma->cache = NULL; + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + drain_mem_cache(c); + } + } + if (c->objcg) + obj_cgroup_put(c->objcg); + free_percpu(ma->caches); + ma->caches = NULL; + } +} + +/* notrace is necessary here and in other functions to make sure + * bpf programs cannot attach to them and cause llist corruptions. + */ +static void notrace *unit_alloc(struct bpf_mem_cache *c) +{ + struct llist_node *llnode = NULL; + unsigned long flags; + int cnt = 0; + + /* Disable irqs to prevent the following race for majority of prog types: + * prog_A + * bpf_mem_alloc + * preemption or irq -> prog_B + * bpf_mem_alloc + * + * but prog_B could be a perf_event NMI prog. + * Use per-cpu 'active' counter to order free_list access between + * unit_alloc/unit_free/bpf_mem_refill. + */ + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + } + local_dec(&c->active); + local_irq_restore(flags); + + WARN_ON(cnt < 0); + + if (cnt < LOW_WATERMARK) + irq_work_raise(c); + return llnode; +} + +/* Though 'ptr' object could have been allocated on a different cpu + * add it to the free_llist of the current cpu. + * Let kfree() logic deal with it when it's later called from irq_work. 
+ */ +static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + int cnt = 0; + + BUILD_BUG_ON(LLIST_NODE_SZ > 8); + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + __llist_add(llnode, &c->free_llist); + cnt = ++c->free_cnt; + } else { + /* unit_free() cannot fail. Therefore add an object to atomic + * llist. free_bulk() will drain it. Though free_llist_extra is + * a per-cpu list we have to use atomic llist_add here, since + * it also can be interrupted by bpf nmi prog that does another + * unit_free() into the same free_llist_extra. + */ + llist_add(llnode, &c->free_llist_extra); + } + local_dec(&c->active); + local_irq_restore(flags); + + if (cnt > HIGH_WATERMARK) + /* free few objects from current cpu into global kmalloc pool */ + irq_work_raise(c); +} + +/* Called from BPF program or from sys_bpf syscall. + * In both cases migration is disabled. + */ +void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) +{ + int idx; + void *ret; + + if (!size) + return ZERO_SIZE_PTR; + + idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (idx < 0) + return NULL; + + ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ)); + if (idx < 0) + return; + + unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + +void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) +{ + void *ret; + + ret = unit_alloc(this_cpu_ptr(ma->cache)); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free(this_cpu_ptr(ma->cache), ptr); +}
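For reference, a small usage sketch of the interface declared in bpf_mem_alloc.h above. The my_map structure and its callbacks are made up purely for illustration; the bpf_mem_alloc_* calls follow the declarations in the patch, and real users (such as the hashtab conversion) come later in the series.

#include <linux/bpf.h>
#include <linux/bpf_mem_alloc.h>

/* Hypothetical map backend with fixed-size elements, i.e. the
 * "size != 0" mode that creates one kmem_cache plus per-cpu caches.
 */
struct my_map {
        struct bpf_mem_alloc ma;
        u32 elem_size;
};

static int my_map_init(struct my_map *m)
{
        /* Room for the internal llist_node header is added by the
         * allocator itself; callers pass only the element size.
         */
        return bpf_mem_alloc_init(&m->ma, m->elem_size);
}

/* Called from a BPF program or the syscall path, i.e. with migration
 * disabled.  May return NULL when the per-cpu cache is empty and the
 * irq_work refill has not caught up yet.
 */
static void *my_map_alloc_elem(struct my_map *m)
{
        return bpf_mem_cache_alloc(&m->ma);
}

static void my_map_free_elem(struct my_map *m, void *elem)
{
        /* Goes back into the current CPU's cache; irq_work trims excess. */
        bpf_mem_cache_free(&m->ma, elem);
}

static void my_map_free(struct my_map *m)
{
        /* Drains and frees all per-cpu caches and the kmem_cache. */
        bpf_mem_alloc_destroy(&m->ma);
}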