
[bpf] bpf: Support for setting numa node in bpf memory allocator

Message ID 20221020142247.1682009-1-houtao@huaweicloud.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf] bpf: Support for setting numa node in bpf memory allocator

Checks

Context Check Description
bpf/vmtest-bpf-PR fail PR summary
bpf/vmtest-bpf-VM_Test-1 pending Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain }}
bpf/vmtest-bpf-VM_Test-2 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-VM_Test-3 fail Logs for build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-4 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-5 success Logs for llvm-toolchain
bpf/vmtest-bpf-VM_Test-6 success Logs for set-matrix
netdev/tree_selection success Clearly marked for bpf
netdev/fixes_present success Fixes tag present in non-next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 15 this patch: 15
netdev/cc_maintainers fail 1 blamed authors not CCed: memxor@gmail.com; 1 maintainers not CCed: memxor@gmail.com
netdev/build_clang success Errors and warnings before: 5 this patch: 5
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 15 this patch: 15
netdev/checkpatch warning WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 2

Commit Message

Hou Tao Oct. 20, 2022, 2:22 p.m. UTC
From: Hou Tao <houtao1@huawei.com>

Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."), the
numa node setting for non-preallocated hash tables is ignored. The reason
is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
is trivial to support numa node setting for bpf memory allocator.

So add support for setting the numa node in the bpf memory allocator and
update the hash map accordingly.

Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf_mem_alloc.h |  3 ++-
 kernel/bpf/hashtab.c          |  6 +++--
 kernel/bpf/memalloc.c         | 50 ++++++++++++++++++++++++++++-------
 3 files changed, 46 insertions(+), 13 deletions(-)

Comments

Hao Luo Oct. 20, 2022, 6:01 p.m. UTC | #1
On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> From: Hou Tao <houtao1@huawei.com>
>
> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
> numa node setting for non-preallocated hash table is ignored. The reason
> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
> is trivial to support numa node setting for bpf memory allocator.
>
> So adding support for setting numa node in bpf memory allocator and
> updating hash map accordingly.
>
> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---

Looks good to me with a few nits.

>
<...>
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index fc116cf47d24..44c531ba9534 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
<...>
> +static inline bool is_valid_numa_node(int numa_node, bool percpu)
> +{
> +       return numa_node == NUMA_NO_NODE ||
> +              (!percpu && (unsigned int)numa_node < nr_node_ids);

Maybe also check node_online? There is a similar helper function in
kernel/bpf/syscall.c.
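
For example, the combined check might look roughly like this (sketch only,
untested; the patch already pulls in <linux/nodemask.h>, so node_online()
would be available):

static inline bool is_valid_numa_node(int numa_node, bool percpu)
{
	/* NUMA_NO_NODE is always acceptable; an explicit node must be in
	 * range and online, and the per-cpu allocator only accepts
	 * NUMA_NO_NODE.
	 */
	return numa_node == NUMA_NO_NODE ||
	       (!percpu && (unsigned int)numa_node < nr_node_ids &&
		node_online(numa_node));
}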

It may help debugging if we could log the reason here, for example,
PERCPU map but with numa_node specified.

> +}
> +
> +/* The initial prefill is running in the context of map creation process, so
> + * if the preferred numa node is NUMA_NO_NODE, needs to use numa node of the
> + * specific cpu instead.
> + */
> +static inline int get_prefill_numa_node(int numa_node, int cpu)
> +{
> +       int prefill_numa_node;
> +
> +       if (numa_node == NUMA_NO_NODE)
> +               prefill_numa_node = cpu_to_node(cpu);
> +       else
> +               prefill_numa_node = numa_node;
> +       return prefill_numa_node;
>  }

nit: an alternative implementation is

 return numa_node == NUMA_NO_NODE ? cpu_to_node(cpu) : numa_node;

>
>  /* When size != 0 bpf_mem_cache for each cpu.
> @@ -359,13 +383,17 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
>   * kmalloc/kfree. Max allocation size is 4096 in this case.
>   * This is bpf_dynptr and bpf_kptr use case.
>   */

We added a parameter to this function, I think it is worth mentioning
the 'numa_node' argument's behavior under different values of
'percpu'.

> -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
> +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
> +                      bool percpu)
>  {
<...>
> --
> 2.29.2
>
Hou Tao Oct. 21, 2022, 1:43 a.m. UTC | #2
Hi,

On 10/21/2022 2:01 AM, Hao Luo wrote:
> On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> From: Hou Tao <houtao1@huawei.com>
>>
>> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
>> numa node setting for non-preallocated hash table is ignored. The reason
>> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>> is trivial to support numa node setting for bpf memory allocator.
>>
>> So adding support for setting numa node in bpf memory allocator and
>> updating hash map accordingly.
>>
>> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>> ---
> Looks good to me with a few nits.
>
> <...>
>> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
>> index fc116cf47d24..44c531ba9534 100644
>> --- a/kernel/bpf/memalloc.c
>> +++ b/kernel/bpf/memalloc.c
> <...>
>> +static inline bool is_valid_numa_node(int numa_node, bool percpu)
>> +{
>> +       return numa_node == NUMA_NO_NODE ||
>> +              (!percpu && (unsigned int)numa_node < nr_node_ids);
> Maybe also check node_online? There is a similar helper function in
> kernel/bpf/syscall.c.
Will factor out as a helper function and use it in bpf memory allocator in v2.
>
> It may help debugging if we could log the reason here, for example,
> PERCPU map but with numa_node specified.
Not sure about it, because the validity check must have been done in map related
code.

>
>> +}
>> +
>> +/* The initial prefill is running in the context of map creation process, so
>> + * if the preferred numa node is NUMA_NO_NODE, needs to use numa node of the
>> + * specific cpu instead.
>> + */
>> +static inline int get_prefill_numa_node(int numa_node, int cpu)
>> +{
>> +       int prefill_numa_node;
>> +
>> +       if (numa_node == NUMA_NO_NODE)
>> +               prefill_numa_node = cpu_to_node(cpu);
>> +       else
>> +               prefill_numa_node = numa_node;
>> +       return prefill_numa_node;
>>  }
> nit: an alternative implementation is
>
>  return numa_node == NUMA_NO_NODE ? cpu_to_node(cpu) : numa_node;
It is shorter and better. Will do it in v2.
>
>>  /* When size != 0 bpf_mem_cache for each cpu.
>> @@ -359,13 +383,17 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
>>   * kmalloc/kfree. Max allocation size is 4096 in this case.
>>   * This is bpf_dynptr and bpf_kptr use case.
>>   */
> We added a parameter to this function, I think it is worth mentioning
> the 'numa_node' argument's behavior under different values of
> 'percpu'.
How about the following comments ?

 * For per-cpu allocator (percpu=true), the only valid value of numa_node is
 * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
 * preferred memory allocation node is the numa node where the allocating CPU
 * is located, else the preferred node is the specified numa_node.

>
>> -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
>> +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
>> +                      bool percpu)
>>  {
> <...>
>> --
>> 2.29.2
>>
Alexei Starovoitov Oct. 21, 2022, 1:48 a.m. UTC | #3
On Fri, Oct 21, 2022 at 09:43:08AM +0800, Hou Tao wrote:
> Hi,
> 
> On 10/21/2022 2:01 AM, Hao Luo wrote:
> > On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >> From: Hou Tao <houtao1@huawei.com>
> >>
> >> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
> >> numa node setting for non-preallocated hash table is ignored. The reason
> >> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
> >> is trivial to support numa node setting for bpf memory allocator.
> >>
> >> So adding support for setting numa node in bpf memory allocator and
> >> updating hash map accordingly.
> >>
> >> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
> >> Signed-off-by: Hou Tao <houtao1@huawei.com>
> >> ---
> > Looks good to me with a few nits.
> >
> > <...>
> >> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> >> index fc116cf47d24..44c531ba9534 100644
> >> --- a/kernel/bpf/memalloc.c
> >> +++ b/kernel/bpf/memalloc.c
> > <...>
> >> +static inline bool is_valid_numa_node(int numa_node, bool percpu)
> >> +{
> >> +       return numa_node == NUMA_NO_NODE ||
> >> +              (!percpu && (unsigned int)numa_node < nr_node_ids);
> > Maybe also check node_online? There is a similar helper function in
> > kernel/bpf/syscall.c.
> Will factor out as a helper function and use it in bpf memory allocator in v2.
> >
> > It may help debugging if we could log the reason here, for example,
> > PERCPU map but with numa_node specified.
> Not sure about it, because the validity check must have been done in map related
> code.
> 
> >
> >> +}
> >> +
> >> +/* The initial prefill is running in the context of map creation process, so
> >> + * if the preferred numa node is NUMA_NO_NODE, needs to use numa node of the
> >> + * specific cpu instead.
> >> + */
> >> +static inline int get_prefill_numa_node(int numa_node, int cpu)
> >> +{
> >> +       int prefill_numa_node;
> >> +
> >> +       if (numa_node == NUMA_NO_NODE)
> >> +               prefill_numa_node = cpu_to_node(cpu);
> >> +       else
> >> +               prefill_numa_node = numa_node;
> >> +       return prefill_numa_node;
> >>  }
> > nit: an alternative implementation is
> >
> >  return numa_node == NUMA_NO_NODE ? cpu_to_node(cpu) : numa_node;
> It is shorter and better. Will do it in v2.
> >
> >>  /* When size != 0 bpf_mem_cache for each cpu.
> >> @@ -359,13 +383,17 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
> >>   * kmalloc/kfree. Max allocation size is 4096 in this case.
> >>   * This is bpf_dynptr and bpf_kptr use case.
> >>   */
> > We added a parameter to this function, I think it is worth mentioning
> > the 'numa_node' argument's behavior under different values of
> > 'percpu'.
> How about the following comments ?
> 
>  * For per-cpu allocator (percpu=true), the only valid value of numa_node is
>  * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
>  * preferred memory allocation node is the numa node where the allocating CPU
>  * is located, else the preferred node is the specified numa_node.

No. This patch doesn't make sense to me.
As far as I can see it can only make things worse.
Why would you want a cpu to use non local memory?

The commit log:
" is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
  is trivial to support numa node setting for bpf memory allocator."
got it wrong.

See the existing comment:
                /* irq_work runs on this cpu and kmalloc will allocate
                 * from the current numa node which is what we want here.
                 */
                alloc_bulk(c, c->batch, NUMA_NO_NODE);
Hou Tao Oct. 21, 2022, 2:06 a.m. UTC | #4
Hi,

On 10/21/2022 9:48 AM, Alexei Starovoitov wrote:
> On Fri, Oct 21, 2022 at 09:43:08AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 10/21/2022 2:01 AM, Hao Luo wrote:
>>> On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>>> From: Hou Tao <houtao1@huawei.com>
>>>>
>>>> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
>>>> numa node setting for non-preallocated hash table is ignored. The reason
>>>> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>>>> is trivial to support numa node setting for bpf memory allocator.
>>>>
>>>> So adding support for setting numa node in bpf memory allocator and
>>>> updating hash map accordingly.
>>>>
>>>> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
>>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>>>> ---
SNIP
>> How about the following comments ?
>>
>>  * For per-cpu allocator (percpu=true), the only valid value of numa_node is
>>  * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
>>  * preferred memory allocation node is the numa node where the allocating CPU
>>  * is located, else the preferred node is the specified numa_node.
> No. This patch doesn't make sense to me.
> As far as I can see it can only make things worse.
> Why would you want a cpu to use non local memory?
For pre-allocated hash table, the numa node setting is honored. And I think the
reason is that there are bpf progs which are pinned on specific CPUs or numa
nodes and accessing local memory will be good for performance. And in my
understanding, the bpf memory allocator is trying to replace pre-allocated hash
table to save memory, if the numa node setting is ignored, the above use cases
may work badly. Also I am trying to test whether or not there is a visible
performance improvement for the above assumed use case.

>
> The commit log:
> " is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>   is trivial to support numa node setting for bpf memory allocator."
> got it wrong.
>
> See the existing comment:
>                 /* irq_work runs on this cpu and kmalloc will allocate
>                  * from the current numa node which is what we want here.
>                  */
>                 alloc_bulk(c, c->batch, NUMA_NO_NODE);
Alexei Starovoitov Oct. 21, 2022, 2:09 a.m. UTC | #5
On Thu, Oct 20, 2022 at 7:06 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 10/21/2022 9:48 AM, Alexei Starovoitov wrote:
> > On Fri, Oct 21, 2022 at 09:43:08AM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 10/21/2022 2:01 AM, Hao Luo wrote:
> >>> On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >>>> From: Hou Tao <houtao1@huawei.com>
> >>>>
> >>>> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
> >>>> numa node setting for non-preallocated hash table is ignored. The reason
> >>>> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
> >>>> is trivial to support numa node setting for bpf memory allocator.
> >>>>
> >>>> So adding support for setting numa node in bpf memory allocator and
> >>>> updating hash map accordingly.
> >>>>
> >>>> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
> >>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
> >>>> ---
> SNIP
> >> How about the following comments ?
> >>
> >>  * For per-cpu allocator (percpu=true), the only valid value of numa_node is
> >>  * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
> >>  * preferred memory allocation node is the numa node where the allocating CPU
> >>  * is located, else the preferred node is the specified numa_node.
> > No. This patch doesn't make sense to me.
> > As far as I can see it can only make things worse.
> > Why would you want a cpu to use non local memory?
> For pre-allocated hash table, the numa node setting is honored. And I think the
> reason is that there are bpf progs which are pinned on specific CPUs or numa
> nodes and accessing local memory will be good for performance.

prealloc happens at map creation time while
bpf prog might be running on completely different cpu,
so numa is necessary for prealloc.
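
For context: in the preallocated case the whole element array is carved out at
map creation time on the map's configured node, roughly as below (paraphrased
from prealloc_init() in kernel/bpf/hashtab.c; not part of this patch):

	/* elements are allocated up front, long before any bpf prog touches
	 * the map, so the requested numa node is the only placement hint
	 */
	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);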

> And in my
> understanding, the bpf memory allocator is trying to replace pre-allocated hash
> table to save memory, if the numa node setting is ignored, the above use cases
> may work badly. Also I am trying to test whether or not there is a visible
> performance improvement for the above assumed use case.

numa should be ignored, because we don't want users to accidentally
pick wrong numa id.

> >
> > The commit log:
> > " is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
> >   is trivial to support numa node setting for bpf memory allocator."
> > got it wrong.
> >
> > See the existing comment:
> >                 /* irq_work runs on this cpu and kmalloc will allocate
> >                  * from the current numa node which is what we want here.
> >                  */
> >                 alloc_bulk(c, c->batch, NUMA_NO_NODE);
>
Hou Tao Oct. 21, 2022, 2:26 a.m. UTC | #6
Hi,

On 10/21/2022 10:09 AM, Alexei Starovoitov wrote:
> On Thu, Oct 20, 2022 at 7:06 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi,
>>
>> On 10/21/2022 9:48 AM, Alexei Starovoitov wrote:
>>> On Fri, Oct 21, 2022 at 09:43:08AM +0800, Hou Tao wrote:
>>>> Hi,
>>>>
>>>> On 10/21/2022 2:01 AM, Hao Luo wrote:
>>>>> On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>>>>> From: Hou Tao <houtao1@huawei.com>
>>>>>>
>>>>>> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
>>>>>> numa node setting for non-preallocated hash table is ignored. The reason
>>>>>> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>>>>>> is trivial to support numa node setting for bpf memory allocator.
>>>>>>
>>>>>> So adding support for setting numa node in bpf memory allocator and
>>>>>> updating hash map accordingly.
>>>>>>
>>>>>> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
>>>>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>>>>>> ---
>> SNIP
>>>> How about the following comments ?
>>>>
>>>>  * For per-cpu allocator (percpu=true), the only valid value of numa_node is
>>>>  * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
>>>>  * preferred memory allocation node is the numa node where the allocating CPU
>>>>  * is located, else the preferred node is the specified numa_node.
>>> No. This patch doesn't make sense to me.
>>> As far as I can see it can only make things worse.
>>> Why would you want a cpu to use non local memory?
>> For pre-allocated hash table, the numa node setting is honored. And I think the
>> reason is that there are bpf progs which are pinned on specific CPUs or numa
>> nodes and accessing local memory will be good for performance.
> prealloc happens at map creation time while
> bpf prog might be running on completely different cpu,
> so numa is necessary for prealloc.
I see. So for a non-preallocated hash map, the memory will be allocated from the
current NUMA node if possible, and there will be no memory-affinity problems if
these programs are on the same NUMA node.
>
>> And in my
>> understanding, the bpf memory allocator is trying to replace pre-allocated hash
>> table to save memory, if the numa node setting is ignored, the above use cases
>> may work badly. Also I am trying to test whether or not there is a visible
>> performance improvement for the above assumed use case.
> numa should be ignored, because we don't want users to accidentally
> pick wrong numa id.
How about reject the NUMA node setting for non-preallocated hash table in
hashtab.c ?
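
Concretely, that rejection would be a small check in the map-creation path,
along these lines (hypothetical sketch, placement and error code illustrative
only):

	/* e.g. in htab_map_alloc_check(): refuse an explicit numa node when
	 * elements would come from the bpf memory allocator
	 */
	if ((attr->map_flags & BPF_F_NO_PREALLOC) &&
	    (attr->map_flags & BPF_F_NUMA_NODE))
		return -EINVAL;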

>
>>> The commit log:
>>> " is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>>>   is trivial to support numa node setting for bpf memory allocator."
>>> got it wrong.
>>>
>>> See the existing comment:
>>>                 /* irq_work runs on this cpu and kmalloc will allocate
>>>                  * from the current numa node which is what we want here.
>>>                  */
>>>                 alloc_bulk(c, c->batch, NUMA_NO_NODE);
Alexei Starovoitov Oct. 21, 2022, 4:22 a.m. UTC | #7
On Thu, Oct 20, 2022 at 7:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> How about reject the NUMA node setting for non-preallocated hash table in
> hashtab.c ?

It's easy to ask the question, but please answer it yourself.
Analyze the code and describe what you think is happening now
and what should or should not be the behavior.
Hou Tao Oct. 21, 2022, 11:01 a.m. UTC | #8
Hi,

On 10/21/2022 12:22 PM, Alexei Starovoitov wrote:
> On Thu, Oct 20, 2022 at 7:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> How about reject the NUMA node setting for non-preallocated hash table in
>> hashtab.c ?
> It's easy to ask the question, but please answer it yourself.
> Analyze the code and describe what you think is happening now
> and what should or should not be the behavior.
Will do.
Hou Tao Nov. 8, 2022, 2:22 a.m. UTC | #9
Hi,

On 10/21/2022 12:22 PM, Alexei Starovoitov wrote:
> On Thu, Oct 20, 2022 at 7:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> How about reject the NUMA node setting for non-preallocated hash table in
>> hashtab.c ?
> It's easy to ask the question, but please answer it yourself.
> Analyze the code and describe what you think is happening now
> and what should or should not be the behavior.
> .
I found it is a bad idea to reject the numa node setting for non-preallocated
hash tables. The reason is that the hash buckets are still allocated according to
the numa node setting. If the numa node setting were rejected, the use case below
would not work normally, so I will keep it as-is:

1. a non-preallocated hash table is created on numa node 2 (buckets are
allocated from node 2)
2. a bpf program runs on numa node 2 (elements are also allocated from node 2)
3. all of the memory used is allocated from node 2.
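
For reference, the bucket allocation mentioned in step 1 already honors the
configured node today, roughly as in htab_map_alloc() in kernel/bpf/hashtab.c
(paraphrased, not part of this patch):

	/* buckets are placed on the map's numa node */
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);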

Patch

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3e164b8efaa9..5b1e34d6f133 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -14,7 +14,8 @@  struct bpf_mem_alloc {
 	struct work_struct work;
 };
 
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
+		       bool percpu);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
 /* kmalloc/kfree equivalent: */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ed3f8a53603b..34954195841d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -568,12 +568,14 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 				goto free_prealloc;
 		}
 	} else {
-		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size,
+					 htab->map.numa_node, false);
 		if (err)
 			goto free_map_locked;
 		if (percpu) {
 			err = bpf_mem_alloc_init(&htab->pcpu_ma,
-						 round_up(htab->map.value_size, 8), true);
+						 round_up(htab->map.value_size, 8),
+						 htab->map.numa_node, true);
 			if (err)
 				goto free_map_locked;
 		}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index fc116cf47d24..44c531ba9534 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -6,6 +6,7 @@ 
 #include <linux/irq_work.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
+#include <linux/nodemask.h>
 #include <asm/local.h>
 
 /* Any context (including NMI) BPF specific memory allocator.
@@ -98,6 +99,7 @@  struct bpf_mem_cache {
 	int free_cnt;
 	int low_watermark, high_watermark, batch;
 	int percpu_size;
+	int numa_node;
 
 	struct rcu_head rcu;
 	struct llist_head free_by_rcu;
@@ -125,8 +127,8 @@  static void *__alloc(struct bpf_mem_cache *c, int node)
 {
 	/* Allocate, but don't deplete atomic reserves that typical
 	 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
-	 * will allocate from the current numa node which is what we
-	 * want here.
+	 * will allocate from the current numa node if numa_node is
+	 * NUMA_NO_NODE, else will allocate from specific numa_node.
 	 */
 	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
 
@@ -301,9 +303,10 @@  static void bpf_mem_refill(struct irq_work *work)
 	cnt = c->free_cnt;
 	if (cnt < c->low_watermark)
 		/* irq_work runs on this cpu and kmalloc will allocate
-		 * from the current numa node which is what we want here.
+		 * from the current numa node if numa_node is NUMA_NO_NODE,
+		 * else allocate from specific numa_node.
 		 */
-		alloc_bulk(c, c->batch, NUMA_NO_NODE);
+		alloc_bulk(c, c->batch, c->numa_node);
 	else if (cnt > c->high_watermark)
 		free_bulk(c);
 }
@@ -328,7 +331,7 @@  static void notrace irq_work_raise(struct bpf_mem_cache *c)
  * bpf progs can and should share bpf_mem_cache when possible.
  */
 
-static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+static void prefill_mem_cache(struct bpf_mem_cache *c, int node)
 {
 	init_irq_work(&c->refill_work, bpf_mem_refill);
 	if (c->unit_size <= 256) {
@@ -349,7 +352,28 @@  static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 	 * prog won't be doing more than 4 map_update_elem from
 	 * irq disabled region
 	 */
-	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, node);
+}
+
+static inline bool is_valid_numa_node(int numa_node, bool percpu)
+{
+	return numa_node == NUMA_NO_NODE ||
+	       (!percpu && (unsigned int)numa_node < nr_node_ids);
+}
+
+/* The initial prefill is running in the context of map creation process, so
+ * if the preferred numa node is NUMA_NO_NODE, needs to use numa node of the
+ * specific cpu instead.
+ */
+static inline int get_prefill_numa_node(int numa_node, int cpu)
+{
+	int prefill_numa_node;
+
+	if (numa_node == NUMA_NO_NODE)
+		prefill_numa_node = cpu_to_node(cpu);
+	else
+		prefill_numa_node = numa_node;
+	return prefill_numa_node;
 }
 
 /* When size != 0 bpf_mem_cache for each cpu.
@@ -359,13 +383,17 @@  static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
  * kmalloc/kfree. Max allocation size is 4096 in this case.
  * This is bpf_dynptr and bpf_kptr use case.
  */
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
+		       bool percpu)
 {
 	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
 	struct bpf_mem_caches *cc, __percpu *pcc;
+	int cpu, i, unit_size, percpu_size = 0;
 	struct bpf_mem_cache *c, __percpu *pc;
 	struct obj_cgroup *objcg = NULL;
-	int cpu, i, unit_size, percpu_size = 0;
+
+	if (!is_valid_numa_node(numa_node, percpu))
+		return -EINVAL;
 
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -387,7 +415,8 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c->unit_size = unit_size;
 			c->objcg = objcg;
 			c->percpu_size = percpu_size;
-			prefill_mem_cache(c, cpu);
+			c->numa_node = numa_node;
+			prefill_mem_cache(c, get_prefill_numa_node(numa_node, cpu));
 		}
 		ma->cache = pc;
 		return 0;
@@ -409,7 +438,8 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
-			prefill_mem_cache(c, cpu);
+			c->numa_node = numa_node;
+			prefill_mem_cache(c, get_prefill_numa_node(numa_node, cpu));
 		}
 	}
 	ma->caches = pcc;
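
For reference, the numa node discussed above is the one user space requests at
map creation time via BPF_F_NUMA_NODE. A minimal libbpf sketch of such a map
creation (illustrative only, not part of the patch):

#include <bpf/bpf.h>

/* create a non-preallocated hash map bound to numa node 2 */
static int create_numa_bound_htab(void)
{
	LIBBPF_OPTS(bpf_map_create_opts, opts,
		.map_flags = BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE,
		.numa_node = 2,
	);

	return bpf_map_create(BPF_MAP_TYPE_HASH, "numa_htab",
			      sizeof(__u32), sizeof(__u64),
			      1024, &opts);
}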