
[bpf-next,1/2] bpf: add new map ops ->map_pressure

Message ID 20230531110511.64612-2-aspsk@isovalent.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series add mechanism to report map pressure | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-next-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-8 success Logs for veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1732 this patch: 1732
netdev/cc_maintainers warning 10 maintainers not CCed: daniel@iogearbox.net yhs@fb.com kpsingh@kernel.org martin.lau@linux.dev song@kernel.org sdf@google.com andrii@kernel.org jolsa@kernel.org haoluo@google.com ast@kernel.org
netdev/build_clang success Errors and warnings before: 182 this patch: 182
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1731 this patch: 1731
netdev/checkpatch warning CHECK: Unbalanced braces around else statement CHECK: braces {} should be used on all arms of this statement WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Anton Protopopov May 31, 2023, 11:05 a.m. UTC
Add a new map op named ->map_pressure to return a map's "raw pressure". This value
is defined per map type, but in most cases it is simply the number of elements
currently present in the map: the more it grows, the higher the pressure. (For
array-based maps it seems right to set it to 0, i.e., "there's no pressure".)

By analogy with the 'max_entries' field, the pressure value is a u32 integer.
The primary API to read it from userspace is the map's fdinfo proc entry, where
it is reported in the "raw_pressure" field, e.g., for an example map we have

    # echo -e `strace -e bpf,openat,read -s 1024 bpftool map show id 202 2>&1 | grep -A1 '/proc/self/fdinfo' | head -2 | tail -1 | cut -f2 -d' '`
    "pos:   0
    flags:  02000002
    mnt_id: 15
    ino:    18958
    map_type:       1
    key_size:       8
    value_size:     8
    max_entries:    1224
    map_flags:      0x1
    map_extra:      0x0
    memlock:        69632
    raw_pressure:   500
    map_id: 202
    frozen: 0
    ",

On old kernels, and when the ->map_pressure map operation is not defined, the
'raw_pressure' field is absent from the list.

The second way to get the raw_pressure is via BPF_OBJ_GET_INFO_BY_FD, where a
previously unused field in the struct bpf_map_info is now used to return this
value.
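
A minimal userspace sketch of reading this value through BPF_OBJ_GET_INFO_BY_FD,
using libbpf's bpf_obj_get_info_by_fd(); it assumes this patch is applied and a
uapi header that already carries the 'raw_pressure' field:

    #include <stdio.h>
    #include <bpf/bpf.h>

    /* print the raw pressure reported for the map behind map_fd */
    int print_raw_pressure(int map_fd)
    {
            struct bpf_map_info info = {};
            __u32 len = sizeof(info);
            int err;

            err = bpf_obj_get_info_by_fd(map_fd, &info, &len);
            if (err)
                    return err;

            printf("map id %u: raw_pressure %u, max_entries %u\n",
                   info.id, info.raw_pressure, info.max_entries);
            return 0;
    }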

The patch adds a relatively small amount of logic because for most maps the
number of elements was already being computed for the map memory usage API,
just not exported.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
---
 include/linux/bpf.h            |   1 +
 include/uapi/linux/bpf.h       |   2 +-
 kernel/bpf/hashtab.c           | 118 ++++++++++++++++++++-------------
 kernel/bpf/lpm_trie.c          |   8 +++
 kernel/bpf/syscall.c           |  15 +++++
 tools/include/uapi/linux/bpf.h |   2 +-
 6 files changed, 99 insertions(+), 47 deletions(-)

Comments

Alexei Starovoitov May 31, 2023, 6:24 p.m. UTC | #1
On Wed, May 31, 2023 at 11:05:10AM +0000, Anton Protopopov wrote:
>  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  {
>  	htab_put_fd_value(htab, l);
>  
> +	dec_elem_count(htab);
> +
>  	if (htab_is_prealloc(htab)) {
>  		check_and_free_fields(htab, l);
>  		__pcpu_freelist_push(&htab->freelist, &l->fnode);
>  	} else {
> -		dec_elem_count(htab);
>  		htab_elem_free(htab, l);
>  	}
>  }
> @@ -1006,6 +1024,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
>  			if (!l)
>  				return ERR_PTR(-E2BIG);
>  			l_new = container_of(l, struct htab_elem, fnode);
> +			inc_elem_count(htab);

The current use_percpu_counter heuristic is far from perfect. It works for some cases,
but will surely get bad as the comment next to PERCPU_COUNTER_BATCH is trying to say.
Hence, there is a big performance risk doing inc/dec everywhere.
Hence, this is a nack: we cannot decrease performance of various maps for few folks
who want to see map stats.

If you want to see "pressure", please switch cilium to use bpf_mem_alloc htab and
use tracing style direct 'struct bpf_htab' access like progs/map_ptr_kern.c is demonstrating.
No kernel patches needed.
Then bpf_prog_run such tracing prog and read all internal map info.
It's less convenient than exposing things in uapi, but not being uapi is the point.
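
A rough sketch of that access pattern, modeled on progs/map_ptr_kern.c (the
partial struct mirrors below are hand-written assumptions about kernel-internal
layout, relocated via CO-RE/BTF, and the program section simply follows the
selftest; this is illustrative, not a stable interface):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    typedef struct {
            int counter;
    } atomic_t;

    struct bpf_map {
            __u32 max_entries;
    } __attribute__((preserve_access_index));

    struct bpf_htab {
            struct bpf_map map;
            atomic_t count;
    } __attribute__((preserve_access_index));

    struct {
            __uint(type, BPF_MAP_TYPE_HASH);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __uint(max_entries, 1024);
            __type(key, __u32);
            __type(value, __u64);
    } m_hash SEC(".maps");

    __u32 elem_count;

    SEC("cgroup_skb/egress")
    int read_htab_count(struct __sk_buff *skb)
    {
            struct bpf_htab *htab = (struct bpf_htab *)&m_hash;

            /* which internal counter is maintained depends on the map's
             * configuration, see the discussion in this thread
             */
            elem_count = htab->count.counter;
            return 1;
    }

    char _license[] SEC("license") = "GPL";
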
kernel test robot June 1, 2023, 12:44 a.m. UTC | #2
Hi Anton,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Anton-Protopopov/bpf-add-new-map-ops-map_pressure/20230531-190704
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20230531110511.64612-2-aspsk%40isovalent.com
patch subject: [PATCH bpf-next 1/2] bpf: add new map ops ->map_pressure
config: sh-allmodconfig (https://download.01.org/0day-ci/archive/20230601/202306010837.mGhA199K-lkp@intel.com/config)
compiler: sh4-linux-gcc (GCC) 12.3.0
reproduce (this is a W=1 build):
        mkdir -p ~/bin
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/025cc7c86c6c7e108ba5b9946a0f50e0cc082f9b
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Anton-Protopopov/bpf-add-new-map-ops-map_pressure/20230531-190704
        git checkout 025cc7c86c6c7e108ba5b9946a0f50e0cc082f9b
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.3.0 ~/bin/make.cross W=1 O=build_dir ARCH=sh olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.3.0 ~/bin/make.cross W=1 O=build_dir ARCH=sh SHELL=/bin/bash kernel/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306010837.mGhA199K-lkp@intel.com/

All errors (new ones prefixed by >>):

   kernel/bpf/hashtab.c: In function 'htab_map_pressure':
>> kernel/bpf/hashtab.c:189:24: error: implicit declaration of function '__percpu_counter_sum'; did you mean 'percpu_counter_sum'? [-Werror=implicit-function-declaration]
     189 |                 return __percpu_counter_sum(&htab->pcount);
         |                        ^~~~~~~~~~~~~~~~~~~~
         |                        percpu_counter_sum
   cc1: some warnings being treated as errors


vim +189 kernel/bpf/hashtab.c

   183	
   184	static u32 htab_map_pressure(const struct bpf_map *map)
   185	{
   186		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
   187	
   188		if (htab->use_percpu_counter)
 > 189			return __percpu_counter_sum(&htab->pcount);
   190		return atomic_read(&htab->count);
   191	}
   192
Anton Protopopov June 1, 2023, 7:31 a.m. UTC | #3
On Wed, May 31, 2023 at 11:24:29AM -0700, Alexei Starovoitov wrote:
> On Wed, May 31, 2023 at 11:05:10AM +0000, Anton Protopopov wrote:
> >  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
> >  {
> >  	htab_put_fd_value(htab, l);
> >  
> > +	dec_elem_count(htab);
> > +
> >  	if (htab_is_prealloc(htab)) {
> >  		check_and_free_fields(htab, l);
> >  		__pcpu_freelist_push(&htab->freelist, &l->fnode);
> >  	} else {
> > -		dec_elem_count(htab);
> >  		htab_elem_free(htab, l);
> >  	}
> >  }
> > @@ -1006,6 +1024,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
> >  			if (!l)
> >  				return ERR_PTR(-E2BIG);
> >  			l_new = container_of(l, struct htab_elem, fnode);
> > +			inc_elem_count(htab);
> 
> The current use_percpu_counter heuristic is far from perfect. It works for some cases,
> but will surely get bad as the comment next to PERCPU_COUNTER_BATCH is trying to say.
> Hence, there is a big performance risk doing inc/dec everywhere.
> Hence, this is a nack: we cannot decrease performance of various maps for few folks
> who want to see map stats.

This patch adds some inc/dec only for preallocated hashtabs and doesn't change
code for BPF_F_NO_PREALLOC (they already do incs/decs where needed). And for
preallocated hashtabs we don't need to compare counters, so a raw (non-batch)
percpu counter may be used for this case.

> If you want to see "pressure", please switch cilium to use bpf_mem_alloc htab and
> use tracing style direct 'struct bpf_htab' access like progs/map_ptr_kern.c is demonstrating.
> No kernel patches needed.
> Then bpf_prog_run such tracing prog and read all internal map info.
> It's less convenient that exposing things in uapi, but not being uapi is the point.

Thanks for the pointers, this makes sense. However, this doesn't work for LRU
maps, which are always pre-allocated. Would it be OK if we add a non-batch percpu
counter for the !BPF_F_NO_PREALLOC case and don't expose it directly to userspace?
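
For illustration, a hedged sketch of what such a raw (non-batch) per-cpu count
could look like for preallocated htabs (the 'prealloc_count' field is purely
hypothetical; reading it back would be a plain sum over all possible CPUs):

    static void inc_elem_count_prealloc(struct bpf_htab *htab)
    {
            /* hypothetical 's64 __percpu *prealloc_count' member: no batching
             * and no comparison against max_entries, unlike
             * percpu_counter_add_batch()
             */
            this_cpu_inc(*htab->prealloc_count);
    }
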
Anton Protopopov June 1, 2023, 7:50 a.m. UTC | #4
On Thu, Jun 01, 2023 at 08:44:24AM +0800, kernel test robot wrote:
> Hi Anton,
> 
> kernel test robot noticed the following build errors:
> 
> [...]
> 
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202306010837.mGhA199K-lkp@intel.com/

How does this apply to patches? If I send a v2, should I include these tags
there? If this patch gets rejected, do I need to do anything to close the
robot's ticket?

> All errors (new ones prefixed by >>):
> 
>    kernel/bpf/hashtab.c: In function 'htab_map_pressure':
> >> kernel/bpf/hashtab.c:189:24: error: implicit declaration of function '__percpu_counter_sum'; did you mean 'percpu_counter_sum'? [-Werror=implicit-function-declaration]
>      189 |                 return __percpu_counter_sum(&htab->pcount);
>          |                        ^~~~~~~~~~~~~~~~~~~~
>          |                        percpu_counter_sum
>    cc1: some warnings being treated as errors
> 
> 
> vim +189 kernel/bpf/hashtab.c
> 
>    183	
>    184	static u32 htab_map_pressure(const struct bpf_map *map)
>    185	{
>    186		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
>    187	
>    188		if (htab->use_percpu_counter)
>  > 189			return __percpu_counter_sum(&htab->pcount);
>    190		return atomic_read(&htab->count);
>    191	}
>    192	

(This bug happens in the !SMP case.)

> -- 
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki
Alexei Starovoitov June 1, 2023, 4:39 p.m. UTC | #5
On Thu, Jun 1, 2023 at 12:30 AM Anton Protopopov <aspsk@isovalent.com> wrote:
>
> On Wed, May 31, 2023 at 11:24:29AM -0700, Alexei Starovoitov wrote:
> > On Wed, May 31, 2023 at 11:05:10AM +0000, Anton Protopopov wrote:
> > >  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
> > >  {
> > >     htab_put_fd_value(htab, l);
> > >
> > > +   dec_elem_count(htab);
> > > +
> > >     if (htab_is_prealloc(htab)) {
> > >             check_and_free_fields(htab, l);
> > >             __pcpu_freelist_push(&htab->freelist, &l->fnode);
> > >     } else {
> > > -           dec_elem_count(htab);
> > >             htab_elem_free(htab, l);
> > >     }
> > >  }
> > > @@ -1006,6 +1024,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
> > >                     if (!l)
> > >                             return ERR_PTR(-E2BIG);
> > >                     l_new = container_of(l, struct htab_elem, fnode);
> > > +                   inc_elem_count(htab);
> >
> > The current use_percpu_counter heuristic is far from perfect. It works for some cases,
> > but will surely get bad as the comment next to PERCPU_COUNTER_BATCH is trying to say.
> > Hence, there is a big performance risk doing inc/dec everywhere.
> > Hence, this is a nack: we cannot decrease performance of various maps for few folks
> > who want to see map stats.
>
> This patch adds some inc/dec only for preallocated hashtabs and doesn't change
> code for BPF_F_NO_PREALLOC (they already do incs/decs where needed). And for
> preallocated hashtabs we don't need to compare counters,

exactly. that's why I don't like to add inc/dec that serves no purpose
other than stats.

> so a raw (non-batch)
> percpu counter may be used for this case.

and you can do it inside your own bpf prog.

> > If you want to see "pressure", please switch cilium to use bpf_mem_alloc htab and
> > use tracing style direct 'struct bpf_htab' access like progs/map_ptr_kern.c is demonstrating.
> > No kernel patches needed.
> > Then bpf_prog_run such tracing prog and read all internal map info.
> > It's less convenient that exposing things in uapi, but not being uapi is the point.
>
> Thanks for the pointers, this makes sense. However, this doesn't work for LRU
> which is always pre-allocated. Would it be ok if we add non-batch percpu
> counter for !BPF_F_NO_PREALLOC case and won't expose it directly to userspace?

LRU logic doesn't kick in until the map is full.
If your LRU map is not full you shouldn't be using LRU in the first place.
Anton Protopopov June 1, 2023, 6:18 p.m. UTC | #6
On Thu, Jun 01, 2023 at 09:39:43AM -0700, Alexei Starovoitov wrote:
> On Thu, Jun 1, 2023 at 12:30 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> >
> > On Wed, May 31, 2023 at 11:24:29AM -0700, Alexei Starovoitov wrote:
> > > On Wed, May 31, 2023 at 11:05:10AM +0000, Anton Protopopov wrote:
> > > >  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
> > > >  {
> > > >     htab_put_fd_value(htab, l);
> > > >
> > > > +   dec_elem_count(htab);
> > > > +
> > > >     if (htab_is_prealloc(htab)) {
> > > >             check_and_free_fields(htab, l);
> > > >             __pcpu_freelist_push(&htab->freelist, &l->fnode);
> > > >     } else {
> > > > -           dec_elem_count(htab);
> > > >             htab_elem_free(htab, l);
> > > >     }
> > > >  }
> > > > @@ -1006,6 +1024,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
> > > >                     if (!l)
> > > >                             return ERR_PTR(-E2BIG);
> > > >                     l_new = container_of(l, struct htab_elem, fnode);
> > > > +                   inc_elem_count(htab);
> > >
> > > The current use_percpu_counter heuristic is far from perfect. It works for some cases,
> > > but will surely get bad as the comment next to PERCPU_COUNTER_BATCH is trying to say.
> > > Hence, there is a big performance risk doing inc/dec everywhere.
> > > Hence, this is a nack: we cannot decrease performance of various maps for few folks
> > > who want to see map stats.
> >
> > This patch adds some inc/dec only for preallocated hashtabs and doesn't change
> > code for BPF_F_NO_PREALLOC (they already do incs/decs where needed). And for
> > preallocated hashtabs we don't need to compare counters,
> 
> exactly. that's why I don't like to add inc/dec that serves no purpose
> other than stats.
> 
> > so a raw (non-batch)
> > percpu counter may be used for this case.
> 
> and you can do it inside your own bpf prog.
> 
> > > If you want to see "pressure", please switch cilium to use bpf_mem_alloc htab and
> > > use tracing style direct 'struct bpf_htab' access like progs/map_ptr_kern.c is demonstrating.
> > > No kernel patches needed.
> > > Then bpf_prog_run such tracing prog and read all internal map info.
> > > It's less convenient that exposing things in uapi, but not being uapi is the point.
> >
> > Thanks for the pointers, this makes sense. However, this doesn't work for LRU
> > which is always pre-allocated. Would it be ok if we add non-batch percpu
> > counter for !BPF_F_NO_PREALLOC case and won't expose it directly to userspace?
> 
> LRU logic doesn't kick in until the map is full.

In fact, it can: a reproducible example is in the self-test from this patch
series. In the test N threads try to insert random values for keys 1..3000
simultaneously. As a result, the map may contain any number of elements,
typically 100 to 1000 (never the full 3000, which is also less than the map size).
So a user can't really even closely estimate the number of elements in the LRU
map based on the number of updates (with unique keys). A per-cpu counter
inc/dec'ed from the kernel side would solve this.

> If your LRU map is not full you shouldn't be using LRU in the first place.

This makes sense, yes, especially since LRU evictions may happen randomly,
without the map being full. I will step back with this patch until we investigate
whether we can replace LRUs with hashes.

Thanks for the comments!
Alexei Starovoitov June 1, 2023, 6:24 p.m. UTC | #7
On Thu, Jun 1, 2023 at 11:17 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> >
> > LRU logic doesn't kick in until the map is full.
>
> In fact, it can: a reproducable example is in the self-test from this patch
> series. In the test N threads try to insert random values for keys 1..3000
> simultaneously. As the result, the map may contain any number of elements,
> typically 100 to 1000 (never full 3000, which is also less than the map size).
> So a user can't really even closely estimate the number of elements in the LRU
> map based on the number of updates (with unique keys). A per-cpu counter
> inc/dec'ed from the kernel side would solve this.

That's odd and unexpected.
Definitely something to investigate and fix in the LRU map.

Pls cc Martin in the future.

> > If your LRU map is not full you shouldn't be using LRU in the first place.
>
> This makes sense, yes, especially that LRU evictions may happen randomly,
> without a map being full. I will step back with this patch until we investigate
> if we can replace LRUs with hashes.
>
> Thanks for the comments!
Alexei Starovoitov June 2, 2023, 12:40 a.m. UTC | #8
On Thu, Jun 1, 2023 at 11:24 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Jun 1, 2023 at 11:17 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> > >
> > > LRU logic doesn't kick in until the map is full.
> >
> > In fact, it can: a reproducable example is in the self-test from this patch
> > series. In the test N threads try to insert random values for keys 1..3000
> > simultaneously. As the result, the map may contain any number of elements,
> > typically 100 to 1000 (never full 3000, which is also less than the map size).
> > So a user can't really even closely estimate the number of elements in the LRU
> > map based on the number of updates (with unique keys). A per-cpu counter
> > inc/dec'ed from the kernel side would solve this.
>
> That's odd and unexpected.
> Definitely something to investigate and fix in the LRU map.
>
> Pls cc Martin in the future.
>
> > > If your LRU map is not full you shouldn't be using LRU in the first place.
> >
> > This makes sense, yes, especially that LRU evictions may happen randomly,
> > without a map being full. I will step back with this patch until we investigate
> > if we can replace LRUs with hashes.
> >
> > Thanks for the comments!

Thinking about it more...
since you're proposing to use percpu counter unconditionally for prealloc
and percpu_counter_add_batch() logic is batched,
it could actually be acceptable if it's paired with non-api access.
Like, another patch can add a generic kfunc to do __percpu_counter_sum(),
and in a 3rd patch kernel/bpf/preload/iterators/iterators.bpf.c for maps can be
extended to print the element count, so the user has a convenient
'cat /sys/fs/bpf/maps.debug' way to debug maps.

But additional logic of percpu_counter_add_batch() might get in the way
of debugging eventually.
If we want to have stats then we can have normal per-cpu u32 in basic
struct bpf_map that most maps, except array, will inc/dec on update/delete.
kfunc to iterate over percpu is still necessary.
This way we will be able to see not only number of elements, but detect
bad usage when one cpu is only adding and another cpu is deleting elements.
And other cpu misbalance.

but debugging and stats is a slippery slope. These simple stats won't be
enough and people will be tempted to add more and more.
So I agree that there is a need for bpf map observability,
but it is not clear whether hard coded stats is the solution.
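
A minimal sketch of the per-cpu counter plus summing kfunc idea (the
'elem_count' member of struct bpf_map and the kfunc are assumptions here; the
bpf_map_sum_elem_count name also comes up later in the thread):

    /* assumes a hypothetical 's64 __percpu *elem_count' member in struct
     * bpf_map, inc/dec'ed by map implementations on update/delete
     */
    __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
    {
            s64 ret = 0;
            int cpu;

            if (!map || !map->elem_count)
                    return 0;

            for_each_possible_cpu(cpu)
                    ret += READ_ONCE(*per_cpu_ptr(map->elem_count, cpu));

            return ret;
    }
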
Anton Protopopov June 2, 2023, 2:21 p.m. UTC | #9
On Thu, Jun 01, 2023 at 05:40:10PM -0700, Alexei Starovoitov wrote:
> On Thu, Jun 1, 2023 at 11:24 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Thu, Jun 1, 2023 at 11:17 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> > > >
> > > > LRU logic doesn't kick in until the map is full.
> > >
> > > In fact, it can: a reproducable example is in the self-test from this patch
> > > series. In the test N threads try to insert random values for keys 1..3000
> > > simultaneously. As the result, the map may contain any number of elements,
> > > typically 100 to 1000 (never full 3000, which is also less than the map size).
> > > So a user can't really even closely estimate the number of elements in the LRU
> > > map based on the number of updates (with unique keys). A per-cpu counter
> > > inc/dec'ed from the kernel side would solve this.
> >
> > That's odd and unexpected.
> > Definitely something to investigate and fix in the LRU map.
> >
> > Pls cc Martin in the future.
> >
> > > > If your LRU map is not full you shouldn't be using LRU in the first place.
> > >
> > > This makes sense, yes, especially that LRU evictions may happen randomly,
> > > without a map being full. I will step back with this patch until we investigate
> > > if we can replace LRUs with hashes.
> > >
> > > Thanks for the comments!
> 
> Thinking about it more...
> since you're proposing to use percpu counter unconditionally for prealloc
> and percpu_counter_add_batch() logic is batched,
> it could actually be acceptable if it's paired with non-api access.
> Like another patch can add generic kfunc to do __percpu_counter_sum()
> and in the 3rd patch kernel/bpf/preload/iterators/iterators.bpf.c
> for maps can be extended to print the element count, so the user can have
> convenient 'cat /sys/fs/bpf/maps.debug' way to debug maps.
> 
> But additional logic of percpu_counter_add_batch() might get in the way
> of debugging eventually.
> If we want to have stats then we can have normal per-cpu u32 in basic
> struct bpf_map that most maps, except array, will inc/dec on update/delete.
> kfunc to iterate over percpu is still necessary.
> This way we will be able to see not only number of elements, but detect
> bad usage when one cpu is only adding and another cpu is deleting elements.
> And other cpu misbalance.

This looks to me like two different things: one is a kfunc to get the current
counter (e.g., bpf_map_elements_count), the other is a kfunc to dump some more
detailed stats (e.g., per-cpu values or more).

My patch, slightly modified, addresses the first goal: most maps of interest
already have a counter in some form (sometimes just atomic_t or u64+lock). If
we add a percpu (non-batch) counter for pre-allocated hashmaps, then it's done:
the new kfunc can get the counter based on the map type.

If/when there's need to provide per-cpu statistics of elements or some more
sophisticated statistics, this can be done without changing the api of the
bpf_map_elements_count() kfunc.

Would this work?

> but debugging and stats is a slippery slope. These simple stats won't be
> enough and people will be tempted to add more and more.
> So I agree that there is a need for bpf map observability,
> but it is not clear whether hard coded stats is the solution.
Alexei Starovoitov June 2, 2023, 4:23 p.m. UTC | #10
On Fri, Jun 2, 2023 at 7:20 AM Anton Protopopov <aspsk@isovalent.com> wrote:
>
> On Thu, Jun 01, 2023 at 05:40:10PM -0700, Alexei Starovoitov wrote:
> > On Thu, Jun 1, 2023 at 11:24 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Thu, Jun 1, 2023 at 11:17 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> > > > >
> > > > > LRU logic doesn't kick in until the map is full.
> > > >
> > > > In fact, it can: a reproducable example is in the self-test from this patch
> > > > series. In the test N threads try to insert random values for keys 1..3000
> > > > simultaneously. As the result, the map may contain any number of elements,
> > > > typically 100 to 1000 (never full 3000, which is also less than the map size).
> > > > So a user can't really even closely estimate the number of elements in the LRU
> > > > map based on the number of updates (with unique keys). A per-cpu counter
> > > > inc/dec'ed from the kernel side would solve this.
> > >
> > > That's odd and unexpected.
> > > Definitely something to investigate and fix in the LRU map.
> > >
> > > Pls cc Martin in the future.
> > >
> > > > > If your LRU map is not full you shouldn't be using LRU in the first place.
> > > >
> > > > This makes sense, yes, especially that LRU evictions may happen randomly,
> > > > without a map being full. I will step back with this patch until we investigate
> > > > if we can replace LRUs with hashes.
> > > >
> > > > Thanks for the comments!
> >
> > Thinking about it more...
> > since you're proposing to use percpu counter unconditionally for prealloc
> > and percpu_counter_add_batch() logic is batched,
> > it could actually be acceptable if it's paired with non-api access.
> > Like another patch can add generic kfunc to do __percpu_counter_sum()
> > and in the 3rd patch kernel/bpf/preload/iterators/iterators.bpf.c
> > for maps can be extended to print the element count, so the user can have
> > convenient 'cat /sys/fs/bpf/maps.debug' way to debug maps.
> >
> > But additional logic of percpu_counter_add_batch() might get in the way
> > of debugging eventually.
> > If we want to have stats then we can have normal per-cpu u32 in basic
> > struct bpf_map that most maps, except array, will inc/dec on update/delete.
> > kfunc to iterate over percpu is still necessary.
> > This way we will be able to see not only number of elements, but detect
> > bad usage when one cpu is only adding and another cpu is deleting elements.
> > And other cpu misbalance.
>
> This looks for me like two different things: one is a kfunc to get the current
> counter (e.g., bpf_map_elements_count), the other is a kfunc to dump some more
> detailed stats (e.g., per-cpu values or more).
>
> My patch, slightly modified, addresses the first goal: most maps of interest
> already have a counter in some form (sometimes just atomic_t or u64+lock). If
> we add a percpu (non-batch) counter for pre-allocated hashmaps, then it's done:
> the new kfunc can get the counter based on the map type.
>
> If/when there's need to provide per-cpu statistics of elements or some more
> sophisticated statistics, this can be done without changing the api of the
> bpf_map_elements_count() kfunc.
>
> Would this work?

No, because bpf_map_elements_count() as a building block is too big
and too specific. Nothing else can be made out of it but counting
elements. "for_each_cpu in a per-cpu variable" would be a generic primitive
that is usable beyond this particular use case of stats collection.
Anton Protopopov June 6, 2023, 7:49 a.m. UTC | #11
On Fri, Jun 02, 2023 at 09:23:11AM -0700, Alexei Starovoitov wrote:
> On Fri, Jun 2, 2023 at 7:20 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> >
> > On Thu, Jun 01, 2023 at 05:40:10PM -0700, Alexei Starovoitov wrote:
> > > On Thu, Jun 1, 2023 at 11:24 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Thu, Jun 1, 2023 at 11:17 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> > > > > >
> > > > > > LRU logic doesn't kick in until the map is full.
> > > > >
> > > > > In fact, it can: a reproducable example is in the self-test from this patch
> > > > > series. In the test N threads try to insert random values for keys 1..3000
> > > > > simultaneously. As the result, the map may contain any number of elements,
> > > > > typically 100 to 1000 (never full 3000, which is also less than the map size).
> > > > > So a user can't really even closely estimate the number of elements in the LRU
> > > > > map based on the number of updates (with unique keys). A per-cpu counter
> > > > > inc/dec'ed from the kernel side would solve this.
> > > >
> > > > That's odd and unexpected.
> > > > Definitely something to investigate and fix in the LRU map.
> > > >
> > > > Pls cc Martin in the future.
> > > >
> > > > > > If your LRU map is not full you shouldn't be using LRU in the first place.
> > > > >
> > > > > This makes sense, yes, especially that LRU evictions may happen randomly,
> > > > > without a map being full. I will step back with this patch until we investigate
> > > > > if we can replace LRUs with hashes.
> > > > >
> > > > > Thanks for the comments!
> > >
> > > Thinking about it more...
> > > since you're proposing to use percpu counter unconditionally for prealloc
> > > and percpu_counter_add_batch() logic is batched,
> > > it could actually be acceptable if it's paired with non-api access.
> > > Like another patch can add generic kfunc to do __percpu_counter_sum()
> > > and in the 3rd patch kernel/bpf/preload/iterators/iterators.bpf.c
> > > for maps can be extended to print the element count, so the user can have
> > > convenient 'cat /sys/fs/bpf/maps.debug' way to debug maps.
> > >
> > > But additional logic of percpu_counter_add_batch() might get in the way
> > > of debugging eventually.
> > > If we want to have stats then we can have normal per-cpu u32 in basic
> > > struct bpf_map that most maps, except array, will inc/dec on update/delete.
> > > kfunc to iterate over percpu is still necessary.
> > > This way we will be able to see not only number of elements, but detect
> > > bad usage when one cpu is only adding and another cpu is deleting elements.
> > > And other cpu misbalance.
> >
> > This looks for me like two different things: one is a kfunc to get the current
> > counter (e.g., bpf_map_elements_count), the other is a kfunc to dump some more
> > detailed stats (e.g., per-cpu values or more).
> >
> > My patch, slightly modified, addresses the first goal: most maps of interest
> > already have a counter in some form (sometimes just atomic_t or u64+lock). If
> > we add a percpu (non-batch) counter for pre-allocated hashmaps, then it's done:
> > the new kfunc can get the counter based on the map type.
> >
> > If/when there's need to provide per-cpu statistics of elements or some more
> > sophisticated statistics, this can be done without changing the api of the
> > bpf_map_elements_count() kfunc.
> >
> > Would this work?
> 
> No, because bpf_map_elements_count() as a building block is too big
> and too specific. Nothing else can be made out of it, but counting
> elements.
> "for_each_cpu in per-cpu variable" would be generic that is usable beyond
> this particular use case of stats collection.

Thanks. I will prepare a v2 with a "no-uapi percpu" version.
kernel test robot June 13, 2023, 8:23 a.m. UTC | #12
Hi Anton,

Sorry for the late reply.

On Thu, Jun 01, 2023 at 07:50:00AM +0000, Anton Protopopov wrote:
> On Thu, Jun 01, 2023 at 08:44:24AM +0800, kernel test robot wrote:
> > Hi Anton,
> > 
> > kernel test robot noticed the following build errors:
> > 
> > [...]
> > 
> > If you fix the issue, kindly add following tag where applicable
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202306010837.mGhA199K-lkp@intel.com/
> 
> How does this apply to patches? If I send a v2, should I include these tags
> there?

If a v2 is sent, these tags should not be included.

> If this patch gets rejected, is there need to do anything to close the
> robot's ticket?

No need to close this ticket.

Thanks for raising above concerns. We have updated the wording in our
reports as below to avoid misinterpretation:

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: ...
| Closes: ...

--
Best Regards,
Yujie

> > All errors (new ones prefixed by >>):
> > 
> >    kernel/bpf/hashtab.c: In function 'htab_map_pressure':
> > >> kernel/bpf/hashtab.c:189:24: error: implicit declaration of function '__percpu_counter_sum'; did you mean 'percpu_counter_sum'? [-Werror=implicit-function-declaration]
> >      189 |                 return __percpu_counter_sum(&htab->pcount);
> >          |                        ^~~~~~~~~~~~~~~~~~~~
> >          |                        percpu_counter_sum
> >    cc1: some warnings being treated as errors
> > 
> > 
> > vim +189 kernel/bpf/hashtab.c
> > 
> >    183	
> >    184	static u32 htab_map_pressure(const struct bpf_map *map)
> >    185	{
> >    186		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> >    187	
> >    188		if (htab->use_percpu_counter)
> >  > 189			return __percpu_counter_sum(&htab->pcount);
> >    190		return atomic_read(&htab->count);
> >    191	}
> >    192	
> 
> (This bug happens for !SMP case.)
> 
> > -- 
> > 0-DAY CI Kernel Test Service
> > https://github.com/intel/lkp-tests/wiki
>
Anton Protopopov June 13, 2023, 8:30 a.m. UTC | #13
On Tue, Jun 13, 2023 at 04:23:29PM +0800, Yujie Liu wrote:
> Hi Anton,
> 
> Sorry for the late reply.
> 
> On Thu, Jun 01, 2023 at 07:50:00AM +0000, Anton Protopopov wrote:
> > On Thu, Jun 01, 2023 at 08:44:24AM +0800, kernel test robot wrote:
> > > Hi Anton,
> > > 
> > > kernel test robot noticed the following build errors:
> > > 
> > > [...]
> > > 
> > > If you fix the issue, kindly add following tag where applicable
> > > | Reported-by: kernel test robot <lkp@intel.com>
> > > | Closes: https://lore.kernel.org/oe-kbuild-all/202306010837.mGhA199K-lkp@intel.com/
> > 
> > How does this apply to patches? If I send a v2, should I include these tags
> > there?
> 
> If a v2 is sent, these tags should not be included.
> 
> > If this patch gets rejected, is there need to do anything to close the
> > robot's ticket?
> 
> No need to close this ticket.
> 
> Thanks for raising above concerns. We have updated the wording in our
> reports as below to avoid misinterpretation:
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: ...
> | Closes: ...

Great, thanks for the explanations!

> --
> Best Regards,
> Yujie
> 
> > > All errors (new ones prefixed by >>):
> > > 
> > >    kernel/bpf/hashtab.c: In function 'htab_map_pressure':
> > > >> kernel/bpf/hashtab.c:189:24: error: implicit declaration of function '__percpu_counter_sum'; did you mean 'percpu_counter_sum'? [-Werror=implicit-function-declaration]
> > >      189 |                 return __percpu_counter_sum(&htab->pcount);
> > >          |                        ^~~~~~~~~~~~~~~~~~~~
> > >          |                        percpu_counter_sum
> > >    cc1: some warnings being treated as errors
> > > 
> > > 
> > > vim +189 kernel/bpf/hashtab.c
> > > 
> > >    183	
> > >    184	static u32 htab_map_pressure(const struct bpf_map *map)
> > >    185	{
> > >    186		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > >    187	
> > >    188		if (htab->use_percpu_counter)
> > >  > 189			return __percpu_counter_sum(&htab->pcount);
> > >    190		return atomic_read(&htab->count);
> > >    191	}
> > >    192	
> > 
> > (This bug happens for !SMP case.)
> > 
> > > -- 
> > > 0-DAY CI Kernel Test Service
> > > https://github.com/intel/lkp-tests/wiki
> >
John Fastabend June 24, 2023, midnight UTC | #14
Alexei Starovoitov wrote:
> On Fri, Jun 2, 2023 at 7:20 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> >
> > On Thu, Jun 01, 2023 at 05:40:10PM -0700, Alexei Starovoitov wrote:
> > > On Thu, Jun 1, 2023 at 11:24 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Thu, Jun 1, 2023 at 11:17 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> > > > > >
> > > > > > LRU logic doesn't kick in until the map is full.
> > > > >
> > > > > In fact, it can: a reproducable example is in the self-test from this patch
> > > > > series. In the test N threads try to insert random values for keys 1..3000
> > > > > simultaneously. As the result, the map may contain any number of elements,
> > > > > typically 100 to 1000 (never full 3000, which is also less than the map size).
> > > > > So a user can't really even closely estimate the number of elements in the LRU
> > > > > map based on the number of updates (with unique keys). A per-cpu counter
> > > > > inc/dec'ed from the kernel side would solve this.
> > > >
> > > > That's odd and unexpected.
> > > > Definitely something to investigate and fix in the LRU map.
> > > >
> > > > Pls cc Martin in the future.
> > > >
> > > > > > If your LRU map is not full you shouldn't be using LRU in the first place.
> > > > >
> > > > > This makes sense, yes, especially that LRU evictions may happen randomly,
> > > > > without a map being full. I will step back with this patch until we investigate
> > > > > if we can replace LRUs with hashes.
> > > > >
> > > > > Thanks for the comments!
> > >
> > > Thinking about it more...
> > > since you're proposing to use percpu counter unconditionally for prealloc
> > > and percpu_counter_add_batch() logic is batched,
> > > it could actually be acceptable if it's paired with non-api access.
> > > Like another patch can add generic kfunc to do __percpu_counter_sum()
> > > and in the 3rd patch kernel/bpf/preload/iterators/iterators.bpf.c
> > > for maps can be extended to print the element count, so the user can have
> > > convenient 'cat /sys/fs/bpf/maps.debug' way to debug maps.
> > >
> > > But additional logic of percpu_counter_add_batch() might get in the way
> > > of debugging eventually.
> > > If we want to have stats then we can have normal per-cpu u32 in basic
> > > struct bpf_map that most maps, except array, will inc/dec on update/delete.
> > > kfunc to iterate over percpu is still necessary.
> > > This way we will be able to see not only number of elements, but detect
> > > bad usage when one cpu is only adding and another cpu is deleting elements.
> > > And other cpu misbalance.
> >
> > This looks for me like two different things: one is a kfunc to get the current
> > counter (e.g., bpf_map_elements_count), the other is a kfunc to dump some more
> > detailed stats (e.g., per-cpu values or more).
> >
> > My patch, slightly modified, addresses the first goal: most maps of interest
> > already have a counter in some form (sometimes just atomic_t or u64+lock). If
> > we add a percpu (non-batch) counter for pre-allocated hashmaps, then it's done:
> > the new kfunc can get the counter based on the map type.
> >
> > If/when there's need to provide per-cpu statistics of elements or some more
> > sophisticated statistics, this can be done without changing the api of the
> > bpf_map_elements_count() kfunc.
> >
> > Would this work?
> 
> No, because bpf_map_elements_count() as a building block is too big
> and too specific. Nothing else can be made out of it, but counting
> elements.
> "for_each_cpu in per-cpu variable" would be generic that is usable beyond
> this particular use case of stats collection.

Without much thought, could you hook the eviction logic in LRU to know
when an evict happens, and even get more details about what was evicted, so
we could debug the random case where we evict something in a conntrack
table and then later it comes back to life and sends some data, like a
long-living UDP session?

For example, in the cases where you build an LRU map because in 99% of
cases no evictions happen and the LRU is just there as a backstop,
you might even generate events to userspace to let it know evictions
are in progress and it should do something about them.

Thanks,
John
Anton Protopopov June 26, 2023, 4:29 p.m. UTC | #15
On Fri, Jun 23, 2023 at 05:00:15PM -0700, John Fastabend wrote:
> Alexei Starovoitov wrote:
> > On Fri, Jun 2, 2023 at 7:20 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> > >
> > > On Thu, Jun 01, 2023 at 05:40:10PM -0700, Alexei Starovoitov wrote:
> > > > On Thu, Jun 1, 2023 at 11:24 AM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Thu, Jun 1, 2023 at 11:17 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> > > > > > >
> > > > > > > LRU logic doesn't kick in until the map is full.
> > > > > >
> > > > > > In fact, it can: a reproducable example is in the self-test from this patch
> > > > > > series. In the test N threads try to insert random values for keys 1..3000
> > > > > > simultaneously. As the result, the map may contain any number of elements,
> > > > > > typically 100 to 1000 (never full 3000, which is also less than the map size).
> > > > > > So a user can't really even closely estimate the number of elements in the LRU
> > > > > > map based on the number of updates (with unique keys). A per-cpu counter
> > > > > > inc/dec'ed from the kernel side would solve this.
> > > > >
> > > > > That's odd and unexpected.
> > > > > Definitely something to investigate and fix in the LRU map.
> > > > >
> > > > > Pls cc Martin in the future.
> > > > >
> > > > > > > If your LRU map is not full you shouldn't be using LRU in the first place.
> > > > > >
> > > > > > This makes sense, yes, especially that LRU evictions may happen randomly,
> > > > > > without a map being full. I will step back with this patch until we investigate
> > > > > > if we can replace LRUs with hashes.
> > > > > >
> > > > > > Thanks for the comments!
> > > >
> > > > Thinking about it more...
> > > > since you're proposing to use percpu counter unconditionally for prealloc
> > > > and percpu_counter_add_batch() logic is batched,
> > > > it could actually be acceptable if it's paired with non-api access.
> > > > Like another patch can add generic kfunc to do __percpu_counter_sum()
> > > > and in the 3rd patch kernel/bpf/preload/iterators/iterators.bpf.c
> > > > for maps can be extended to print the element count, so the user can have
> > > > convenient 'cat /sys/fs/bpf/maps.debug' way to debug maps.
> > > >
> > > > But additional logic of percpu_counter_add_batch() might get in the way
> > > > of debugging eventually.
> > > > If we want to have stats then we can have normal per-cpu u32 in basic
> > > > struct bpf_map that most maps, except array, will inc/dec on update/delete.
> > > > kfunc to iterate over percpu is still necessary.
> > > > This way we will be able to see not only number of elements, but detect
> > > > bad usage when one cpu is only adding and another cpu is deleting elements.
> > > > And other cpu misbalance.
> > >
> > > This looks for me like two different things: one is a kfunc to get the current
> > > counter (e.g., bpf_map_elements_count), the other is a kfunc to dump some more
> > > detailed stats (e.g., per-cpu values or more).
> > >
> > > My patch, slightly modified, addresses the first goal: most maps of interest
> > > already have a counter in some form (sometimes just atomic_t or u64+lock). If
> > > we add a percpu (non-batch) counter for pre-allocated hashmaps, then it's done:
> > > the new kfunc can get the counter based on the map type.
> > >
> > > If/when there's need to provide per-cpu statistics of elements or some more
> > > sophisticated statistics, this can be done without changing the api of the
> > > bpf_map_elements_count() kfunc.
> > >
> > > Would this work?
> > 
> > No, because bpf_map_elements_count() as a building block is too big
> > and too specific. Nothing else can be made out of it, but counting
> > elements.
> > "for_each_cpu in per-cpu variable" would be generic that is usable beyond
> > this particular use case of stats collection.
> 
> Without much thought, could you hook the eviction logic in LRU to know
> when the evict happens and even more details about what was evicted so
> we could debug the random case where we evict something in a conntrack
> table and then later it comes back to life and sends some data like a
> long living UDP session.
> 
> For example in the cases where you build an LRU map because in 99%
> cases no evictions happen and the LRU is just there as a backstop
> you might even generate events to userspace to let it know evicts
> are in progress and they should do something about it.

Yes, one can trace evictions, as the destructor function for lru_list is
noinline. An example here: https://github.com/aspsk/bcc/tree/aspsk/lrusnoop
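
A hedged sketch of that kind of tracing (it just counts evictions by probing
the eviction callback visible in the diff below; whether the symbol can be
probed depends on the kernel build, and this is not the lrusnoop tool itself):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    __u64 evictions;

    /* htab_lru_map_delete_node() runs when the LRU evicts an element */
    SEC("kprobe/htab_lru_map_delete_node")
    int BPF_KPROBE(count_lru_evictions)
    {
            __sync_fetch_and_add(&evictions, 1);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";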

One problem with LRU is that evictions can [currently] happen at random times
even if the map is nearly empty; see the new map test from this series and also
https://lore.kernel.org/bpf/ZHjhBFLLnUcSy9Tt@zh-lab-node-5/. So for LRU
we really need to invest more time. For hashtabs and other maps the
bpf_map_sum_elem_count is in any case quite useful.

> Thanks,
> John
Anton Protopopov June 28, 2023, 1:17 p.m. UTC | #16
Hi Alexei, hi Martin,

On Thu, Jun 01, 2023 at 11:24:20AM -0700, Alexei Starovoitov wrote:
> On Thu, Jun 1, 2023 at 11:17 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> > >
> > > LRU logic doesn't kick in until the map is full.
> >
> > In fact, it can: a reproducable example is in the self-test from this patch
> > series. In the test N threads try to insert random values for keys 1..3000
> > simultaneously. As the result, the map may contain any number of elements,
> > typically 100 to 1000 (never full 3000, which is also less than the map size).
> > So a user can't really even closely estimate the number of elements in the LRU
> > map based on the number of updates (with unique keys). A per-cpu counter
> > inc/dec'ed from the kernel side would solve this.
> 
> That's odd and unexpected.
> Definitely something to investigate and fix in the LRU map.
> 
> Pls cc Martin in the future.

I've looked into this a bit more and the problem is as follows.

LRU maps allocate MAX_ENTRIES elements and put them on the global free list.
Then each CPU will try to grab memory in 128-element chunks into its own
local free list.

The user expectation is that evictions start when the map is full; however, in
practice we start evicting elements when the element count reaches about
(MAX_ENTRIES - NCPUS*128). This happens because when one CPU has used up its
local free list, it turns to the global list. While there could be another
(NCPUS-1)*128 free elements in the local free lists of other CPUs, our CPU goes
to the global free list, which is empty, and then starts to evict elements from
the active/inactive lists (a 128-element chunk). Then this can happen for
another active CPU, etc.

This does not look like a problem for big maps, where NCPUS*128 is not a big
percentage of the total map capacity. For smaller maps this may be unexpected
(I first noticed this on a 4K map where, after updating 4K keys, the map held
only about 200-300 elements).
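
As a hedged worked example (the CPU count here is an assumption, not taken from
the report above): with 64 online CPUs the local free lists can hold up to
64 * 128 = 8192 elements between them, more than a 4K map even contains, so such
a map can start evicting as soon as the global list runs dry, long before it
looks full to the user; for a 1M-entry map those same 8192 elements are under 1%
of the capacity.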

My first attempt to fix this was to just increase the nr_entries allocated for
the map by NCPUS*128, which makes evictions start happening at MAX_ENTRIES. But
I soon realized that this way users can get more than MAX_ENTRIES elements
inside a map, which is unexpected as well (say, when dumping the map into a
buffer of MAX_ENTRIES size or syncing entries with another map of MAX_ENTRIES
capacity).

I also briefly looked into allowing prealloc_lru_pop() to be called under a
bucket lock (by passing the currently locked bucket to it, so that this pointer
is passed all the way down to htab_lru_map_delete_node(), which may then skip
locking the bucket if it is the same one). This looks like it works, but I
didn't have time to understand whether it breaks the LRU architecture badly
or not.

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f58895830ada..4d33fc6ed2ea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -162,6 +162,7 @@  struct bpf_map_ops {
 				     void *callback_ctx, u64 flags);
 
 	u64 (*map_mem_usage)(const struct bpf_map *map);
+	u32 (*map_pressure)(const struct bpf_map *map);
 
 	/* BTF id of struct allocated by map_alloc */
 	int *map_btf_id;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9273c654743c..99580f2d006b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6363,7 +6363,7 @@  struct bpf_map_info {
 	__u32 btf_id;
 	__u32 btf_key_type_id;
 	__u32 btf_value_type_id;
-	__u32 :32;	/* alignment pad */
+	__u32 raw_pressure;
 	__u64 map_extra;
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9901efee4339..331a923e29d5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -133,6 +133,63 @@  static inline bool htab_is_prealloc(const struct bpf_htab *htab)
 	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 }
 
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ *
+ * For preallocated maps we only increase/decrease counters on adding/removing
+ * an element to be later fetched by htab_map_pressure, so we always enable the
+ * per-cpu version in favor of atomic
+ */
+#define PERCPU_COUNTER_BATCH 32
+static bool htab_use_percpu_counter(union bpf_attr *attr)
+{
+	return (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH ||
+		!(attr->map_flags & BPF_F_NO_PREALLOC));
+}
+
+static bool is_map_full(struct bpf_htab *htab)
+{
+	if (htab->use_percpu_counter)
+		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+						PERCPU_COUNTER_BATCH) >= 0;
+	return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+	if (htab->use_percpu_counter)
+		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+	else
+		atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+	if (htab->use_percpu_counter)
+		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+	else
+		atomic_dec(&htab->count);
+}
+
+static u32 htab_map_pressure(const struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	if (htab->use_percpu_counter)
+		return __percpu_counter_sum(&htab->pcount);
+	return atomic_read(&htab->count);
+}
+
 static void htab_init_buckets(struct bpf_htab *htab)
 {
 	unsigned int i;
@@ -539,23 +596,7 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	htab_init_buckets(htab);
 
-/* compute_batch_value() computes batch value as num_online_cpus() * 2
- * and __percpu_counter_compare() needs
- * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
- * for percpu_counter to be faster than atomic_t. In practice the average bpf
- * hash map size is 10k, which means that a system with 64 cpus will fill
- * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
- * define our own batch count as 32 then 10k hash map can be filled up to 80%:
- * 10k - 8k > 32 _batch_ * 64 _cpus_
- * and __percpu_counter_compare() will still be fast. At that point hash map
- * collisions will dominate its performance anyway. Assume that hash map filled
- * to 50+% isn't going to be O(1) and use the following formula to choose
- * between percpu_counter and atomic_t.
- */
-#define PERCPU_COUNTER_BATCH 32
-	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
-		htab->use_percpu_counter = true;
-
+	htab->use_percpu_counter = htab_use_percpu_counter(attr);
 	if (htab->use_percpu_counter) {
 		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
 		if (err)
@@ -810,6 +851,7 @@  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
 			check_and_free_fields(htab, l);
+			dec_elem_count(htab);
 			break;
 		}
 
@@ -896,40 +938,16 @@  static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
 	}
 }
 
-static bool is_map_full(struct bpf_htab *htab)
-{
-	if (htab->use_percpu_counter)
-		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
-						PERCPU_COUNTER_BATCH) >= 0;
-	return atomic_read(&htab->count) >= htab->map.max_entries;
-}
-
-static void inc_elem_count(struct bpf_htab *htab)
-{
-	if (htab->use_percpu_counter)
-		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
-	else
-		atomic_inc(&htab->count);
-}
-
-static void dec_elem_count(struct bpf_htab *htab)
-{
-	if (htab->use_percpu_counter)
-		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
-	else
-		atomic_dec(&htab->count);
-}
-
-
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
 	htab_put_fd_value(htab, l);
 
+	dec_elem_count(htab);
+
 	if (htab_is_prealloc(htab)) {
 		check_and_free_fields(htab, l);
 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
-		dec_elem_count(htab);
 		htab_elem_free(htab, l);
 	}
 }
@@ -1006,6 +1024,7 @@  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			if (!l)
 				return ERR_PTR(-E2BIG);
 			l_new = container_of(l, struct htab_elem, fnode);
+			inc_elem_count(htab);
 		}
 	} else {
 		if (is_map_full(htab))
@@ -1227,9 +1246,11 @@  static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 	 * concurrent search will find it before old elem
 	 */
 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+	inc_elem_count(htab);
 	if (l_old) {
 		bpf_lru_node_set_ref(&l_new->lru_node);
 		hlist_nulls_del_rcu(&l_old->hash_node);
+		dec_elem_count(htab);
 	}
 	ret = 0;
 
@@ -1357,6 +1378,7 @@  static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
 				value, onallcpus);
 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+		inc_elem_count(htab);
 		l_new = NULL;
 	}
 	ret = 0;
@@ -1443,9 +1465,10 @@  static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
-	if (l)
+	if (l) {
+		dec_elem_count(htab);
 		hlist_nulls_del_rcu(&l->hash_node);
-	else
+	} else
 		ret = -ENOENT;
 
 	htab_unlock_bucket(htab, b, hash, flags);
@@ -2249,6 +2272,7 @@  const struct bpf_map_ops htab_map_ops = {
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
 	.map_mem_usage = htab_map_mem_usage,
+	.map_pressure = htab_map_pressure,
 	BATCH_OPS(htab),
 	.map_btf_id = &htab_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
@@ -2271,6 +2295,7 @@  const struct bpf_map_ops htab_lru_map_ops = {
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
 	.map_mem_usage = htab_map_mem_usage,
+	.map_pressure = htab_map_pressure,
 	BATCH_OPS(htab_lru),
 	.map_btf_id = &htab_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
@@ -2423,6 +2448,7 @@  const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
 	.map_mem_usage = htab_map_mem_usage,
+	.map_pressure = htab_map_pressure,
 	BATCH_OPS(htab_percpu),
 	.map_btf_id = &htab_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
@@ -2443,6 +2469,7 @@  const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
 	.map_mem_usage = htab_map_mem_usage,
+	.map_pressure = htab_map_pressure,
 	BATCH_OPS(htab_lru_percpu),
 	.map_btf_id = &htab_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
@@ -2581,6 +2608,7 @@  const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_gen_lookup = htab_of_map_gen_lookup,
 	.map_check_btf = map_check_no_btf,
 	.map_mem_usage = htab_map_mem_usage,
+	.map_pressure = htab_map_pressure,
 	BATCH_OPS(htab),
 	.map_btf_id = &htab_map_btf_ids[0],
 };
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e0d3ddf2037a..24ff5feb07ca 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -730,6 +730,13 @@  static u64 trie_mem_usage(const struct bpf_map *map)
 	return elem_size * READ_ONCE(trie->n_entries);
 }
 
+static u32 trie_map_pressure(const struct bpf_map *map)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+
+	return READ_ONCE(trie->n_entries);
+}
+
 BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
 const struct bpf_map_ops trie_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
@@ -744,5 +751,6 @@  const struct bpf_map_ops trie_map_ops = {
 	.map_delete_batch = generic_map_delete_batch,
 	.map_check_btf = trie_check_btf,
 	.map_mem_usage = trie_mem_usage,
+	.map_pressure = trie_map_pressure,
 	.map_btf_id = &trie_map_btf_ids[0],
 };
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 92a57efc77de..6ea30a24f057 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -794,6 +794,13 @@  static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
 	return mode;
 }
 
+static u32 bpf_map_pressure(const struct bpf_map *map)
+{
+	if (map->ops->map_pressure)
+		return map->ops->map_pressure(map);
+	return 0;
+}
+
 #ifdef CONFIG_PROC_FS
 /* Show the memory usage of a bpf map */
 static u64 bpf_map_memory_usage(const struct bpf_map *map)
@@ -804,6 +811,7 @@  static u64 bpf_map_memory_usage(const struct bpf_map *map)
 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 {
 	struct bpf_map *map = filp->private_data;
+	char map_pressure_buf[36] = "";
 	u32 type = 0, jited = 0;
 
 	if (map_type_contains_progs(map)) {
@@ -813,6 +821,10 @@  static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		spin_unlock(&map->owner.lock);
 	}
 
+	if (map->ops->map_pressure)
+		snprintf(map_pressure_buf, sizeof(map_pressure_buf),
+			 "raw_pressure:\t%u\n", map->ops->map_pressure(map));
+
 	seq_printf(m,
 		   "map_type:\t%u\n"
 		   "key_size:\t%u\n"
@@ -821,6 +833,7 @@  static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "map_flags:\t%#x\n"
 		   "map_extra:\t%#llx\n"
 		   "memlock:\t%llu\n"
+		   "%s"
 		   "map_id:\t%u\n"
 		   "frozen:\t%u\n",
 		   map->map_type,
@@ -830,6 +843,7 @@  static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->map_flags,
 		   (unsigned long long)map->map_extra,
 		   bpf_map_memory_usage(map),
+		   map_pressure_buf,
 		   map->id,
 		   READ_ONCE(map->frozen));
 	if (type) {
@@ -4275,6 +4289,7 @@  static int bpf_map_get_info_by_fd(struct file *file,
 	info.value_size = map->value_size;
 	info.max_entries = map->max_entries;
 	info.map_flags = map->map_flags;
+	info.raw_pressure = bpf_map_pressure(map);
 	info.map_extra = map->map_extra;
 	memcpy(info.name, map->name, sizeof(map->name));
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9273c654743c..99580f2d006b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6363,7 +6363,7 @@  struct bpf_map_info {
 	__u32 btf_id;
 	__u32 btf_key_type_id;
 	__u32 btf_value_type_id;
-	__u32 :32;	/* alignment pad */
+	__u32 raw_pressure;
 	__u64 map_extra;
 } __attribute__((aligned(8)));