| Message ID | 20211006222103.3631981-2-joannekoong@fb.com (mailing list archive) |
|---|---|
| State | Changes Requested |
| Delegated to | BPF |
| Series | Implement bitset maps, with bloom filter |
Joanne Koong <joannekoong@fb.com> writes: > This patch adds the kernel-side changes for the implementation of > a bitset map with bloom filter capabilities. > > The bitset map does not have keys, only values since it is a > non-associative data type. When the bitset map is created, it must > be created with a key_size of 0, and the max_entries value should be the > desired size of the bitset, in number of bits. > > The bitset map supports peek (determining whether a bit is set in the > map), push (setting a bit in the map), and pop (clearing a bit in the > map) operations. These operations are exposed to userspace applications > through the already existing syscalls in the following way: > > BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem > BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem > BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem > > For updates, the user will pass in a NULL key and the index of the > bit to set in the bitmap as the value. For lookups, the user will pass > in the index of the bit to check as the value. If the bit is set, 0 > will be returned, else -ENOENT. For clearing the bit, the user will pass > in the index of the bit to clear as the value. This is interesting, and I can see other uses of such a data structure. However, a couple of questions (talking mostly about the 'raw' bitmap without the bloom filter enabled): - How are you envisioning synchronisation to work? The code is using the atomic set_bit() operation, but there's no test_and_{set,clear}_bit(). Any thoughts on how users would do this? - It would be useful to expose the "find first set (ffs)" operation of the bitmap as well. This can be added later, but thinking about the API from the start would be good to avoid having to add a whole separate helper for this. My immediate thought is to reserve peek(-1) for this use - WDYT? - Any thoughts on inlining the lookups? 
This should at least be feasible for the non-bloom-filter type, but I'm not quite sure if the use of map_extra allows the verifier to distinguish between the map types (I'm a little fuzzy on how the inlining actually works)? And can peek()/push()/pop() be inlined at all? -Toke
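The syscall-to-operation mapping described in the patch boils down to roughly the following user-space model (a sketch for illustration only — the `bitset_*` names and fixed size are made up here, not the kernel implementation):

```c
#include <errno.h>
#include <stdint.h>
#include <string.h>

#define NBITS 1024  /* stands in for max_entries at map creation */

struct bitset {
    uint64_t words[NBITS / 64];
};

/* push: set the bit at 'idx' (the BPF_MAP_UPDATE_ELEM path, NULL key) */
static int bitset_push(struct bitset *b, uint32_t idx)
{
    if (idx >= NBITS)
        return -EINVAL;
    b->words[idx / 64] |= 1ULL << (idx % 64);
    return 0;
}

/* peek: 0 if the bit is set, -ENOENT otherwise (the BPF_MAP_LOOKUP_ELEM path) */
static int bitset_peek(const struct bitset *b, uint32_t idx)
{
    if (idx >= NBITS)
        return -EINVAL;
    return ((b->words[idx / 64] >> (idx % 64)) & 1) ? 0 : -ENOENT;
}

/* pop: clear the bit (the BPF_MAP_LOOKUP_AND_DELETE_ELEM path) */
static int bitset_pop(struct bitset *b, uint32_t idx)
{
    if (idx >= NBITS)
        return -EINVAL;
    b->words[idx / 64] &= ~(1ULL << (idx % 64));
    return 0;
}
```

Note that in this model push/pop always succeed regardless of the bit's prior state, which is exactly the synchronisation gap raised above.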
On 10/7/21 7:20 AM, Toke Høiland-Jørgensen wrote: > Joanne Koong <joannekoong@fb.com> writes: > >> This patch adds the kernel-side changes for the implementation of >> a bitset map with bloom filter capabilities. >> >> The bitset map does not have keys, only values since it is a >> non-associative data type. When the bitset map is created, it must >> be created with a key_size of 0, and the max_entries value should be the >> desired size of the bitset, in number of bits. >> >> The bitset map supports peek (determining whether a bit is set in the >> map), push (setting a bit in the map), and pop (clearing a bit in the >> map) operations. These operations are exposed to userspace applications >> through the already existing syscalls in the following way: >> >> BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem >> BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem >> BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem >> >> For updates, the user will pass in a NULL key and the index of the >> bit to set in the bitmap as the value. For lookups, the user will pass >> in the index of the bit to check as the value. If the bit is set, 0 >> will be returned, else -ENOENT. For clearing the bit, the user will pass >> in the index of the bit to clear as the value. > This is interesting, and I can see other uses of such a data structure. > However, a couple of questions (talking mostly about the 'raw' bitmap > without the bloom filter enabled): > > - How are you envisioning synchronisation to work? The code is using the > atomic set_bit() operation, but there's no test_and_{set,clear}_bit(). > Any thoughts on how users would do this? I was thinking for users who are doing concurrent lookups + updates, they are responsible for synchronizing the operations through mutexes. Do you think this makes sense / is reasonable? > > - It would be useful to expose the "find first set (ffs)" operation of > the bitmap as well. 
This can be added later, but thinking about the > API from the start would be good to avoid having to add a whole > separate helper for this. My immediate thought is to reserve peek(-1) > for this use - WDYT? I think using peek(-1) for "find first set" sounds like a great idea! > - Any thoughts on inlining the lookups? This should at least be feasible > for the non-bloom-filter type, but I'm not quite sure if the use of > map_extra allows the verifier to distinguish between the map types > (I'm a little fuzzy on how the inlining actually works)? And can > peek()/push()/pop() be inlined at all? I am not too familiar with how bpf instructions and inlining works, but from a first glance, this looks doable for both the non-bloom filter and bloom filter cases. From my cursory understanding of how it works, it seems like we could have something like "bitset_map_gen_lookup" where we parse the bpf_map->map_extra to see if the bloom filter is enabled; if it is, we could call the hash function directly to compute which bit to look up, and then use the same insn logic for looking up the bit in both cases (the bitmap w/ and w/out the bloom filter). I don't think there is support yet in the verifier for inlining peek()/push()/pop(), but it seems like this should be doable as well. I think these changes would maybe warrant a separate patchset on top of this one. What are your thoughts? > -Toke >
Joanne Koong <joannekoong@fb.com> writes: > On 10/7/21 7:20 AM, Toke Høiland-Jørgensen wrote: > >> Joanne Koong <joannekoong@fb.com> writes: >> >>> This patch adds the kernel-side changes for the implementation of >>> a bitset map with bloom filter capabilities. >>> >>> The bitset map does not have keys, only values since it is a >>> non-associative data type. When the bitset map is created, it must >>> be created with a key_size of 0, and the max_entries value should be the >>> desired size of the bitset, in number of bits. >>> >>> The bitset map supports peek (determining whether a bit is set in the >>> map), push (setting a bit in the map), and pop (clearing a bit in the >>> map) operations. These operations are exposed to userspace applications >>> through the already existing syscalls in the following way: >>> >>> BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem >>> BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem >>> BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem >>> >>> For updates, the user will pass in a NULL key and the index of the >>> bit to set in the bitmap as the value. For lookups, the user will pass >>> in the index of the bit to check as the value. If the bit is set, 0 >>> will be returned, else -ENOENT. For clearing the bit, the user will pass >>> in the index of the bit to clear as the value. >> This is interesting, and I can see other uses of such a data structure. >> However, a couple of questions (talking mostly about the 'raw' bitmap >> without the bloom filter enabled): >> >> - How are you envisioning synchronisation to work? The code is using the >> atomic set_bit() operation, but there's no test_and_{set,clear}_bit(). >> Any thoughts on how users would do this? > I was thinking for users who are doing concurrent lookups + updates, > they are responsible for synchronizing the operations through mutexes. > Do you think this makes sense / is reasonable? Right, that is an option, of course, but it's a bit heavyweight. 
Atomic bitops are a nice light-weight synchronisation primitive. Hmm, looking at your code again, you're already using test_and_clear_bit() in pop_elem(). So why not just mirror that to test_and_set_bit() in push_elem(), and change the returns to EEXIST and ENOENT if trying to set or clear a bit that is already set or cleared (respectively)? >> - It would be useful to expose the "find first set (ffs)" operation of >> the bitmap as well. This can be added later, but thinking about the >> API from the start would be good to avoid having to add a whole >> separate helper for this. My immediate thought is to reserve peek(-1) >> for this use - WDYT? > I think using peek(-1) for "find first set" sounds like a great idea! Awesome! >> - Any thoughts on inlining the lookups? This should at least be feasible >> for the non-bloom-filter type, but I'm not quite sure if the use of >> map_extra allows the verifier to distinguish between the map types >> (I'm a little fuzzy on how the inlining actually works)? And can >> peek()/push()/pop() be inlined at all? > > I am not too familiar with how bpf instructions and inlining works, but > from a first glance, this looks doable for both the non-bloom filter > and bloom filter cases. From my cursory understanding of how it works, > it seems like we could have something like "bitset_map_gen_lookup" where > we parse the bpf_map->map_extra to see if the bloom filter is enabled; > if it is, we could call the hash function directly to compute which bit > to look up, > and then use the same insn logic for looking up the bit in both cases > (the bitmap w/ and w/out the bloom filter). > > I don't think there is support yet in the verifier for inlining > peek()/push()/pop(), but it seems like this should be doable as well. > > I think these changes would maybe warrant a separate patchset > on top of this one. What are your thoughts? Ah yes, I think you're right, this should be possible to add later. 
I'm fine with deferring that to a separate series, then :) -Toke
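The test-and-set/test-and-clear variant suggested above could look roughly like this (a user-space sketch, with GCC atomic builtins standing in for the kernel's test_and_{set,clear}_bit(); bounds checks omitted for brevity):

```c
#include <errno.h>
#include <stdint.h>

/* Atomically set bit 'idx'; -EEXIST if it was already set. */
static int bitset_push(uint64_t *words, uint32_t idx)
{
    uint64_t mask = 1ULL << (idx % 64);
    uint64_t old = __atomic_fetch_or(&words[idx / 64], mask, __ATOMIC_RELAXED);
    return (old & mask) ? -EEXIST : 0;
}

/* Atomically clear bit 'idx'; -ENOENT if it was already clear. */
static int bitset_pop(uint64_t *words, uint32_t idx)
{
    uint64_t mask = 1ULL << (idx % 64);
    uint64_t old = __atomic_fetch_and(&words[idx / 64], ~mask, __ATOMIC_RELAXED);
    return (old & mask) ? 0 : -ENOENT;
}
```

Because the read-modify-write is a single atomic operation, concurrent callers get a coherent answer about who flipped the bit without any external mutex.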
On Wed, Oct 6, 2021 at 3:27 PM Joanne Koong <joannekoong@fb.com> wrote: > > This patch adds the kernel-side changes for the implementation of > a bitset map with bloom filter capabilities. > > The bitset map does not have keys, only values since it is a > non-associative data type. When the bitset map is created, it must > be created with a key_size of 0, and the max_entries value should be the > desired size of the bitset, in number of bits. > > The bitset map supports peek (determining whether a bit is set in the > map), push (setting a bit in the map), and pop (clearing a bit in the > map) operations. These operations are exposed to userspace applications > through the already existing syscalls in the following way: > > BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem > BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem > BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem > > For updates, the user will pass in a NULL key and the index of the > bit to set in the bitmap as the value. For lookups, the user will pass > in the index of the bit to check as the value. If the bit is set, 0 > will be returned, else -ENOENT. For clearing the bit, the user will pass > in the index of the bit to clear as the value. > > Since we need to pass in the index of the bit to set/clear in the bitmap While I can buy that Bloom filter doesn't have a key, bitset clearly has and that key is bit index. Are we all sure that this marriage of bit set and bloom filter is the best API design? > whenever we do a lookup/clear, in the verifier layer this requires us to > modify the argument type of a bitset's BPF_FUNC_map_peek_elem and > BPF_FUNC_map_pop_elem calls to ARG_PTR_TO_MAP_VALUE; correspondingly, in > the syscall layer, we need to copy over the user value so that in > bpf_map_peek_elem and bpf_map_pop_elem, we have the specific > value to check. > > The bitset map may be used as an inner map. > > The bitset map may also have additional bloom filter capabilities. 
The > lower 4 bits of the map_extra flags for the bitset map denote how many > hash functions to use. If more than 0 hash functions is specified, the > bitset map will operate as a bloom filter where a set of bits are > set/checked where the bits are the results from the bloom filter > functions. Right now, jhash is function used; in the future, this can be > expanded to use map_extra to specify which hash function to use. Please make sure that map_extra is forced to zero for all the existing maps, for future extensibility. > > A few things to additionally please take note of: > * If there are any concurrent lookups + updates in the bloom filter, the > user is responsible for synchronizing this to ensure no false negative > lookups occur. > * Deleting an element in the bloom filter map is not supported. > * The benchmarks later in this patchset can help compare the performance > of using different number of hashes on different entry sizes. In general, > using more hashes increases the false positive rate of an element being probably meant "decreases" here? > detected in the bloom filter but decreases the speed of a lookup. > > Signed-off-by: Joanne Koong <joannekoong@fb.com> > --- > include/linux/bpf.h | 2 + > include/linux/bpf_types.h | 1 + > include/uapi/linux/bpf.h | 9 ++ > kernel/bpf/Makefile | 2 +- > kernel/bpf/bitset.c | 256 +++++++++++++++++++++++++++++++++ > kernel/bpf/syscall.c | 25 +++- > kernel/bpf/verifier.c | 10 +- > tools/include/uapi/linux/bpf.h | 9 ++ > 8 files changed, 308 insertions(+), 6 deletions(-) > create mode 100644 kernel/bpf/bitset.c > [...]
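For reference, the Bloom-filter mode described in the patch (lower 4 bits of map_extra selecting the number of hash functions) amounts to something like the following sketch; `hash_seeded()` here is an arbitrary stand-in mixer, not the kernel's jhash, and the power-of-two size is an assumption:

```c
#include <stdint.h>

#define NR_BITS 4096  /* bitset size in bits; power of two for cheap modulo */

/* Stand-in seeded hash; the kernel uses jhash with per-hash seeds. */
static uint32_t hash_seeded(uint32_t value, uint32_t seed)
{
    uint32_t h = value ^ (seed * 0x9e3779b9u);
    h ^= h >> 16; h *= 0x7feb352du;
    h ^= h >> 15; h *= 0x846ca68bu;
    h ^= h >> 16;
    return h;
}

/* push: set one bit per hash function */
static void bloom_push(uint64_t *bits, uint32_t value, uint32_t nr_hashes)
{
    for (uint32_t i = 0; i < nr_hashes; i++) {
        uint32_t idx = hash_seeded(value, i) % NR_BITS;
        bits[idx / 64] |= 1ULL << (idx % 64);
    }
}

/* peek: 1 if 'value' may be in the set, 0 if definitely not */
static int bloom_peek(const uint64_t *bits, uint32_t value, uint32_t nr_hashes)
{
    for (uint32_t i = 0; i < nr_hashes; i++) {
        uint32_t idx = hash_seeded(value, i) % NR_BITS;
        if (!((bits[idx / 64] >> (idx % 64)) & 1))
            return 0;
    }
    return 1;
}
```

This makes the advertised guarantee concrete: a pushed value can never peek as absent (no false negatives), but an absent value may peek as present if all of its bits happen to be set by other values.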
On Fri, Oct 8, 2021 at 2:58 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: > > Joanne Koong <joannekoong@fb.com> writes: > > > On 10/7/21 7:20 AM, Toke Høiland-Jørgensen wrote: > > > >> Joanne Koong <joannekoong@fb.com> writes: > >> > >>> This patch adds the kernel-side changes for the implementation of > >>> a bitset map with bloom filter capabilities. > >>> > >>> The bitset map does not have keys, only values since it is a > >>> non-associative data type. When the bitset map is created, it must > >>> be created with a key_size of 0, and the max_entries value should be the > >>> desired size of the bitset, in number of bits. > >>> > >>> The bitset map supports peek (determining whether a bit is set in the > >>> map), push (setting a bit in the map), and pop (clearing a bit in the > >>> map) operations. These operations are exposed to userspace applications > >>> through the already existing syscalls in the following way: > >>> > >>> BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem > >>> BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem > >>> BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem > >>> > >>> For updates, the user will pass in a NULL key and the index of the > >>> bit to set in the bitmap as the value. For lookups, the user will pass > >>> in the index of the bit to check as the value. If the bit is set, 0 > >>> will be returned, else -ENOENT. For clearing the bit, the user will pass > >>> in the index of the bit to clear as the value. > >> This is interesting, and I can see other uses of such a data structure. > >> However, a couple of questions (talking mostly about the 'raw' bitmap > >> without the bloom filter enabled): > >> > >> - How are you envisioning synchronisation to work? The code is using the > >> atomic set_bit() operation, but there's no test_and_{set,clear}_bit(). > >> Any thoughts on how users would do this? 
> > I was thinking for users who are doing concurrent lookups + updates, > > they are responsible for synchronizing the operations through mutexes. > > Do you think this makes sense / is reasonable? > > Right, that is an option, of course, but it's a bit heavyweight. Atomic > bitops are a nice light-weight synchronisation primitive. > > Hmm, looking at your code again, you're already using > test_and_clear_bit() in pop_elem(). So why not just mirror that to > test_and_set_bit() in push_elem(), and change the returns to EEXIST and > ENOENT if trying to set or clear a bit that is already set or cleared > (respectively)? > > >> - It would be useful to expose the "find first set (ffs)" operation of > >> the bitmap as well. This can be added later, but thinking about the > >> API from the start would be good to avoid having to add a whole > >> separate helper for this. My immediate thought is to reserve peek(-1) > >> for this use - WDYT? > > I think using peek(-1) for "find first set" sounds like a great idea! > > Awesome! What's the intended use case for such an operation? But also, searching just the first set bit is not completely generic, I'd imagine that in practice (at least judging from bit operations done on u64s I've seen in the wild) you'd most likely need "first set bit after bit N", so peek(-N)?.. I think it's overkill to provide something like this, but even if we do, wouldn't BPF_MAP_GET_NEXT_KEY (except we now say it's a "value", not a "key", but that's nitpicking) be a sort of natural extension to provide this? Though it might be only available to user-space, right? Oh, and it won't work in Bloom filter "mode", right? > > >> - Any thoughts on inlining the lookups? This should at least be feasible > >> for the non-bloom-filter type, but I'm not quite sure if the use of > >> map_extra allows the verifier to distinguish between the map types > >> (I'm a little fuzzy on how the inlining actually works)? And can > >> peek()/push()/pop() be inlined at all? 
> > > > I am not too familiar with how bpf instructions and inlining works, but > > from a first glance, this looks doable for both the non-bloom filter > > and bloom filter cases. From my cursory understanding of how it works, > > it seems like we could have something like "bitset_map_gen_lookup" where > > we parse the bpf_map->map_extra to see if the bloom filter is enabled; > > if it is, we could call the hash function directly to compute which bit > > to look up, > > and then use the same insn logic for looking up the bit in both cases > > (the bitmap w/ and w/out the bloom filter). > > > > I don't think there is support yet in the verifier for inlining > > peek()/push()/pop(), but it seems like this should be doable as well. > > > > I think these changes would maybe warrant a separate patchset > > on top of this one. What are your thoughts? > > Ah yes, I think you're right, this should be possible to add later. I'm > fine with deferring that to a separate series, then :) > > -Toke >
On Fri, Oct 8, 2021 at 4:05 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Wed, Oct 6, 2021 at 3:27 PM Joanne Koong <joannekoong@fb.com> wrote: > > > > This patch adds the kernel-side changes for the implementation of > > a bitset map with bloom filter capabilities. > > > > The bitset map does not have keys, only values since it is a > > non-associative data type. When the bitset map is created, it must > > be created with a key_size of 0, and the max_entries value should be the > > desired size of the bitset, in number of bits. > > > > The bitset map supports peek (determining whether a bit is set in the > > map), push (setting a bit in the map), and pop (clearing a bit in the > > map) operations. These operations are exposed to userspace applications > > through the already existing syscalls in the following way: > > > > BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem > > BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem > > BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem > > > > For updates, the user will pass in a NULL key and the index of the > > bit to set in the bitmap as the value. For lookups, the user will pass > > in the index of the bit to check as the value. If the bit is set, 0 > > will be returned, else -ENOENT. For clearing the bit, the user will pass > > in the index of the bit to clear as the value. > > > > Since we need to pass in the index of the bit to set/clear in the bitmap > > While I can buy that Bloom filter doesn't have a key, bitset clearly > has and that key is bit index. Are we all sure that this marriage of > bit set and bloom filter is the best API design? > Adding on to this, as a user of the bpf helpers, peek, push, and pop have meaning to me as operating on data structures for which those are well defined, such as stacks and queues. For bitsets, "push"ing a bit doesn't make sense to me as a concept, so needing to use bpf_map_push_elem to set a bit is not intuitive to me. 
bpf_map_lookup_elem, bpf_map_update_elem, and bpf_map_delete_elem make more sense to me (though if we have update, delete is redundant). > > whenever we do a lookup/clear, in the verifier layer this requires us to > > modify the argument type of a bitset's BPF_FUNC_map_peek_elem and > > BPF_FUNC_map_pop_elem calls to ARG_PTR_TO_MAP_VALUE; correspondingly, in > > the syscall layer, we need to copy over the user value so that in > > bpf_map_peek_elem and bpf_map_pop_elem, we have the specific > > value to check. > > > > The bitset map may be used as an inner map. > > > > The bitset map may also have additional bloom filter capabilities. The > > lower 4 bits of the map_extra flags for the bitset map denote how many > > hash functions to use. If more than 0 hash functions is specified, the > > bitset map will operate as a bloom filter where a set of bits are > > set/checked where the bits are the results from the bloom filter > > functions. Right now, jhash is function used; in the future, this can be > > expanded to use map_extra to specify which hash function to use. > > Please make sure that map_extra is forced to zero for all the existing > maps, for future extensibility. > > > > > A few things to additionally please take note of: > > * If there are any concurrent lookups + updates in the bloom filter, the > > user is responsible for synchronizing this to ensure no false negative > > lookups occur. > > * Deleting an element in the bloom filter map is not supported. > > * The benchmarks later in this patchset can help compare the performance > > of using different number of hashes on different entry sizes. In general, > > using more hashes increases the false positive rate of an element being > > probably meant "decreases" here? > > > detected in the bloom filter but decreases the speed of a lookup. 
> > > > Signed-off-by: Joanne Koong <joannekoong@fb.com> > > --- > > include/linux/bpf.h | 2 + > > include/linux/bpf_types.h | 1 + > > include/uapi/linux/bpf.h | 9 ++ > > kernel/bpf/Makefile | 2 +- > > kernel/bpf/bitset.c | 256 +++++++++++++++++++++++++++++++++ > > kernel/bpf/syscall.c | 25 +++- > > kernel/bpf/verifier.c | 10 +- > > tools/include/uapi/linux/bpf.h | 9 ++ > > 8 files changed, 308 insertions(+), 6 deletions(-) > > create mode 100644 kernel/bpf/bitset.c > > > > [...]
On Fri, Oct 08, 2021 at 04:24:58PM -0700, Zvi Effron wrote: > On Fri, Oct 8, 2021 at 4:05 PM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Wed, Oct 6, 2021 at 3:27 PM Joanne Koong <joannekoong@fb.com> wrote: > > > > > > This patch adds the kernel-side changes for the implementation of > > > a bitset map with bloom filter capabilities. > > > > > > The bitset map does not have keys, only values since it is a > > > non-associative data type. When the bitset map is created, it must > > > be created with a key_size of 0, and the max_entries value should be the > > > desired size of the bitset, in number of bits. > > > > > > The bitset map supports peek (determining whether a bit is set in the > > > map), push (setting a bit in the map), and pop (clearing a bit in the > > > map) operations. These operations are exposed to userspace applications > > > through the already existing syscalls in the following way: > > > > > > BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem > > > BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem > > > BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem > > > > > > For updates, the user will pass in a NULL key and the index of the > > > bit to set in the bitmap as the value. For lookups, the user will pass > > > in the index of the bit to check as the value. If the bit is set, 0 > > > will be returned, else -ENOENT. For clearing the bit, the user will pass > > > in the index of the bit to clear as the value. > > > > > > Since we need to pass in the index of the bit to set/clear in the bitmap > > > > While I can buy that Bloom filter doesn't have a key, bitset clearly > > has and that key is bit index. Are we all sure that this marriage of > > bit set and bloom filter is the best API design? > > > > Adding on to this, as a user of the bpf helpers, peek, push, and pop have > meaning to me as operating on data structures for which those are well defined, > such as stacks and queues. 
For bitsets, "push"ing a bit doesn't make sense to > me as a concept, so needing to use bpf_map_push_elem to set a bit is not > intuitive to me. bpf_map_lookup_elem, bpf_map_update_elem, and > bpf_map_delete_elem make more sense to me (though if we have update, delete is > redundant). The peek, push, and pop helpers could be given different names like test_bit, set_bit, and clear_bit in bpf_helper_defs.h. I think this was also mentioned in the earlier discussion.
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: > On Fri, Oct 8, 2021 at 2:58 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: >> >> Joanne Koong <joannekoong@fb.com> writes: >> >> > On 10/7/21 7:20 AM, Toke Høiland-Jørgensen wrote: >> > >> >> Joanne Koong <joannekoong@fb.com> writes: >> >> >> >>> This patch adds the kernel-side changes for the implementation of >> >>> a bitset map with bloom filter capabilities. >> >>> >> >>> The bitset map does not have keys, only values since it is a >> >>> non-associative data type. When the bitset map is created, it must >> >>> be created with a key_size of 0, and the max_entries value should be the >> >>> desired size of the bitset, in number of bits. >> >>> >> >>> The bitset map supports peek (determining whether a bit is set in the >> >>> map), push (setting a bit in the map), and pop (clearing a bit in the >> >>> map) operations. These operations are exposed to userspace applications >> >>> through the already existing syscalls in the following way: >> >>> >> >>> BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem >> >>> BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem >> >>> BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem >> >>> >> >>> For updates, the user will pass in a NULL key and the index of the >> >>> bit to set in the bitmap as the value. For lookups, the user will pass >> >>> in the index of the bit to check as the value. If the bit is set, 0 >> >>> will be returned, else -ENOENT. For clearing the bit, the user will pass >> >>> in the index of the bit to clear as the value. >> >> This is interesting, and I can see other uses of such a data structure. >> >> However, a couple of questions (talking mostly about the 'raw' bitmap >> >> without the bloom filter enabled): >> >> >> >> - How are you envisioning synchronisation to work? The code is using the >> >> atomic set_bit() operation, but there's no test_and_{set,clear}_bit(). >> >> Any thoughts on how users would do this? 
>> > I was thinking for users who are doing concurrent lookups + updates, >> > they are responsible for synchronizing the operations through mutexes. >> > Do you think this makes sense / is reasonable? >> >> Right, that is an option, of course, but it's a bit heavyweight. Atomic >> bitops are a nice light-weight synchronisation primitive. >> >> Hmm, looking at your code again, you're already using >> test_and_clear_bit() in pop_elem(). So why not just mirror that to >> test_and_set_bit() in push_elem(), and change the returns to EEXIST and >> ENOENT if trying to set or clear a bit that is already set or cleared >> (respectively)? >> >> >> - It would be useful to expose the "find first set (ffs)" operation of >> >> the bitmap as well. This can be added later, but thinking about the >> >> API from the start would be good to avoid having to add a whole >> >> separate helper for this. My immediate thought is to reserve peek(-1) >> >> for this use - WDYT? >> > I think using peek(-1) for "find first set" sounds like a great idea! >> >> Awesome! > > What's the intended use case for such an operation? The 'find first set' operation is a single instruction on common architectures, so it's an efficient way of finding the first non-empty bucket if you index them in a bitmap; sch_qfq uses this, for instance. > But also searching just a first set bit is non completely generic, I'd > imagine that in practice (at least judging from bit operations done on > u64s I've seen in the wild) you'd most likely need "first set bit > after bit N", so peek(-N)?.. You're right as far as generality goes, but for my use case "after bit N" is not so important (you enqueue into different buckets and always need to find the lowest one). 
If using -N, the parameter would have to change to be explicitly signed, of course, but that's fine by me :) > I think it's an overkill to provide something like this, but even if > we do, wouldn't BPF_MAP_GET_NEXT_KEY (except we now say it's a > "value", not a "key", but that's nitpicking) be a sort of natural > extension to provide this? Though it might be only available to > user-space right? Yeah, thought about get_next_key() but as you say that doesn't work from BPF. It would make sense to *also* expose this to userspace through get_next_key(), though. > Oh, and it won't work in Bloom filter "mode", right? I expect not? But we could just return -EINVAL in that case? -Toke
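The operation under discussion is cheap because find-first-set maps to a single instruction; a sketch of what a peek(-1) could compute over a bitmap of u64 words (a hypothetical helper name, using the compiler's count-trailing-zeros builtin):

```c
#include <stdint.h>

/* Returns the index of the lowest set bit, or -1 if the bitmap is empty. */
static long bitset_ffs(const uint64_t *words, uint32_t nr_words)
{
    for (uint32_t i = 0; i < nr_words; i++)
        if (words[i])
            /* __builtin_ctzll counts trailing zeros, i.e. the lowest
             * set bit's position within this 64-bit word */
            return (long)i * 64 + __builtin_ctzll(words[i]);
    return -1;
}
```

The per-word scan is linear, but each non-empty word is resolved in one instruction, which is why bucket-index bitmaps (as in sch_qfq) are so effective.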
On Sat, Oct 9, 2021 at 3:10 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: > > Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: > > > On Fri, Oct 8, 2021 at 2:58 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: > >> > >> Joanne Koong <joannekoong@fb.com> writes: > >> > >> > On 10/7/21 7:20 AM, Toke Høiland-Jørgensen wrote: > >> > > >> >> Joanne Koong <joannekoong@fb.com> writes: > >> >> > >> >>> This patch adds the kernel-side changes for the implementation of > >> >>> a bitset map with bloom filter capabilities. > >> >>> > >> >>> The bitset map does not have keys, only values since it is a > >> >>> non-associative data type. When the bitset map is created, it must > >> >>> be created with a key_size of 0, and the max_entries value should be the > >> >>> desired size of the bitset, in number of bits. > >> >>> > >> >>> The bitset map supports peek (determining whether a bit is set in the > >> >>> map), push (setting a bit in the map), and pop (clearing a bit in the > >> >>> map) operations. These operations are exposed to userspace applications > >> >>> through the already existing syscalls in the following way: > >> >>> > >> >>> BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem > >> >>> BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem > >> >>> BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem > >> >>> > >> >>> For updates, the user will pass in a NULL key and the index of the > >> >>> bit to set in the bitmap as the value. For lookups, the user will pass > >> >>> in the index of the bit to check as the value. If the bit is set, 0 > >> >>> will be returned, else -ENOENT. For clearing the bit, the user will pass > >> >>> in the index of the bit to clear as the value. > >> >> This is interesting, and I can see other uses of such a data structure. > >> >> However, a couple of questions (talking mostly about the 'raw' bitmap > >> >> without the bloom filter enabled): > >> >> > >> >> - How are you envisioning synchronisation to work? 
The code is using the > >> >> atomic set_bit() operation, but there's no test_and_{set,clear}_bit(). > >> >> Any thoughts on how users would do this? > >> > I was thinking for users who are doing concurrent lookups + updates, > >> > they are responsible for synchronizing the operations through mutexes. > >> > Do you think this makes sense / is reasonable? > >> > >> Right, that is an option, of course, but it's a bit heavyweight. Atomic > >> bitops are a nice light-weight synchronisation primitive. > >> > >> Hmm, looking at your code again, you're already using > >> test_and_clear_bit() in pop_elem(). So why not just mirror that to > >> test_and_set_bit() in push_elem(), and change the returns to EEXIST and > >> ENOENT if trying to set or clear a bit that is already set or cleared > >> (respectively)? > >> > >> >> - It would be useful to expose the "find first set (ffs)" operation of > >> >> the bitmap as well. This can be added later, but thinking about the > >> >> API from the start would be good to avoid having to add a whole > >> >> separate helper for this. My immediate thought is to reserve peek(-1) > >> >> for this use - WDYT? > >> > I think using peek(-1) for "find first set" sounds like a great idea! > >> > >> Awesome! > > > > What's the intended use case for such an operation? > > The 'find first set' operation is a single instruction on common > architectures, so it's an efficient way of finding the first non-empty > bucket if you index them in a bitmap; sch_qfq uses this, for instance. There is also the extremely useful popcnt() instruction; it would be great to have that as well. There is also fls() (find last set bit), which is used extensively throughout the kernel. If we'd like to take this ad absurdum, there are a lot of useful operations defined in include/linux/bitops.h and include/linux/bitmap.h; I'm pretty sure one can come up with a use case for every one of those. The question is whether we should bloat the kernel with such helpers/operations? 
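For scale, each of the bitmap operations named above is a few lines over a u64-word array; population count, for instance, is roughly (a sketch mirroring the kernel's bitmap_weight(), via the compiler builtin):

```c
#include <stdint.h>

/* Total number of set bits across the bitmap (cf. kernel bitmap_weight()). */
static uint64_t bitmap_weight(const uint64_t *words, uint32_t nr_words)
{
    uint64_t total = 0;
    for (uint32_t i = 0; i < nr_words; i++)
        total += (uint64_t)__builtin_popcountll(words[i]);
    return total;
}
```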
I still think that we don't need a dedicated BITSET map. I'm not talking about Bloom filters here, mind you. With the Bloom filter I can't (in principle) beat the performance aspect, so I stopped trying to argue against that. But for BITSET map, there is next to zero reason (in my view) to have it as a dedicated map vs just implementing it as an ARRAY map. Or even, for cases where it's feasible, as a global variable array (avoiding BPF helper calls completely). Any bitset operation is literally one bpf_map_lookup_elem() helper call and one logical and/or/xor operation. And you can choose whether it has to be atomic or not. And you don't even have to do power-of-2 sizing, if you don't want to (but your life will be a bit harder to prove stuff to the BPF verifier). Further, if one implements BITSET as just an ARRAY of u64s (for example), you get all the power of bpf_for_each_map_elem() and you can implement finding first unset bit, last unset bit, and anything in between. All using already existing primitives. Yes, there is a callback overhead, but with your custom ARRAY, you can optimize further by using something bigger than u64 as a "building block" of your ARRAY. E.g., you can have u64[8], and reduce the number of necessary callbacks by 8. But we are getting way too far ahead. My claim is that ARRAY is better than BITSET map and doesn't lose in performance (still one BPF helper call for basic operations). I'd love to hear specific arguments in favor of dedicated BITSET, though. > > > But also searching just a first set bit is non completely generic, I'd > > imagine that in practice (at least judging from bit operations done on > > u64s I've seen in the wild) you'd most likely need "first set bit > > after bit N", so peek(-N)?.. > > You're right as far as generality goes, but for my use case "after bit > N" is not so important (you enqueue into different buckets and always > need to find the lowest one. 
But there could be other use cases for > "find first set after N", of course. If using -N the parameter would > have to change to explicitly signed, of course, but that's fine by me :) > > > I think it's an overkill to provide something like this, but even if > > we do, wouldn't BPF_MAP_GET_NEXT_KEY (except we now say it's a > > "value", not a "key", but that's nitpicking) be a sort of natural > > extension to provide this? Though it might be only available to > > user-space right? > > Yeah, thought about get_next_key() but as you say that doesn't work from > BPF. It would make sense to *also* expose this to userspace through > get_next_key(), though. > > > Oh, and it won't work in Bloom filter "mode", right? > > I expect not? But we could just return -EINVAL in that case? > > -Toke >
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: >> >> The 'find first set' operation is a single instruction on common >> architectures, so it's an efficient way of finding the first non-empty >> bucket if you index them in a bitmap; sch_qfq uses this, for instance. > > There is also extremely useful popcnt() instruction, would be great to > have that as well. There is also fls() (find largest set bit), it is > used extensively throughout the kernel. If we'd like to take this ad > absurdum, there are a lot of useful operations defined in > include/linux/bitops.h and include/linux/bitmap.h, I'm pretty sure one > can come up with a use case for every one of those. > > The question is whether we should bloat the kernel with such > helpers/operations? I agree, all of those are interesting bitwise operations that would be useful to expose to BPF. But if we're not going to "bloat the kernel" with them, what should we do? Introduce new BPF instructions? > I'd love to hear specific arguments in favor of dedicated BITSET, > though. Mainly the above; given the right instructions, I totally buy your assertion that one can build a bitmap using regular BPF arrays... -Toke
On 10/12/21 5:48 AM, Toke Høiland-Jørgensen wrote: > Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: > >>> The 'find first set' operation is a single instruction on common >>> architectures, so it's an efficient way of finding the first non-empty >>> bucket if you index them in a bitmap; sch_qfq uses this, for instance. >> There is also extremely useful popcnt() instruction, would be great to >> have that as well. There is also fls() (find largest set bit), it is >> used extensively throughout the kernel. If we'd like to take this ad >> absurdum, there are a lot of useful operations defined in >> include/linux/bitops.h and include/linux/bitmap.h, I'm pretty sure one >> can come up with a use case for every one of those. >> >> The question is whether we should bloat the kernel with such >> helpers/operations? > I agree, all of those are interesting bitwise operations that would be > useful to expose to BPF. But if we're not going to "bloat the kernel" > with them, what should we do? Introduce new BPF instructions? > >> I'd love to hear specific arguments in favor of dedicated BITSET, >> though. > Mainly the above; given the right instructions, I totally buy your > assertion that one can build a bitmap using regular BPF arrays... > > -Toke I have the same opinion as Toke here - the most compelling reason I see for the bitset map to be supported by the kernel is so we can support a wider set of bit operations that wouldn't be available strictly through bpf. I'm also open to adding the bloom filter map and then in the future, if/when there is a need for the bitset map, adding that as a separate map. In that case, we could have the bitset map take in both key and value where key = the bitset index and value = 0 or 1.
On Tue, Oct 12, 2021 at 3:47 PM Joanne Koong <joannekoong@fb.com> wrote: > > On 10/12/21 5:48 AM, Toke Høiland-Jørgensen wrote: > > > Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: > > > >>> The 'find first set' operation is a single instruction on common > >>> architectures, so it's an efficient way of finding the first non-empty > >>> bucket if you index them in a bitmap; sch_qfq uses this, for instance. > >> There is also extremely useful popcnt() instruction, would be great to > >> have that as well. There is also fls() (find largest set bit), it is > >> used extensively throughout the kernel. If we'd like to take this ad > >> absurdum, there are a lot of useful operations defined in > >> include/linux/bitops.h and include/linux/bitmap.h, I'm pretty sure one > >> can come up with a use case for every one of those. > >> > >> The question is whether we should bloat the kernel with such > >> helpers/operations? > > I agree, all of those are interesting bitwise operations that would be > > useful to expose to BPF. But if we're not going to "bloat the kernel" > > with them, what should we do? Introduce new BPF instructions? > > > >> I'd love to hear specific arguments in favor of dedicated BITSET, > >> though. > > Mainly the above; given the right instructions, I totally buy your > > assertion that one can build a bitmap using regular BPF arrays... > > > > -Toke > I have the same opinion as Toke here - the most compelling reason I > see for the bitset map to be supported by the kernel is so we can > support a wider set of bit operations that wouldn't be available > strictly through bpf. > Do we need a new map type to support those extra bit operations? If we're not implementing them as helpers (or instructions), then I don't see how the new map type helps bring those operations to eBPF. If we are implementing them as helpers, do we need a new map type to do that? Can't we make helpers that operate on data instead of a map? 
A map feels like a pretty heavy-weight way to expose these operations to me. It requires user-space to create the map just so eBPF programs can use the operations. This feels, to me, like it mixes the "persistent storage" capability of maps with the bit operations goal being discussed. Making helpers that operate on data would allow persistent storage in a map if that's where the data lives, but also using the operations on non-persistent data if desired. --Zvi > I'm also open to adding the bloom filter map and then in the > future, if/when there is a need for the bitset map, adding that as a > separate map. In that case, we could have the bitset map take in > both key and value where key = the bitset index and value = 0 or 1.
On Tue, Oct 12, 2021 at 03:46:47PM -0700, Joanne Koong wrote: > I'm also open to adding the bloom filter map and then in the > future, if/when there is a need for the bitset map, adding that as a > separate map. In that case, we could have the bitset map take in > both key and value where key = the bitset index and value = 0 or 1. v4 uses nr_hash_funcs == 0 (i.e. map_extra == 0) to mean bitset and non-zero to mean bloom filter. Since the existing no-key API can be reused and works fine with a bloom filter, my current thinking is that it can start with the bloom filter first and require nr_hash_funcs to be non-zero. If in the future a bitset map is needed and the bloom filter API can be reused, the non-zero check can be relaxed. If not, a separate API/map needs to be created for the bitset anyway.
On 10/12/21 4:25 PM, Zvi Effron wrote: > On Tue, Oct 12, 2021 at 3:47 PM Joanne Koong <joannekoong@fb.com> wrote: >> On 10/12/21 5:48 AM, Toke Høiland-Jørgensen wrote: >> >>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes: >>> >>>>> The 'find first set' operation is a single instruction on common >>>>> architectures, so it's an efficient way of finding the first non-empty >>>>> bucket if you index them in a bitmap; sch_qfq uses this, for instance. >>>> There is also extremely useful popcnt() instruction, would be great to >>>> have that as well. There is also fls() (find largest set bit), it is >>>> used extensively throughout the kernel. If we'd like to take this ad >>>> absurdum, there are a lot of useful operations defined in >>>> include/linux/bitops.h and include/linux/bitmap.h, I'm pretty sure one >>>> can come up with a use case for every one of those. >>>> >>>> The question is whether we should bloat the kernel with such >>>> helpers/operations? >>> I agree, all of those are interesting bitwise operations that would be >>> useful to expose to BPF. But if we're not going to "bloat the kernel" >>> with them, what should we do? Introduce new BPF instructions? >>> >>>> I'd love to hear specific arguments in favor of dedicated BITSET, >>>> though. >>> Mainly the above; given the right instructions, I totally buy your >>> assertion that one can build a bitmap using regular BPF arrays... >>> >>> -Toke >> I have the same opinion as Toke here - the most compelling reason I >> see for the bitset map to be supported by the kernel is so we can >> support a wider set of bit operations that wouldn't be available >> strictly through bpf. >> > Do we need a new map type to support those extra bit operations? > If we're not implementing them as helpers (or instructions), then I don't see > how the new map type helps bring those operations to eBPF. > > If we are implementing them as helpers, do we need a new map type to do that? 
> Can't we make helpers that operate on data instead of a map? I'm presuming the bitset data would reside in the ARRAY map (to cover map-in-map use cases, and to bypass verifier out-of-bounds issues that would (or might, not 100% sure) arise from indexing into a global array). I think the cleanest way then to support a large amount of special case bit operations would be to have one bpf helper function which takes in a map and a "flags" where "flags" indicates which type of special-case bit operation to do. We could, if we wanted to, use the ARRAY map for this, but to me it seems cleaner and safer to have the map be a separate BITSET map where we can make guarantees about the map (eg bitset size can be enforced to reject out of bounds indices) If the bitset data could reside in a global array in the bpf program, then I agree - it seems like we could just make a helper function that takes in an ARG_PTR_TO_MEM where we pass the data in as a ptr, instead of needing a map. > A map feels like a pretty heavy-weight way to expose these operations to me. It > requires user-space to create the map just so eBPF programs can use the > operations. This feels, to me, like it mixes the "persistent storage" > capability of maps with the bit operations goal being discussed. Making helpers > that operate on data would allow persistent storage in a map if that's where > the data lives, but also using the operations on non-persistent data if > desired. > > --Zvi > >> I'm also open to adding the bloom filter map and then in the >> future, if/when there is a need for the bitset map, adding that as a >> separate map. In that case, we could have the bitset map take in >> both key and value where key = the bitset index and value = 0 or 1.
On 10/12/21 5:11 PM, Martin KaFai Lau wrote: > On Tue, Oct 12, 2021 at 03:46:47PM -0700, Joanne Koong wrote: >> I'm also open to adding the bloom filter map and then in the >> future, if/when there is a need for the bitset map, adding that as a >> separate map. In that case, we could have the bitset map take in >> both key and value where key = the bitset index and value = 0 or 1. > v4 uses nr_hash_funcs is 0 (i.e. map_extra is 0) to mean bitset > and non-zero to mean bloom filter. > > Since the existing no-key API can be reused and work fine with bloom > filter, my current thinking is it can start with bloom filter first > and limit the nr_hash_funcs to be non-zero. > > If in the future a bitset map is needed and the bloom filter API can > be reused, the non-zero check can be relaxed. If not, a separate > API/map needs to be created for bitset anyway. > sounds good to me. let's drop bitset for now since there doesn't seem to be a consensus.
On 10/12/21 6:17 PM, Joanne Koong wrote: > If the bitset data could reside in a global array in the bpf program, > then I > agree - it seems like we could just make a helper function that takes in > an ARG_PTR_TO_MEM where we pass the data in as a ptr, instead of needing > a map. The main advantage of helpers for bit ops is ease of implementation. We can add set/test/clear_bit along with ffs, fls, and others very quickly. But programs that need to do atomic bit ops probably care about performance, so a non-inlined helper might be too slow. Doing the same via new instructions will take a ton more time, but the long-term benefits of insns are worth considering.
On Tue, Oct 12, 2021 at 9:41 PM Alexei Starovoitov <ast@fb.com> wrote: > > On 10/12/21 5:11 PM, Martin KaFai Lau wrote: > > On Tue, Oct 12, 2021 at 03:46:47PM -0700, Joanne Koong wrote: > >> I'm also open to adding the bloom filter map and then in the > >> future, if/when there is a need for the bitset map, adding that as a > >> separate map. In that case, we could have the bitset map take in > >> both key and value where key = the bitset index and value = 0 or 1. > > v4 uses nr_hash_funcs is 0 (i.e. map_extra is 0) to mean bitset > > and non-zero to mean bloom filter. > > > > Since the existing no-key API can be reused and work fine with bloom > > filter, my current thinking is it can start with bloom filter first > > and limit the nr_hash_funcs to be non-zero. > > > > If in the future a bitset map is needed and the bloom filter API can > > be reused, the non-zero check can be relaxed. If not, a separate > > API/map needs to be created for bitset anyway. > > > > sounds good to me. > let's drop bitset for now since there doesn't seem to be a consensus. sgtm too
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d604c8251d88..eac5bcecc6a7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -193,6 +193,8 @@ struct bpf_map { struct work_struct work; struct mutex freeze_mutex; u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ + + u32 map_extra; /* any per-map-type extra fields */ }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9c81724e4b98..85339faeca71 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_BITSET, bitset_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6fc59d61937a..b40fa1a72a75 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -906,6 +906,7 @@ enum bpf_map_type { BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BITSET, }; /* Note that tracing related programs such as @@ -1252,6 +1253,13 @@ struct bpf_stack_build_id { #define BPF_OBJ_NAME_LEN 16U +/* map_extra flags for bitset maps + * + * The lowest 4 bits are reserved for indicating the number of hash functions. + * If the number of hash functions is greater than 0, the bitset map will + * be used as a bloom filter. 
+ */ + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -1274,6 +1282,7 @@ union bpf_attr { * struct stored as the * map value */ + __u32 map_extra; /* any per-map-type extra fields */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7f33098ca63f..72e381adc708 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -7,7 +7,7 @@ endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bitset.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o diff --git a/kernel/bpf/bitset.c b/kernel/bpf/bitset.c new file mode 100644 index 000000000000..a5fca0ebb520 --- /dev/null +++ b/kernel/bpf/bitset.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <linux/bitmap.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/err.h> +#include <linux/jhash.h> +#include <linux/random.h> + +#define BITSET_MAP_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK) + +struct bpf_bitset_map { + struct bpf_map map; + + /* If the number of hash functions at map creation time is greater + * than 0, the bitset map will function as a bloom filter and the fields + * in the struct below will be initialized accordingly. 
+ */ + struct { + u32 nr_hash_funcs; + u32 bitset_mask; + u32 hash_seed; + /* If the size of the values in the bloom filter is u32 aligned, + * then it is more performant to use jhash2 as the underlying hash + * function, else we use jhash. This tracks the number of u32s + * in an u32-aligned value size. If the value size is not u32 aligned, + * this will be 0. + */ + u32 aligned_u32_count; + } bloom_filter; + + unsigned long bitset[]; +}; + +static inline bool use_bloom_filter(struct bpf_bitset_map *map) +{ + return map->bloom_filter.nr_hash_funcs > 0; +} + +static u32 hash(struct bpf_bitset_map *map, void *value, + u64 value_size, u32 index) +{ + u32 h; + + if (map->bloom_filter.aligned_u32_count) + h = jhash2(value, map->bloom_filter.aligned_u32_count, + map->bloom_filter.hash_seed + index); + else + h = jhash(value, value_size, map->bloom_filter.hash_seed + index); + + return h & map->bloom_filter.bitset_mask; +} + +static int bitset_map_push_elem(struct bpf_map *map, void *value, + u64 flags) +{ + struct bpf_bitset_map *bitset_map = + container_of(map, struct bpf_bitset_map, map); + u32 i, h, bitset_index; + + if (flags != BPF_ANY) + return -EINVAL; + + if (use_bloom_filter(bitset_map)) { + for (i = 0; i < bitset_map->bloom_filter.nr_hash_funcs; i++) { + h = hash(bitset_map, value, map->value_size, i); + set_bit(h, bitset_map->bitset); + } + } else { + bitset_index = *(u32 *)value; + + if (bitset_index >= map->max_entries) + return -EINVAL; + + set_bit(bitset_index, bitset_map->bitset); + } + + return 0; +} + +static int bitset_map_peek_elem(struct bpf_map *map, void *value) +{ + struct bpf_bitset_map *bitset_map = + container_of(map, struct bpf_bitset_map, map); + u32 i, h, bitset_index; + + if (use_bloom_filter(bitset_map)) { + for (i = 0; i < bitset_map->bloom_filter.nr_hash_funcs; i++) { + h = hash(bitset_map, value, map->value_size, i); + if (!test_bit(h, bitset_map->bitset)) + return -ENOENT; + } + } else { + bitset_index = *(u32 *)value; + if 
(bitset_index >= map->max_entries) + return -EINVAL; + + if (!test_bit(bitset_index, bitset_map->bitset)) + return -ENOENT; + } + + return 0; +} + +static int bitset_map_pop_elem(struct bpf_map *map, void *value) +{ + struct bpf_bitset_map *bitset_map = + container_of(map, struct bpf_bitset_map, map); + u32 bitset_index; + + if (use_bloom_filter(bitset_map)) + return -EOPNOTSUPP; + + bitset_index = *(u32 *)value; + + if (!test_and_clear_bit(bitset_index, bitset_map->bitset)) + return -EINVAL; + + return 0; +} + +static void init_bloom_filter(struct bpf_bitset_map *bitset_map, union bpf_attr *attr, + u32 nr_hash_funcs, u32 bitset_mask) +{ + bitset_map->bloom_filter.nr_hash_funcs = nr_hash_funcs; + bitset_map->bloom_filter.bitset_mask = bitset_mask; + + /* Check whether the value size is u32-aligned */ + if ((attr->value_size & (sizeof(u32) - 1)) == 0) + bitset_map->bloom_filter.aligned_u32_count = + attr->value_size / sizeof(u32); + + if (!(attr->map_flags & BPF_F_ZERO_SEED)) + bitset_map->bloom_filter.hash_seed = get_random_int(); +} + +static struct bpf_map *bitset_map_alloc(union bpf_attr *attr) +{ + int numa_node = bpf_map_attr_numa_node(attr); + u32 bitset_bytes, bitset_mask, nr_hash_funcs; + struct bpf_bitset_map *bitset_map; + u64 nr_bits_roundup_pow2; + + if (!bpf_capable()) + return ERR_PTR(-EPERM); + + if (attr->key_size != 0 || attr->max_entries == 0 || + attr->map_flags & ~BITSET_MAP_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) + return ERR_PTR(-EINVAL); + + if (attr->map_extra & ~0xF) + return ERR_PTR(-EINVAL); + + /* The lower 4 bits of map_extra specify the number of hash functions */ + nr_hash_funcs = attr->map_extra & 0xF; + + if (!nr_hash_funcs) { + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* Round up to the size of an unsigned long since a bit gets set + * at the granularity of an unsigned long. 
+ */ + bitset_bytes = roundup(BITS_TO_BYTES(attr->max_entries), + sizeof(unsigned long)); + } else { + /* If the number of hash functions > 0, then the map will + * function as a bloom filter + */ + + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + + /* We round up the size of the bitset to the nearest power of two to + * enable more efficient hashing using a bitmask. The bitmask will be + * the bitset size - 1. + */ + nr_bits_roundup_pow2 = roundup_pow_of_two(attr->max_entries); + bitset_mask = nr_bits_roundup_pow2 - 1; + + bitset_bytes = roundup(BITS_TO_BYTES(nr_bits_roundup_pow2), + sizeof(unsigned long)); + } + + bitset_map = bpf_map_area_alloc(sizeof(*bitset_map) + bitset_bytes, + numa_node); + if (!bitset_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&bitset_map->map, attr); + + if (nr_hash_funcs) + init_bloom_filter(bitset_map, attr, nr_hash_funcs, bitset_mask); + + return &bitset_map->map; +} + +static void bitset_map_free(struct bpf_map *map) +{ + struct bpf_bitset_map *bitset_map = + container_of(map, struct bpf_bitset_map, map); + + bpf_map_area_free(bitset_map); +} + +static void *bitset_map_lookup_elem(struct bpf_map *map, void *key) +{ + /* The eBPF program should use map_peek_elem instead */ + return ERR_PTR(-EINVAL); +} + +static int bitset_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + /* The eBPF program should use map_push_elem instead */ + return -EINVAL; +} + +static int bitset_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EOPNOTSUPP; +} + +static int bitset_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -EOPNOTSUPP; +} + +static int bitset_map_check_btf(const struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Bitset maps are keyless */ + return btf_type_is_void(key_type) ? 
0 : -EINVAL; +} + +static int bpf_bitset_map_btf_id; +const struct bpf_map_ops bitset_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = bitset_map_alloc, + .map_free = bitset_map_free, + .map_push_elem = bitset_map_push_elem, + .map_peek_elem = bitset_map_peek_elem, + .map_pop_elem = bitset_map_pop_elem, + .map_lookup_elem = bitset_map_lookup_elem, + .map_update_elem = bitset_map_update_elem, + .map_delete_elem = bitset_map_delete_elem, + .map_get_next_key = bitset_map_get_next_key, + .map_check_btf = bitset_map_check_btf, + .map_btf_name = "bpf_bitset_map", + .map_btf_id = &bpf_bitset_map_btf_id, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e50c0bfdb7d..7726774d972a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -199,7 +199,8 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, err = bpf_fd_reuseport_array_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { + map->map_type == BPF_MAP_TYPE_STACK || + map->map_type == BPF_MAP_TYPE_BITSET) { err = map->ops->map_push_elem(map, value, flags); } else { rcu_read_lock(); @@ -238,7 +239,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { + map->map_type == BPF_MAP_TYPE_STACK || + map->map_type == BPF_MAP_TYPE_BITSET) { err = map->ops->map_peek_elem(map, value); } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* struct_ops map requires directly updating "value" */ @@ -348,6 +350,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->max_entries = attr->max_entries; map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); + map->map_extra = 
attr->map_extra; } static int bpf_map_alloc_id(struct bpf_map *map) @@ -553,6 +556,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" + "map_extra:\t%#x\n" "memlock:\t%lu\n" "map_id:\t%u\n" "frozen:\t%u\n", @@ -561,6 +565,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, + map->map_extra, bpf_map_memory_footprint(map), map->id, READ_ONCE(map->frozen)); @@ -810,7 +815,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; } -#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD map_extra /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -1080,6 +1085,14 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; + if (map->map_type == BPF_MAP_TYPE_BITSET) { + if (copy_from_user(value, uvalue, value_size)) + err = -EFAULT; + else + err = bpf_map_copy_value(map, key, value, attr->flags); + goto free_value; + } + err = bpf_map_copy_value(map, key, value, attr->flags); if (err) goto free_value; @@ -1549,6 +1562,12 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_pop_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_BITSET) { + if (copy_from_user(value, uvalue, value_size)) + err = -EFAULT; + else + err = map->ops->map_pop_elem(map, value); + goto free_value; } else if (map->map_type == BPF_MAP_TYPE_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20900a1bac12..731cc90b6e98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5007,7 +5007,11 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, return -EINVAL; } break; - + case 
BPF_MAP_TYPE_BITSET: + if (meta->func_id == BPF_FUNC_map_peek_elem || + meta->func_id == BPF_FUNC_map_pop_elem) + *arg_type = ARG_PTR_TO_MAP_VALUE; + break; default: break; } @@ -5562,6 +5566,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_BITSET: if (func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_push_elem) @@ -5653,7 +5658,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_map_pop_elem: case BPF_FUNC_map_push_elem: if (map->map_type != BPF_MAP_TYPE_QUEUE && - map->map_type != BPF_MAP_TYPE_STACK) + map->map_type != BPF_MAP_TYPE_STACK && + map->map_type != BPF_MAP_TYPE_BITSET) goto error; break; case BPF_FUNC_sk_storage_get: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6fc59d61937a..b40fa1a72a75 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -906,6 +906,7 @@ enum bpf_map_type { BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BITSET, }; /* Note that tracing related programs such as @@ -1252,6 +1253,13 @@ struct bpf_stack_build_id { #define BPF_OBJ_NAME_LEN 16U +/* map_extra flags for bitset maps + * + * The lowest 4 bits are reserved for indicating the number of hash functions. + * If the number of hash functions is greater than 0, the bitset map will + * be used as a bloom filter. + */ + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -1274,6 +1282,7 @@ union bpf_attr { * struct stored as the * map value */ + __u32 map_extra; /* any per-map-type extra fields */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
This patch adds the kernel-side changes for the implementation of a bitset map with bloom filter capabilities.

The bitset map does not have keys, only values, since it is a non-associative data type. When the bitset map is created, it must be created with a key_size of 0, and the max_entries value should be the desired size of the bitset, in number of bits.

The bitset map supports peek (determining whether a bit is set in the map), push (setting a bit in the map), and pop (clearing a bit in the map) operations. These operations are exposed to userspace applications through the already existing syscalls in the following way:

BPF_MAP_UPDATE_ELEM -> bpf_map_push_elem
BPF_MAP_LOOKUP_ELEM -> bpf_map_peek_elem
BPF_MAP_LOOKUP_AND_DELETE_ELEM -> bpf_map_pop_elem

For updates, the user will pass in a NULL key and the index of the bit to set in the bitmap as the value. For lookups, the user will pass in the index of the bit to check as the value. If the bit is set, 0 will be returned, else -ENOENT. For clearing the bit, the user will pass in the index of the bit to clear as the value.

Since we need to pass in the index of the bit to set/clear in the bitmap whenever we do a lookup/clear, in the verifier layer this requires us to modify the argument type of a bitset's BPF_FUNC_map_peek_elem and BPF_FUNC_map_pop_elem calls to ARG_PTR_TO_MAP_VALUE; correspondingly, in the syscall layer, we need to copy over the user value so that in bpf_map_peek_elem and bpf_map_pop_elem, we have the specific value to check.

The bitset map may be used as an inner map.

The bitset map may also have additional bloom filter capabilities. The lower 4 bits of the map_extra flags for the bitset map denote how many hash functions to use. If one or more hash functions are specified, the bitset map will operate as a bloom filter, where the bits that are set/checked are determined by the outputs of the hash functions.
Right now, jhash is the hash function used; in the future, this can be expanded to use map_extra to specify which hash function to use.

A few things to additionally please take note of:

* If there are any concurrent lookups + updates in the bloom filter, the user is responsible for synchronizing this to ensure no false negative lookups occur.
* Deleting an element in the bloom filter map is not supported.
* The benchmarks later in this patchset can help compare the performance of using different numbers of hashes on different entry sizes. In general, using more hashes decreases the false positive rate of an element being detected in the bloom filter, but also decreases the speed of a lookup.

Signed-off-by: Joanne Koong <joannekoong@fb.com>
---
 include/linux/bpf.h            |   2 +
 include/linux/bpf_types.h      |   1 +
 include/uapi/linux/bpf.h       |   9 ++
 kernel/bpf/Makefile            |   2 +-
 kernel/bpf/bitset.c            | 256 +++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  25 +++-
 kernel/bpf/verifier.c          |  10 +-
 tools/include/uapi/linux/bpf.h |   9 ++
 8 files changed, 308 insertions(+), 6 deletions(-)
 create mode 100644 kernel/bpf/bitset.c