Message ID | 20230630082516.16286-4-aspsk@isovalent.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Series | bpf: add percpu stats for bpf_map |
Hi,

On 6/30/2023 4:25 PM, Anton Protopopov wrote:
> Initialize and utilize the per-cpu insertions/deletions counters for hash-based
> maps. Non-trivial changes only apply to the preallocated maps for which the
> {inc,dec}_elem_count functions are not called, as there's no need in counting
> elements to sustain proper map operations.
>
> To increase/decrease percpu counters for preallocated maps we add raw calls to
> the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
> dynamically allocated maps we add corresponding calls to the existing
> {inc,dec}_elem_count functions.
>
> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> ---
>  kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
>  1 file changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 56d3da7d0bc6..faaef4fd3df0 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  		}
>  	}
>
> +	err = bpf_map_init_elem_count(&htab->map);
> +	if (err)
> +		goto free_extra_elements;

Considering the per-cpu counter is not always needed, is it a good idea
to make the elem_count optional by introducing a new map flag?

> +
>  	return &htab->map;
>
> +free_extra_elements:
> +	free_percpu(htab->extra_elems);
>  free_prealloc:
>  	prealloc_destroy(htab);

Need to check prealloc before calling prealloc_destroy(htab), otherwise
for non-preallocated percpu htab prealloc_destroy() will trigger an invalid
memory dereference.

>  free_map_locked:
> @@ -804,6 +810,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
>  		if (l == tgt_l) {
>  			hlist_nulls_del_rcu(&l->hash_node);
>  			check_and_free_fields(htab, l);
> +			bpf_map_dec_elem_count(&htab->map);
>  			break;
>  		}
>
> @@ -900,6 +907,8 @@ static bool is_map_full(struct bpf_htab *htab)
>
>  static void inc_elem_count(struct bpf_htab *htab)
>  {
> +	bpf_map_inc_elem_count(&htab->map);
> +
>  	if (htab->use_percpu_counter)
>  		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
>  	else
> @@ -908,6 +917,8 @@ static void inc_elem_count(struct bpf_htab *htab)
>
>  static void dec_elem_count(struct bpf_htab *htab)
>  {
> +	bpf_map_dec_elem_count(&htab->map);
> +
>  	if (htab->use_percpu_counter)
>  		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
>  	else
> @@ -920,6 +931,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  	htab_put_fd_value(htab, l);
>
>  	if (htab_is_prealloc(htab)) {
> +		bpf_map_dec_elem_count(&htab->map);
>  		check_and_free_fields(htab, l);
>  		__pcpu_freelist_push(&htab->freelist, &l->fnode);
>  	} else {
> @@ -1000,6 +1012,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
>  			if (!l)
>  				return ERR_PTR(-E2BIG);
>  			l_new = container_of(l, struct htab_elem, fnode);
> +			bpf_map_inc_elem_count(&htab->map);
>  		}
>  	} else {
>  		if (is_map_full(htab))
> @@ -1224,7 +1237,8 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
>  	if (l_old) {
>  		bpf_lru_node_set_ref(&l_new->lru_node);
>  		hlist_nulls_del_rcu(&l_old->hash_node);
> -	}
> +	} else
> +		bpf_map_inc_elem_count(&htab->map);
>  	ret = 0;
>
>  err:
> @@ -1351,6 +1365,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
>  		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
>  				value, onallcpus);
>  		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> +		bpf_map_inc_elem_count(&htab->map);
>  		l_new = NULL;
>  	}
>  	ret = 0;
> @@ -1437,9 +1452,10 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
>
>  	l = lookup_elem_raw(head, hash, key, key_size);
>
> -	if (l)
> +	if (l) {
> +		bpf_map_dec_elem_count(&htab->map);
>  		hlist_nulls_del_rcu(&l->hash_node);
> -	else
> +	} else
>  		ret = -ENOENT;

Also need to decrease elem_count for __htab_map_lookup_and_delete_batch()
and __htab_map_lookup_and_delete_elem() when is_lru_map is true. Maybe for
LRU map, we could just do bpf_map_dec_elem_count() in htab_lru_push_free()
and do bpf_map_inc_elem_count() in prealloc_lru_pop().

>
>  	htab_unlock_bucket(htab, b, hash, flags);
> @@ -1523,6 +1539,7 @@ static void htab_map_free(struct bpf_map *map)
>  		prealloc_destroy(htab);
>  	}
>
> +	bpf_map_free_elem_count(map);
>  	free_percpu(htab->extra_elems);
>  	bpf_map_area_free(htab->buckets);
>  	bpf_mem_alloc_destroy(&htab->pcpu_ma);
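To make the prealloc_destroy() concern concrete, here is a minimal sketch of an unwind order that only tears down the freelist when it was actually built. It assumes the local prealloc flag that htab_map_alloc() derives from BPF_F_NO_PREALLOC, and it only illustrates the review comment, not the fix that was eventually applied:

	/* Illustrative sketch only: error unwinding in htab_map_alloc() that
	 * skips prealloc_destroy() for non-preallocated maps. 'prealloc' is
	 * the existing !BPF_F_NO_PREALLOC bool computed earlier in the
	 * function; free_percpu(NULL) is a no-op, so falling through from
	 * free_extra_elements stays safe for both map flavours.
	 */
	err = bpf_map_init_elem_count(&htab->map);
	if (err)
		goto free_extra_elements;

	return &htab->map;

free_extra_elements:
	free_percpu(htab->extra_elems);
free_prealloc:
	if (prealloc)
		prealloc_destroy(htab);
free_map_locked:
	/* existing cleanup of buckets, allocators, etc. continues here */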
On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:
> Hi,
>
> On 6/30/2023 4:25 PM, Anton Protopopov wrote:
> > Initialize and utilize the per-cpu insertions/deletions counters for hash-based
> > maps. Non-trivial changes only apply to the preallocated maps for which the
> > {inc,dec}_elem_count functions are not called, as there's no need in counting
> > elements to sustain proper map operations.
> >
> > To increase/decrease percpu counters for preallocated maps we add raw calls to
> > the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
> > dynamically allocated maps we add corresponding calls to the existing
> > {inc,dec}_elem_count functions.
> >
> > Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> > ---
> >  kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
> >  1 file changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index 56d3da7d0bc6..faaef4fd3df0 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >  		}
> >  	}
> >
> > +	err = bpf_map_init_elem_count(&htab->map);
> > +	if (err)
> > +		goto free_extra_elements;
> Considering the per-cpu counter is not always needed, is it a good idea
> to make the elem_count optional by introducing a new map flag?

Per-map-flag or a static key? For me it looked like just doing an unconditional
`inc` for a per-cpu variable is better vs. doing a check then `inc` or an
unconditional jump.

> > +
> >  	return &htab->map;
> >
> > +free_extra_elements:
> > +	free_percpu(htab->extra_elems);
> >  free_prealloc:
> >  	prealloc_destroy(htab);
> Need to check prealloc before calling prealloc_destroy(htab), otherwise
> for non-preallocated percpu htab prealloc_destroy() will trigger an invalid
> memory dereference.

Thanks!

> >  free_map_locked:
> > @@ -804,6 +810,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> >  		if (l == tgt_l) {
> >  			hlist_nulls_del_rcu(&l->hash_node);
> >  			check_and_free_fields(htab, l);
> > +			bpf_map_dec_elem_count(&htab->map);
> >  			break;
> >  		}
> >
> > @@ -900,6 +907,8 @@ static bool is_map_full(struct bpf_htab *htab)
> >
> >  static void inc_elem_count(struct bpf_htab *htab)
> >  {
> > +	bpf_map_inc_elem_count(&htab->map);
> > +
> >  	if (htab->use_percpu_counter)
> >  		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
> >  	else
> > @@ -908,6 +917,8 @@ static void inc_elem_count(struct bpf_htab *htab)
> >
> >  static void dec_elem_count(struct bpf_htab *htab)
> >  {
> > +	bpf_map_dec_elem_count(&htab->map);
> > +
> >  	if (htab->use_percpu_counter)
> >  		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
> >  	else
> > @@ -920,6 +931,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
> >  	htab_put_fd_value(htab, l);
> >
> >  	if (htab_is_prealloc(htab)) {
> > +		bpf_map_dec_elem_count(&htab->map);
> >  		check_and_free_fields(htab, l);
> >  		__pcpu_freelist_push(&htab->freelist, &l->fnode);
> >  	} else {
> > @@ -1000,6 +1012,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
> >  			if (!l)
> >  				return ERR_PTR(-E2BIG);
> >  			l_new = container_of(l, struct htab_elem, fnode);
> > +			bpf_map_inc_elem_count(&htab->map);
> >  		}
> >  	} else {
> >  		if (is_map_full(htab))
> > @@ -1224,7 +1237,8 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
> >  	if (l_old) {
> >  		bpf_lru_node_set_ref(&l_new->lru_node);
> >  		hlist_nulls_del_rcu(&l_old->hash_node);
> > -	}
> > +	} else
> > +		bpf_map_inc_elem_count(&htab->map);
> >  	ret = 0;
> >
> >  err:
> > @@ -1351,6 +1365,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
> >  		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
> >  				value, onallcpus);
> >  		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> > +		bpf_map_inc_elem_count(&htab->map);
> >  		l_new = NULL;
> >  	}
> >  	ret = 0;
> > @@ -1437,9 +1452,10 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
> >
> >  	l = lookup_elem_raw(head, hash, key, key_size);
> >
> > -	if (l)
> > +	if (l) {
> > +		bpf_map_dec_elem_count(&htab->map);
> >  		hlist_nulls_del_rcu(&l->hash_node);
> > -	else
> > +	} else
> >  		ret = -ENOENT;
> Also need to decrease elem_count for
> __htab_map_lookup_and_delete_batch() and
> __htab_map_lookup_and_delete_elem() when is_lru_map is true. Maybe for
> LRU map, we could just do bpf_map_dec_elem_count() in
> htab_lru_push_free() and do bpf_map_inc_elem_count() in prealloc_lru_pop().

Thanks. I will fix the logic and extend the selftest to test the batch ops as
well.

> >
> >  	htab_unlock_bucket(htab, b, hash, flags);
> > @@ -1523,6 +1539,7 @@ static void htab_map_free(struct bpf_map *map)
> >  		prealloc_destroy(htab);
> >  	}
> >
> > +	bpf_map_free_elem_count(map);
> >  	free_percpu(htab->extra_elems);
> >  	bpf_map_area_free(htab->buckets);
> >  	bpf_mem_alloc_destroy(&htab->pcpu_ma);
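The LRU suggestion above would roughly amount to the sketch below, assuming the current shapes of prealloc_lru_pop() and htab_lru_push_free(); counting at these two choke points would also cover htab_lru_map_delete_elem(), htab_lru_map_delete_node() and the lookup-and-delete (batch) paths without touching each caller. It is an illustration of the idea, not the final patch:

static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		/* element leaves the LRU free list: account for it once here */
		bpf_map_inc_elem_count(&htab->map);
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
	check_and_free_fields(htab, elem);
	/* element returns to the LRU free list: drop it from the count here */
	bpf_map_dec_elem_count(&htab->map);
	bpf_lru_push_free(&htab->lru, &elem->lru_node);
}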
Hi,

On 7/4/2023 10:34 PM, Anton Protopopov wrote:
> On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 6/30/2023 4:25 PM, Anton Protopopov wrote:
>>> Initialize and utilize the per-cpu insertions/deletions counters for hash-based
>>> maps. Non-trivial changes only apply to the preallocated maps for which the
>>> {inc,dec}_elem_count functions are not called, as there's no need in counting
>>> elements to sustain proper map operations.
>>>
>>> To increase/decrease percpu counters for preallocated maps we add raw calls to
>>> the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
>>> dynamically allocated maps we add corresponding calls to the existing
>>> {inc,dec}_elem_count functions.
>>>
>>> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
>>> ---
>>>  kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
>>>  1 file changed, 20 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>> index 56d3da7d0bc6..faaef4fd3df0 100644
>>> --- a/kernel/bpf/hashtab.c
>>> +++ b/kernel/bpf/hashtab.c
>>> @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>>>  		}
>>>  	}
>>>
>>> +	err = bpf_map_init_elem_count(&htab->map);
>>> +	if (err)
>>> +		goto free_extra_elements;
>> Considering the per-cpu counter is not always needed, is it a good idea
>> to make the elem_count optional by introducing a new map flag?
> Per-map-flag or a static key? For me it looked like just doing an unconditional
> `inc` for a per-cpu variable is better vs. doing a check then `inc` or an
> unconditional jump.

Sorry, I didn't make it clear that I was worried about the allocated per-cpu
memory. Previously I thought the per-cpu memory was limited, but after doing
some experiments I found it is almost the same as kmalloc(), which can use all
available memory to fulfill the allocation request. For a host with 72 CPUs,
the memory overhead for 10k hash maps is about ~6MB. The overhead is tiny
compared with the total available memory, but it is avoidable.
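For reference, the ~6MB estimate is consistent with one per-cpu s64 per map, which is what this series is assumed to allocate:

    8 bytes (s64) x 72 CPUs    =  576 bytes of counter space per map
    576 bytes x 10,000 maps    = ~5.76 MB

i.e. roughly the quoted ~6MB once per-cpu allocator alignment and bookkeeping are added. The cost scales with the number of maps and CPUs, not with max_entries.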
On Thu, Jul 06, 2023 at 10:01:26AM +0800, Hou Tao wrote:
> Hi,
>
> On 7/4/2023 10:34 PM, Anton Protopopov wrote:
> > On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 6/30/2023 4:25 PM, Anton Protopopov wrote:
> >>> Initialize and utilize the per-cpu insertions/deletions counters for hash-based
> >>> maps. Non-trivial changes only apply to the preallocated maps for which the
> >>> {inc,dec}_elem_count functions are not called, as there's no need in counting
> >>> elements to sustain proper map operations.
> >>>
> >>> To increase/decrease percpu counters for preallocated maps we add raw calls to
> >>> the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
> >>> dynamically allocated maps we add corresponding calls to the existing
> >>> {inc,dec}_elem_count functions.
> >>>
> >>> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> >>> ---
> >>>  kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
> >>>  1 file changed, 20 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> >>> index 56d3da7d0bc6..faaef4fd3df0 100644
> >>> --- a/kernel/bpf/hashtab.c
> >>> +++ b/kernel/bpf/hashtab.c
> >>> @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >>>  		}
> >>>  	}
> >>>
> >>> +	err = bpf_map_init_elem_count(&htab->map);
> >>> +	if (err)
> >>> +		goto free_extra_elements;
> >> Considering the per-cpu counter is not always needed, is it a good idea
> >> to make the elem_count optional by introducing a new map flag?
> > Per-map-flag or a static key? For me it looked like just doing an unconditional
> > `inc` for a per-cpu variable is better vs. doing a check then `inc` or an
> > unconditional jump.
>
> Sorry, I didn't make it clear that I was worried about the allocated
> per-cpu memory. Previously I thought the per-cpu memory was limited, but
> after doing some experiments I found it is almost the same as kmalloc(),
> which can use all available memory to fulfill the allocation request.
> For a host with 72 CPUs, the memory overhead for 10k hash maps is about
> ~6MB. The overhead is tiny compared with the total available memory, but
> it is avoidable.

So, in my first patch I've only added new counters for preallocated maps. But
then the feedback was that we need generic percpu inc/dec counters, so I added
them by default. For me a percpu s64 looks cheap enough for a hash map...
Hi,

On 7/6/2023 8:25 PM, Anton Protopopov wrote:
> On Thu, Jul 06, 2023 at 10:01:26AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 7/4/2023 10:34 PM, Anton Protopopov wrote:
>>> On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:

SNIP

>>>> by introducing a new map flag?
>>> Per-map-flag or a static key? For me it looked like just doing an unconditional
>>> `inc` for a per-cpu variable is better vs. doing a check then `inc` or an
>>> unconditional jump.
>> Sorry, I didn't make it clear that I was worried about the allocated
>> per-cpu memory. Previously I thought the per-cpu memory was limited, but
>> after doing some experiments I found it is almost the same as kmalloc(),
>> which can use all available memory to fulfill the allocation request.
>> For a host with 72 CPUs, the memory overhead for 10k hash maps is about
>> ~6MB. The overhead is tiny compared with the total available memory, but
>> it is avoidable.
> So, in my first patch I've only added new counters for preallocated maps. But
> then the feedback was that we need generic percpu inc/dec counters, so I
> added them by default. For me a percpu s64 looks cheap enough for a hash map...

Thanks for the explanation. Let's just allocate the per-cpu elem_count in the
hash map. If there are use cases which need to make it optional, we can revise
that later.
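For completeness, the opt-out knob that was discussed and dropped would have looked roughly like the sketch below. BPF_F_NO_ELEM_COUNT is an invented name used purely for illustration; the price it hints at is exactly the check-then-inc on every update path that the thread preferred to avoid:

	/* Hypothetical sketch only: no such flag exists upstream; the thread
	 * settled on allocating the per-cpu counter unconditionally.
	 */
	if (!(attr->map_flags & BPF_F_NO_ELEM_COUNT)) {
		err = bpf_map_init_elem_count(&htab->map);
		if (err)
			goto free_extra_elements;
	}

/* ...and the fast-path helpers would then need a NULL check on every update: */
static inline void bpf_map_inc_elem_count(struct bpf_map *map)
{
	if (map->elem_count)
		this_cpu_inc(*map->elem_count);
}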
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 56d3da7d0bc6..faaef4fd3df0 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		}
 	}
 
+	err = bpf_map_init_elem_count(&htab->map);
+	if (err)
+		goto free_extra_elements;
+
 	return &htab->map;
 
+free_extra_elements:
+	free_percpu(htab->extra_elems);
 free_prealloc:
 	prealloc_destroy(htab);
 free_map_locked:
@@ -804,6 +810,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
 			check_and_free_fields(htab, l);
+			bpf_map_dec_elem_count(&htab->map);
 			break;
 		}
 
@@ -900,6 +907,8 @@ static bool is_map_full(struct bpf_htab *htab)
 
 static void inc_elem_count(struct bpf_htab *htab)
 {
+	bpf_map_inc_elem_count(&htab->map);
+
 	if (htab->use_percpu_counter)
 		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
 	else
@@ -908,6 +917,8 @@ static void inc_elem_count(struct bpf_htab *htab)
 
 static void dec_elem_count(struct bpf_htab *htab)
 {
+	bpf_map_dec_elem_count(&htab->map);
+
 	if (htab->use_percpu_counter)
 		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
 	else
@@ -920,6 +931,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	htab_put_fd_value(htab, l);
 
 	if (htab_is_prealloc(htab)) {
+		bpf_map_dec_elem_count(&htab->map);
 		check_and_free_fields(htab, l);
 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -1000,6 +1012,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			if (!l)
 				return ERR_PTR(-E2BIG);
 			l_new = container_of(l, struct htab_elem, fnode);
+			bpf_map_inc_elem_count(&htab->map);
 		}
 	} else {
 		if (is_map_full(htab))
@@ -1224,7 +1237,8 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 	if (l_old) {
 		bpf_lru_node_set_ref(&l_new->lru_node);
 		hlist_nulls_del_rcu(&l_old->hash_node);
-	}
+	} else
+		bpf_map_inc_elem_count(&htab->map);
 	ret = 0;
 
 err:
@@ -1351,6 +1365,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
 				value, onallcpus);
 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+		bpf_map_inc_elem_count(&htab->map);
 		l_new = NULL;
 	}
 	ret = 0;
@@ -1437,9 +1452,10 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
-	if (l)
+	if (l) {
+		bpf_map_dec_elem_count(&htab->map);
 		hlist_nulls_del_rcu(&l->hash_node);
-	else
+	} else
 		ret = -ENOENT;
 
 	htab_unlock_bucket(htab, b, hash, flags);
@@ -1523,6 +1539,7 @@ static void htab_map_free(struct bpf_map *map)
 		prealloc_destroy(htab);
 	}
 
+	bpf_map_free_elem_count(map);
 	free_percpu(htab->extra_elems);
 	bpf_map_area_free(htab->buckets);
 	bpf_mem_alloc_destroy(&htab->pcpu_ma);
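For context, the bpf_map_{init,inc,dec,free}_elem_count() helpers this patch calls are introduced earlier in the series. A minimal sketch of what they are assumed to look like is below: a single per-cpu s64 hanging off struct bpf_map, updated with this_cpu_inc()/this_cpu_dec() so no atomics or shared cache lines are involved. The exact upstream definitions may differ in detail:

static inline int bpf_map_init_elem_count(struct bpf_map *map)
{
	size_t size = sizeof(*map->elem_count), align = size;
	gfp_t flags = GFP_USER | __GFP_NOWARN;

	/* one s64 per CPU, charged against the map's memcg like other map memory */
	map->elem_count = bpf_map_alloc_percpu(map, size, align, flags);
	if (!map->elem_count)
		return -ENOMEM;

	return 0;
}

static inline void bpf_map_free_elem_count(struct bpf_map *map)
{
	free_percpu(map->elem_count);
}

static inline void bpf_map_inc_elem_count(struct bpf_map *map)
{
	/* each CPU only touches its own slot, so a plain increment suffices */
	this_cpu_inc(*map->elem_count);
}

static inline void bpf_map_dec_elem_count(struct bpf_map *map)
{
	this_cpu_dec(*map->elem_count);
}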
Initialize and utilize the per-cpu insertions/deletions counters for hash-based
maps. Non-trivial changes only apply to the preallocated maps for which the
{inc,dec}_elem_count functions are not called, as there's no need in counting
elements to sustain proper map operations.

To increase/decrease percpu counters for preallocated maps we add raw calls to
the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
dynamically allocated maps we add corresponding calls to the existing
{inc,dec}_elem_count functions.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
---
 kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)