
[v3,bpf-next,3/6] bpf: populate the per-cpu insertions/deletions counters for hashmaps

Message ID 20230630082516.16286-4-aspsk@isovalent.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf: add percpu stats for bpf_map

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 20 this patch: 20
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang fail Errors and warnings before: 18 this patch: 18
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 20 this patch: 20
netdev/checkpatch warning CHECK: Unbalanced braces around else statement; CHECK: braces {} should be used on all arms of this statement
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on s390x with gcc

Commit Message

Anton Protopopov June 30, 2023, 8:25 a.m. UTC
Initialize and utilize the per-cpu insertions/deletions counters for hash-based
maps. Non-trivial changes only apply to the preallocated maps, for which the
{inc,dec}_elem_count functions are not called, as there is no need to count
elements to sustain proper map operations.

To increase/decrease percpu counters for preallocated maps we add raw calls to
the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
dynamically allocated maps we add corresponding calls to the existing
{inc,dec}_elem_count functions.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
---
 kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)
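
For readers following the diff below: the bpf_map_{init,inc,dec,free}_elem_count
helpers come from patch 1/6 of this series. A rough sketch of what they look
like, assuming a per-cpu s64 hung off struct bpf_map (the exact definitions live
in the series and are not reproduced here):

	/* Sketch only; see patch 1/6 for the real definitions. */
	static inline int bpf_map_init_elem_count(struct bpf_map *map)
	{
		size_t size = sizeof(*map->elem_count), align = size;
		gfp_t flags = GFP_KERNEL | __GFP_NOWARN;

		map->elem_count = bpf_map_alloc_percpu(map, size, align, flags);
		if (!map->elem_count)
			return -ENOMEM;
		return 0;
	}

	static inline void bpf_map_inc_elem_count(struct bpf_map *map)
	{
		/* no locking: each CPU only touches its own counter */
		this_cpu_inc(*map->elem_count);
	}

	static inline void bpf_map_dec_elem_count(struct bpf_map *map)
	{
		this_cpu_dec(*map->elem_count);
	}

	static inline void bpf_map_free_elem_count(struct bpf_map *map)
	{
		free_percpu(map->elem_count);
	}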

Comments

Hou Tao July 4, 2023, 1:56 p.m. UTC | #1
Hi,

On 6/30/2023 4:25 PM, Anton Protopopov wrote:
> Initialize and utilize the per-cpu insertions/deletions counters for hash-based
> maps. Non-trivial changes only apply to the preallocated maps for which the
> {inc,dec}_elem_count functions are not called, as there's no need in counting
> elements to sustain proper map operations.
>
> To increase/decrease percpu counters for preallocated maps we add raw calls to
> the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
> dynamically allocated maps we add corresponding calls to the existing
> {inc,dec}_elem_count functions.
>
> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> ---
>  kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
>  1 file changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 56d3da7d0bc6..faaef4fd3df0 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  		}
>  	}
>  
> +	err = bpf_map_init_elem_count(&htab->map);
> +	if (err)
> +		goto free_extra_elements;
Considering the per-cpu counter is not always needed, is it a good idea
to make the elem_count optional by introducing a new map flag?
> +
>  	return &htab->map;
>  
> +free_extra_elements:
> +	free_percpu(htab->extra_elems);
>  free_prealloc:
>  	prealloc_destroy(htab);
Need to check prealloc before calling prealloc_destroy(htab); otherwise,
for a non-preallocated percpu htab, prealloc_destroy() will trigger an
invalid memory dereference.
>  free_map_locked:
> @@ -804,6 +810,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
>  		if (l == tgt_l) {
>  			hlist_nulls_del_rcu(&l->hash_node);
>  			check_and_free_fields(htab, l);
> +			bpf_map_dec_elem_count(&htab->map);
>  			break;
>  		}
>  
> @@ -900,6 +907,8 @@ static bool is_map_full(struct bpf_htab *htab)
>  
>  static void inc_elem_count(struct bpf_htab *htab)
>  {
> +	bpf_map_inc_elem_count(&htab->map);
> +
>  	if (htab->use_percpu_counter)
>  		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
>  	else
> @@ -908,6 +917,8 @@ static void inc_elem_count(struct bpf_htab *htab)
>  
>  static void dec_elem_count(struct bpf_htab *htab)
>  {
> +	bpf_map_dec_elem_count(&htab->map);
> +
>  	if (htab->use_percpu_counter)
>  		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
>  	else
> @@ -920,6 +931,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  	htab_put_fd_value(htab, l);
>  
>  	if (htab_is_prealloc(htab)) {
> +		bpf_map_dec_elem_count(&htab->map);
>  		check_and_free_fields(htab, l);
>  		__pcpu_freelist_push(&htab->freelist, &l->fnode);
>  	} else {
> @@ -1000,6 +1012,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
>  			if (!l)
>  				return ERR_PTR(-E2BIG);
>  			l_new = container_of(l, struct htab_elem, fnode);
> +			bpf_map_inc_elem_count(&htab->map);
>  		}
>  	} else {
>  		if (is_map_full(htab))
> @@ -1224,7 +1237,8 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
>  	if (l_old) {
>  		bpf_lru_node_set_ref(&l_new->lru_node);
>  		hlist_nulls_del_rcu(&l_old->hash_node);
> -	}
> +	} else
> +		bpf_map_inc_elem_count(&htab->map);
>  	ret = 0;
>  
>  err:
> @@ -1351,6 +1365,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
>  		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
>  				value, onallcpus);
>  		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> +		bpf_map_inc_elem_count(&htab->map);
>  		l_new = NULL;
>  	}
>  	ret = 0;
> @@ -1437,9 +1452,10 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
>  
>  	l = lookup_elem_raw(head, hash, key, key_size);
>  
> -	if (l)
> +	if (l) {
> +		bpf_map_dec_elem_count(&htab->map);
>  		hlist_nulls_del_rcu(&l->hash_node);
> -	else
> +	} else
>  		ret = -ENOENT;
Also need to decrease elem_count for
__htab_map_lookup_and_delete_batch() and
__htab_map_lookup_and_delete_elem() when is_lru_map is true. Maybe for
LRU map, we could just do bpf_map_dec_elem_count() in
htab_lru_push_free() and do bpf_map_inc_elem_count() in prealloc_lru_pop().
>  
>  	htab_unlock_bucket(htab, b, hash, flags);
> @@ -1523,6 +1539,7 @@ static void htab_map_free(struct bpf_map *map)
>  		prealloc_destroy(htab);
>  	}
>  
> +	bpf_map_free_elem_count(map);
>  	free_percpu(htab->extra_elems);
>  	bpf_map_area_free(htab->buckets);
>  	bpf_mem_alloc_destroy(&htab->pcpu_ma);
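
A sketch of the error-path ordering implied by the review comment above, with
the prealloc check guarding the teardown (illustrative only; prealloc is the
local flag already computed in htab_map_alloc(), and the actual fix in a later
revision of the series may look different):

	err = bpf_map_init_elem_count(&htab->map);
	if (err)
		goto free_extra_elements;

	return &htab->map;

free_extra_elements:
	free_percpu(htab->extra_elems);
free_prealloc:
	if (prealloc)
		prealloc_destroy(htab);
free_map_locked:
	...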
Anton Protopopov July 4, 2023, 2:34 p.m. UTC | #2
On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:
> Hi,
> 
> On 6/30/2023 4:25 PM, Anton Protopopov wrote:
> > Initialize and utilize the per-cpu insertions/deletions counters for hash-based
> > maps. Non-trivial changes only apply to the preallocated maps for which the
> > {inc,dec}_elem_count functions are not called, as there's no need in counting
> > elements to sustain proper map operations.
> >
> > To increase/decrease percpu counters for preallocated maps we add raw calls to
> > the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
> > dynamically allocated maps we add corresponding calls to the existing
> > {inc,dec}_elem_count functions.
> >
> > Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> > ---
> >  kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
> >  1 file changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index 56d3da7d0bc6..faaef4fd3df0 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >  		}
> >  	}
> >  
> > +	err = bpf_map_init_elem_count(&htab->map);
> > +	if (err)
> > +		goto free_extra_elements;
> Considering the per-cpu counter is not always needed, is it a good idea
> to make the elem_count being optional by introducing a new map flag ?

Per-map-flag or a static key? For me it looked like an unconditional `inc` of
a per-cpu variable is better than doing a check and then `inc`, or an
unconditional jump.
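
For context, the two options being weighed look roughly like this
(BPF_F_COUNT_ELEMS is a hypothetical flag name, not something proposed in the
series):

	/* unconditional: a single per-cpu increment, no branch */
	this_cpu_inc(*map->elem_count);

	/* flag-guarded: a load, a test and a conditional branch on every update */
	if (map->map_flags & BPF_F_COUNT_ELEMS)
		this_cpu_inc(*map->elem_count);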

> > +
> >  	return &htab->map;
> >  
> > +free_extra_elements:
> > +	free_percpu(htab->extra_elems);
> >  free_prealloc:
> >  	prealloc_destroy(htab);
> Need to check prealloc before calling prealloc_destroy(htab), otherwise
> for non-preallocated percpu htab prealloc_destroy() will trigger invalid
> memory dereference.

Thanks!

> >  free_map_locked:
> > @@ -804,6 +810,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> >  		if (l == tgt_l) {
> >  			hlist_nulls_del_rcu(&l->hash_node);
> >  			check_and_free_fields(htab, l);
> > +			bpf_map_dec_elem_count(&htab->map);
> >  			break;
> >  		}
> >  
> > @@ -900,6 +907,8 @@ static bool is_map_full(struct bpf_htab *htab)
> >  
> >  static void inc_elem_count(struct bpf_htab *htab)
> >  {
> > +	bpf_map_inc_elem_count(&htab->map);
> > +
> >  	if (htab->use_percpu_counter)
> >  		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
> >  	else
> > @@ -908,6 +917,8 @@ static void inc_elem_count(struct bpf_htab *htab)
> >  
> >  static void dec_elem_count(struct bpf_htab *htab)
> >  {
> > +	bpf_map_dec_elem_count(&htab->map);
> > +
> >  	if (htab->use_percpu_counter)
> >  		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
> >  	else
> > @@ -920,6 +931,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
> >  	htab_put_fd_value(htab, l);
> >  
> >  	if (htab_is_prealloc(htab)) {
> > +		bpf_map_dec_elem_count(&htab->map);
> >  		check_and_free_fields(htab, l);
> >  		__pcpu_freelist_push(&htab->freelist, &l->fnode);
> >  	} else {
> > @@ -1000,6 +1012,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
> >  			if (!l)
> >  				return ERR_PTR(-E2BIG);
> >  			l_new = container_of(l, struct htab_elem, fnode);
> > +			bpf_map_inc_elem_count(&htab->map);
> >  		}
> >  	} else {
> >  		if (is_map_full(htab))
> > @@ -1224,7 +1237,8 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
> >  	if (l_old) {
> >  		bpf_lru_node_set_ref(&l_new->lru_node);
> >  		hlist_nulls_del_rcu(&l_old->hash_node);
> > -	}
> > +	} else
> > +		bpf_map_inc_elem_count(&htab->map);
> >  	ret = 0;
> >  
> >  err:
> > @@ -1351,6 +1365,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
> >  		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
> >  				value, onallcpus);
> >  		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> > +		bpf_map_inc_elem_count(&htab->map);
> >  		l_new = NULL;
> >  	}
> >  	ret = 0;
> > @@ -1437,9 +1452,10 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
> >  
> >  	l = lookup_elem_raw(head, hash, key, key_size);
> >  
> > -	if (l)
> > +	if (l) {
> > +		bpf_map_dec_elem_count(&htab->map);
> >  		hlist_nulls_del_rcu(&l->hash_node);
> > -	else
> > +	} else
> >  		ret = -ENOENT;
> Also need to decrease elem_count for
> __htab_map_lookup_and_delete_batch() and
> __htab_map_lookup_and_delete_elem() when is_lru_map is true. Maybe for
> LRU map, we could just do bpf_map_dec_elem_count() in
> htab_lru_push_free() and do bpf_map_inc_elem_count() in prealloc_lru_pop().

Thanks. I will fix the logic and extend the selftest to test the batch ops as well.

> >  
> >  	htab_unlock_bucket(htab, b, hash, flags);
> > @@ -1523,6 +1539,7 @@ static void htab_map_free(struct bpf_map *map)
> >  		prealloc_destroy(htab);
> >  	}
> >  
> > +	bpf_map_free_elem_count(map);
> >  	free_percpu(htab->extra_elems);
> >  	bpf_map_area_free(htab->buckets);
> >  	bpf_mem_alloc_destroy(&htab->pcpu_ma);
>
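
A sketch of where the LRU calls could land if Hou Tao's suggestion above is
followed (illustrative only; existing function bodies abbreviated, and the
actual placement in a later revision may differ):

	static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
						  u32 hash)
	{
		struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
		struct htab_elem *l;

		if (node) {
			/* element leaves the LRU free list: count it once here */
			bpf_map_inc_elem_count(&htab->map);
			l = container_of(node, struct htab_elem, lru_node);
			memcpy(l->key, key, htab->map.key_size);
			return l;
		}

		return NULL;
	}

	static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
	{
		check_and_free_fields(htab, elem);
		/* element goes back to the LRU free list: uncount it once here */
		bpf_map_dec_elem_count(&htab->map);
		bpf_lru_push_free(&htab->lru, &elem->lru_node);
	}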
Hou Tao July 6, 2023, 2:01 a.m. UTC | #3
Hi,

On 7/4/2023 10:34 PM, Anton Protopopov wrote:
> On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 6/30/2023 4:25 PM, Anton Protopopov wrote:
>>> Initialize and utilize the per-cpu insertions/deletions counters for hash-based
>>> maps. Non-trivial changes only apply to the preallocated maps for which the
>>> {inc,dec}_elem_count functions are not called, as there's no need in counting
>>> elements to sustain proper map operations.
>>>
>>> To increase/decrease percpu counters for preallocated maps we add raw calls to
>>> the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
>>> dynamically allocated maps we add corresponding calls to the existing
>>> {inc,dec}_elem_count functions.
>>>
>>> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
>>> ---
>>>  kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
>>>  1 file changed, 20 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>> index 56d3da7d0bc6..faaef4fd3df0 100644
>>> --- a/kernel/bpf/hashtab.c
>>> +++ b/kernel/bpf/hashtab.c
>>> @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>>>  		}
>>>  	}
>>>  
>>> +	err = bpf_map_init_elem_count(&htab->map);
>>> +	if (err)
>>> +		goto free_extra_elements;
>> Considering the per-cpu counter is not always needed, is it a good idea
>> to make the elem_count being optional by introducing a new map flag ?
> Per-map-flag or a static key? For me it looked like just doing an unconditional
> `inc` for a per-cpu variable is better vs. doing a check then `inc` or an
> unconditional jump.

Sorry, I didn't make it clear that I was worried about the allocated
per-cpu memory. Previously I thought the per-cpu memory was limited, but
after doing some experiments I found it is almost the same as kmalloc(),
which can use all available memory to fulfill the allocation request.
For a host with 72 CPUs, the memory overhead for 10k hash maps is about
~6MB. The overhead is tiny compared with the total available memory, but
it is avoidable.
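
For reference, the ~6MB figure is consistent with one s64 per CPU per map
(ignoring percpu-allocator padding): 10,000 maps * 72 CPUs * 8 bytes =
5,760,000 bytes, i.e. roughly 5.8 MB.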
Anton Protopopov July 6, 2023, 12:25 p.m. UTC | #4
On Thu, Jul 06, 2023 at 10:01:26AM +0800, Hou Tao wrote:
> Hi,
> 
> On 7/4/2023 10:34 PM, Anton Protopopov wrote:
> > On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 6/30/2023 4:25 PM, Anton Protopopov wrote:
> >>> Initialize and utilize the per-cpu insertions/deletions counters for hash-based
> >>> maps. Non-trivial changes only apply to the preallocated maps for which the
> >>> {inc,dec}_elem_count functions are not called, as there's no need in counting
> >>> elements to sustain proper map operations.
> >>>
> >>> To increase/decrease percpu counters for preallocated maps we add raw calls to
> >>> the bpf_map_{inc,dec}_elem_count functions so that the impact is minimal. For
> >>> dynamically allocated maps we add corresponding calls to the existing
> >>> {inc,dec}_elem_count functions.
> >>>
> >>> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> >>> ---
> >>>  kernel/bpf/hashtab.c | 23 ++++++++++++++++++++---
> >>>  1 file changed, 20 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> >>> index 56d3da7d0bc6..faaef4fd3df0 100644
> >>> --- a/kernel/bpf/hashtab.c
> >>> +++ b/kernel/bpf/hashtab.c
> >>> @@ -581,8 +581,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >>>  		}
> >>>  	}
> >>>  
> >>> +	err = bpf_map_init_elem_count(&htab->map);
> >>> +	if (err)
> >>> +		goto free_extra_elements;
> >> Considering the per-cpu counter is not always needed, is it a good idea
> >> to make the elem_count being optional by introducing a new map flag ?
> > Per-map-flag or a static key? For me it looked like just doing an unconditional
> > `inc` for a per-cpu variable is better vs. doing a check then `inc` or an
> > unconditional jump.
> 
> Sorry I didn't make it clear that I was worried about the allocated
> per-cpu memory. Previous I thought the per-cpu memory is limited, but
> after did some experiments I found it was almost the same as kmalloc()
> which could use all available memory to fulfill the allocation request.
> For a host with 72-cpus, the memory overhead for 10k hash map is about
> ~6MB. The overhead is tiny compared with the total available memory, but
> it is avoidable.

So, in my first patch I only added the new counters for preallocated maps. But
then the feedback was that we need generic percpu inc/dec counters, so I
added them by default. For me a percpu s64 looks cheap enough for a hash map...
Hou Tao July 6, 2023, 12:30 p.m. UTC | #5
Hi,

On 7/6/2023 8:25 PM, Anton Protopopov wrote:
> On Thu, Jul 06, 2023 at 10:01:26AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 7/4/2023 10:34 PM, Anton Protopopov wrote:
>>> On Tue, Jul 04, 2023 at 09:56:36PM +0800, Hou Tao wrote:
SNIP
>>>> by introducing a new map flag ?
>>> Per-map-flag or a static key? For me it looked like just doing an unconditional
>>> `inc` for a per-cpu variable is better vs. doing a check then `inc` or an
>>> unconditional jump.
>> Sorry I didn't make it clear that I was worried about the allocated
>> per-cpu memory. Previous I thought the per-cpu memory is limited, but
>> after did some experiments I found it was almost the same as kmalloc()
>> which could use all available memory to fulfill the allocation request.
>> For a host with 72-cpus, the memory overhead for 10k hash map is about
>> ~6MB. The overhead is tiny compared with the total available memory, but
>> it is avoidable.
> So, in my first patch I've only added new counters for preallocated maps. But
> then the feedback was that we need a generic percpu inc/dec counters, so I
> added them by default. For me a percpu s64 looks cheap enough for a hash map...

Thanks for the explanation. Let's just allocate the per-cpu elem_count
in the hash map. If there are use cases which need it to be optional, we
can revise that later.
> .

Patch

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 56d3da7d0bc6..faaef4fd3df0 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -581,8 +581,14 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		}
 	}
 
+	err = bpf_map_init_elem_count(&htab->map);
+	if (err)
+		goto free_extra_elements;
+
 	return &htab->map;
 
+free_extra_elements:
+	free_percpu(htab->extra_elems);
 free_prealloc:
 	prealloc_destroy(htab);
 free_map_locked:
@@ -804,6 +810,7 @@  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
 			check_and_free_fields(htab, l);
+			bpf_map_dec_elem_count(&htab->map);
 			break;
 		}
 
@@ -900,6 +907,8 @@  static bool is_map_full(struct bpf_htab *htab)
 
 static void inc_elem_count(struct bpf_htab *htab)
 {
+	bpf_map_inc_elem_count(&htab->map);
+
 	if (htab->use_percpu_counter)
 		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
 	else
@@ -908,6 +917,8 @@  static void inc_elem_count(struct bpf_htab *htab)
 
 static void dec_elem_count(struct bpf_htab *htab)
 {
+	bpf_map_dec_elem_count(&htab->map);
+
 	if (htab->use_percpu_counter)
 		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
 	else
@@ -920,6 +931,7 @@  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	htab_put_fd_value(htab, l);
 
 	if (htab_is_prealloc(htab)) {
+		bpf_map_dec_elem_count(&htab->map);
 		check_and_free_fields(htab, l);
 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -1000,6 +1012,7 @@  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			if (!l)
 				return ERR_PTR(-E2BIG);
 			l_new = container_of(l, struct htab_elem, fnode);
+			bpf_map_inc_elem_count(&htab->map);
 		}
 	} else {
 		if (is_map_full(htab))
@@ -1224,7 +1237,8 @@  static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 	if (l_old) {
 		bpf_lru_node_set_ref(&l_new->lru_node);
 		hlist_nulls_del_rcu(&l_old->hash_node);
-	}
+	} else
+		bpf_map_inc_elem_count(&htab->map);
 	ret = 0;
 
 err:
@@ -1351,6 +1365,7 @@  static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
 				value, onallcpus);
 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+		bpf_map_inc_elem_count(&htab->map);
 		l_new = NULL;
 	}
 	ret = 0;
@@ -1437,9 +1452,10 @@  static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
-	if (l)
+	if (l) {
+		bpf_map_dec_elem_count(&htab->map);
 		hlist_nulls_del_rcu(&l->hash_node);
-	else
+	} else
 		ret = -ENOENT;
 
 	htab_unlock_bucket(htab, b, hash, flags);
@@ -1523,6 +1539,7 @@  static void htab_map_free(struct bpf_map *map)
 		prealloc_destroy(htab);
 	}
 
+	bpf_map_free_elem_count(map);
 	free_percpu(htab->extra_elems);
 	bpf_map_area_free(htab->buckets);
 	bpf_mem_alloc_destroy(&htab->pcpu_ma);
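
Not part of this patch, but for completeness: once the per-cpu counters are
populated, a consumer sums them over all possible CPUs. A rough sketch,
assuming the helper name used elsewhere in the series:

	static s64 bpf_map_sum_elem_count(const struct bpf_map *map)
	{
		s64 count = 0;
		int cpu;

		if (!map->elem_count)
			return 0;

		for_each_possible_cpu(cpu)
			count += READ_ONCE(*per_cpu_ptr(map->elem_count, cpu));

		/* a deletion may be observed before the matching insertion on
		 * another CPU, so the sum is approximate and can transiently
		 * be off (even negative)
		 */
		return count;
	}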