
[RFC,v2,bpf-next,2/4] bpf: populate the per-cpu insertions/deletions counters for hashmaps

Message ID 20230622095330.1023453-3-aspsk@isovalent.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf: add percpu stats for bpf_map

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-next-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-8 success Logs for veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 20 this patch: 20
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 20 this patch: 20
netdev/checkpatch warning CHECK: Unbalanced braces around else statement CHECK: braces {} should be used on all arms of this statement WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Anton Protopopov June 22, 2023, 9:53 a.m. UTC
Initialize and utilize the per-cpu insertions/deletions counters for hash-based
maps. The non-trivial changes only apply to the preallocated maps for which the
inc_elem_count/dec_elem_count functions were not called before.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
---
 kernel/bpf/hashtab.c | 102 +++++++++++++++++++++++++------------------
 1 file changed, 60 insertions(+), 42 deletions(-)
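
To make the batch-count comment that this patch moves easier to follow, here is a small editor's sketch (plain C, not part of the patch) of the threshold check together with the worked numbers from that comment: with 10k max entries and 64 online CPUs, the per-cpu counter path is chosen because 10000 / 2 = 5000 exceeds 64 * 32 = 2048.

#include <stdbool.h>

#define PERCPU_COUNTER_BATCH 32

/* Mirrors the check in htab_map_alloc():
 *   attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH
 * With max_entries = 10000 and 64 CPUs: 5000 > 2048, so the percpu
 * counter is used; __percpu_counter_compare() stays cheap until the
 * remaining headroom (max_entries - nr_elems) drops to around
 * batch * cpus = 2048, i.e. until the map is roughly 80% full.
 */
static bool use_percpu_counter(unsigned int max_entries, unsigned int nr_cpus)
{
	return max_entries / 2 > nr_cpus * PERCPU_COUNTER_BATCH;
}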

Comments

Alexei Starovoitov June 22, 2023, 8:18 p.m. UTC | #1
On Thu, Jun 22, 2023 at 09:53:28AM +0000, Anton Protopopov wrote:
> Initialize and utilize the per-cpu insertions/deletions counters for hash-based
> maps. The non-trivial changes only apply to the preallocated maps for which the
> inc_elem_count/dec_elem_count functions were not called before.
> 
> Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
> ---
>  kernel/bpf/hashtab.c | 102 +++++++++++++++++++++++++------------------
>  1 file changed, 60 insertions(+), 42 deletions(-)
> 
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 9901efee4339..0b4a2d61afa9 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -217,6 +217,49 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
>  	return !htab_is_percpu(htab) && !htab_is_lru(htab);
>  }
>  
> +/* compute_batch_value() computes batch value as num_online_cpus() * 2
> + * and __percpu_counter_compare() needs
> + * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
> + * for percpu_counter to be faster than atomic_t. In practice the average bpf
> + * hash map size is 10k, which means that a system with 64 cpus will fill
> + * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
> + * define our own batch count as 32 then 10k hash map can be filled up to 80%:
> + * 10k - 8k > 32 _batch_ * 64 _cpus_
> + * and __percpu_counter_compare() will still be fast. At that point hash map
> + * collisions will dominate its performance anyway. Assume that hash map filled
> + * to 50+% isn't going to be O(1) and use the following formula to choose
> + * between percpu_counter and atomic_t.
> + */

please split moving of helpers into separate patch.

> +#define PERCPU_COUNTER_BATCH 32
> +
> +static bool is_map_full(struct bpf_htab *htab)
> +{
> +	if (htab->use_percpu_counter)
> +		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
> +						PERCPU_COUNTER_BATCH) >= 0;
> +	return atomic_read(&htab->count) >= htab->map.max_entries;
> +}
> +
> +static void inc_elem_count(struct bpf_htab *htab)
> +{
> +	bpf_map_inc_elements_counter(&htab->map);

and add this in the next patch.

> +
> +	if (htab->use_percpu_counter)
> +		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
> +	else
> +		atomic_inc(&htab->count);
> +}
> +
> +static void dec_elem_count(struct bpf_htab *htab)
> +{
> +	bpf_map_dec_elements_counter(&htab->map);
> +
> +	if (htab->use_percpu_counter)
> +		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
> +	else
> +		atomic_dec(&htab->count);
> +}
> +
>  static void htab_free_prealloced_timers(struct bpf_htab *htab)
>  {
>  	u32 num_entries = htab->map.max_entries;
> @@ -539,20 +582,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  
>  	htab_init_buckets(htab);
>  
> -/* compute_batch_value() computes batch value as num_online_cpus() * 2
> - * and __percpu_counter_compare() needs
> - * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
> - * for percpu_counter to be faster than atomic_t. In practice the average bpf
> - * hash map size is 10k, which means that a system with 64 cpus will fill
> - * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
> - * define our own batch count as 32 then 10k hash map can be filled up to 80%:
> - * 10k - 8k > 32 _batch_ * 64 _cpus_
> - * and __percpu_counter_compare() will still be fast. At that point hash map
> - * collisions will dominate its performance anyway. Assume that hash map filled
> - * to 50+% isn't going to be O(1) and use the following formula to choose
> - * between percpu_counter and atomic_t.
> - */
> -#define PERCPU_COUNTER_BATCH 32
>  	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
>  		htab->use_percpu_counter = true;
>  
> @@ -587,8 +616,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  		}
>  	}
>  
> +	err = bpf_map_init_elements_counter(&htab->map);
> +	if (err)
> +		goto free_extra_elements;
> +
>  	return &htab->map;
>  
> +free_extra_elements:
> +	free_percpu(htab->extra_elems);
>  free_prealloc:
>  	prealloc_destroy(htab);
>  free_map_locked:
> @@ -810,6 +845,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
>  		if (l == tgt_l) {
>  			hlist_nulls_del_rcu(&l->hash_node);
>  			check_and_free_fields(htab, l);
> +			dec_elem_count(htab);
>  			break;
>  		}
>  
> @@ -896,40 +932,16 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
>  	}
>  }
>  
> -static bool is_map_full(struct bpf_htab *htab)
> -{
> -	if (htab->use_percpu_counter)
> -		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
> -						PERCPU_COUNTER_BATCH) >= 0;
> -	return atomic_read(&htab->count) >= htab->map.max_entries;
> -}
> -
> -static void inc_elem_count(struct bpf_htab *htab)
> -{
> -	if (htab->use_percpu_counter)
> -		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
> -	else
> -		atomic_inc(&htab->count);
> -}
> -
> -static void dec_elem_count(struct bpf_htab *htab)
> -{
> -	if (htab->use_percpu_counter)
> -		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
> -	else
> -		atomic_dec(&htab->count);
> -}
> -
> -
>  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  {
>  	htab_put_fd_value(htab, l);
>  
> +	dec_elem_count(htab);
> +
>  	if (htab_is_prealloc(htab)) {
>  		check_and_free_fields(htab, l);
>  		__pcpu_freelist_push(&htab->freelist, &l->fnode);
>  	} else {
> -		dec_elem_count(htab);
>  		htab_elem_free(htab, l);
>  	}
>  }
> @@ -1006,6 +1018,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
>  			if (!l)
>  				return ERR_PTR(-E2BIG);
>  			l_new = container_of(l, struct htab_elem, fnode);
> +			inc_elem_count(htab);
>  		}
>  	} else {
>  		if (is_map_full(htab))
> @@ -1227,9 +1240,11 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
>  	 * concurrent search will find it before old elem
>  	 */
>  	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> +	inc_elem_count(htab);
>  	if (l_old) {
>  		bpf_lru_node_set_ref(&l_new->lru_node);
>  		hlist_nulls_del_rcu(&l_old->hash_node);
> +		dec_elem_count(htab);

instead of inc and instant dec.
Please do inc once.

>  	}
>  	ret = 0;
>  
> @@ -1357,6 +1372,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
>  		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
>  				value, onallcpus);
>  		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> +		inc_elem_count(htab);
>  		l_new = NULL;
>  	}
>  	ret = 0;
> @@ -1443,9 +1459,10 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
>  
>  	l = lookup_elem_raw(head, hash, key, key_size);
>  
> -	if (l)
> +	if (l) {
> +		dec_elem_count(htab);
>  		hlist_nulls_del_rcu(&l->hash_node);
> -	else
> +	} else
>  		ret = -ENOENT;
>  
>  	htab_unlock_bucket(htab, b, hash, flags);
> @@ -1529,6 +1546,7 @@ static void htab_map_free(struct bpf_map *map)
>  		prealloc_destroy(htab);
>  	}
>  
> +	bpf_map_free_elements_counter(map);
>  	free_percpu(htab->extra_elems);
>  	bpf_map_area_free(htab->buckets);
>  	bpf_mem_alloc_destroy(&htab->pcpu_ma);
> -- 
> 2.34.1
>
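
For reference, the "inc once" request above is about htab_lru_map_update_elem(): when an existing element is replaced, the element count does not change, so the counter should only be bumped for a genuinely new insertion rather than incremented and immediately decremented. A minimal sketch of that shape (editor's illustration of the suggestion, not the actual follow-up patch):

	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		/* Replacing an existing element: count is unchanged. */
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	} else {
		/* Only a brand new element grows the map. */
		inc_elem_count(htab);
	}
	ret = 0;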

Patch

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9901efee4339..0b4a2d61afa9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -217,6 +217,49 @@  static bool htab_has_extra_elems(struct bpf_htab *htab)
 	return !htab_is_percpu(htab) && !htab_is_lru(htab);
 }
 
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+
+static bool is_map_full(struct bpf_htab *htab)
+{
+	if (htab->use_percpu_counter)
+		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+						PERCPU_COUNTER_BATCH) >= 0;
+	return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+	bpf_map_inc_elements_counter(&htab->map);
+
+	if (htab->use_percpu_counter)
+		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+	else
+		atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+	bpf_map_dec_elements_counter(&htab->map);
+
+	if (htab->use_percpu_counter)
+		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+	else
+		atomic_dec(&htab->count);
+}
+
 static void htab_free_prealloced_timers(struct bpf_htab *htab)
 {
 	u32 num_entries = htab->map.max_entries;
@@ -539,20 +582,6 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	htab_init_buckets(htab);
 
-/* compute_batch_value() computes batch value as num_online_cpus() * 2
- * and __percpu_counter_compare() needs
- * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
- * for percpu_counter to be faster than atomic_t. In practice the average bpf
- * hash map size is 10k, which means that a system with 64 cpus will fill
- * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
- * define our own batch count as 32 then 10k hash map can be filled up to 80%:
- * 10k - 8k > 32 _batch_ * 64 _cpus_
- * and __percpu_counter_compare() will still be fast. At that point hash map
- * collisions will dominate its performance anyway. Assume that hash map filled
- * to 50+% isn't going to be O(1) and use the following formula to choose
- * between percpu_counter and atomic_t.
- */
-#define PERCPU_COUNTER_BATCH 32
 	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
 		htab->use_percpu_counter = true;
 
@@ -587,8 +616,14 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		}
 	}
 
+	err = bpf_map_init_elements_counter(&htab->map);
+	if (err)
+		goto free_extra_elements;
+
 	return &htab->map;
 
+free_extra_elements:
+	free_percpu(htab->extra_elems);
 free_prealloc:
 	prealloc_destroy(htab);
 free_map_locked:
@@ -810,6 +845,7 @@  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
 			check_and_free_fields(htab, l);
+			dec_elem_count(htab);
 			break;
 		}
 
@@ -896,40 +932,16 @@  static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
 	}
 }
 
-static bool is_map_full(struct bpf_htab *htab)
-{
-	if (htab->use_percpu_counter)
-		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
-						PERCPU_COUNTER_BATCH) >= 0;
-	return atomic_read(&htab->count) >= htab->map.max_entries;
-}
-
-static void inc_elem_count(struct bpf_htab *htab)
-{
-	if (htab->use_percpu_counter)
-		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
-	else
-		atomic_inc(&htab->count);
-}
-
-static void dec_elem_count(struct bpf_htab *htab)
-{
-	if (htab->use_percpu_counter)
-		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
-	else
-		atomic_dec(&htab->count);
-}
-
-
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
 	htab_put_fd_value(htab, l);
 
+	dec_elem_count(htab);
+
 	if (htab_is_prealloc(htab)) {
 		check_and_free_fields(htab, l);
 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
-		dec_elem_count(htab);
 		htab_elem_free(htab, l);
 	}
 }
@@ -1006,6 +1018,7 @@  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			if (!l)
 				return ERR_PTR(-E2BIG);
 			l_new = container_of(l, struct htab_elem, fnode);
+			inc_elem_count(htab);
 		}
 	} else {
 		if (is_map_full(htab))
@@ -1227,9 +1240,11 @@  static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 	 * concurrent search will find it before old elem
 	 */
 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+	inc_elem_count(htab);
 	if (l_old) {
 		bpf_lru_node_set_ref(&l_new->lru_node);
 		hlist_nulls_del_rcu(&l_old->hash_node);
+		dec_elem_count(htab);
 	}
 	ret = 0;
 
@@ -1357,6 +1372,7 @@  static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
 				value, onallcpus);
 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+		inc_elem_count(htab);
 		l_new = NULL;
 	}
 	ret = 0;
@@ -1443,9 +1459,10 @@  static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
-	if (l)
+	if (l) {
+		dec_elem_count(htab);
 		hlist_nulls_del_rcu(&l->hash_node);
-	else
+	} else
 		ret = -ENOENT;
 
 	htab_unlock_bucket(htab, b, hash, flags);
@@ -1529,6 +1546,7 @@  static void htab_map_free(struct bpf_map *map)
 		prealloc_destroy(htab);
 	}
 
+	bpf_map_free_elements_counter(map);
 	free_percpu(htab->extra_elems);
 	bpf_map_area_free(htab->buckets);
 	bpf_mem_alloc_destroy(&htab->pcpu_ma);