diff mbox series

[2/2] objpool: cache nr_possible_cpus() and avoid caching nr_cpu_ids

Message ID 20240424215214.3956041-3-andrii@kernel.org (mailing list archive)
State Handled Elsewhere
Headers show
Series Objpool performance improvements | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Andrii Nakryiko April 24, 2024, 9:52 p.m. UTC
Profiling shows that calling nr_possible_cpus() in objpool_pop() takes
a noticeable amount of CPU (when profiled on 80-core machine), as we
need to recalculate number of set bits in a CPU bit mask. This number
can't change, so there is no point in paying the price for recalculating
it. As such, cache this value in struct objpool_head and use it in
objpool_pop().

On the other hand, cached pool->nr_cpus isn't necessary, as it's not
used in hot path and is also a pretty trivial value to retrieve. So drop
pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the size
of struct objpool_head remains the same, which is a nice bonus.

Same BPF selftests benchmarks were used to evaluate the effect. Using
changes in previous patch (inlining of objpool_pop/objpool_push) as
baseline, here are the differences:

BASELINE
========
kretprobe      :    9.937 ± 0.174M/s
kretprobe-multi:   10.440 ± 0.108M/s

AFTER
=====
kretprobe      :   10.106 ± 0.120M/s (+1.7%)
kretprobe-multi:   10.515 ± 0.180M/s (+0.7%)

Cc: Matt (Qiang) Wu <wuqiang.matt@bytedance.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/objpool.h |  6 +++---
 lib/objpool.c           | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

Comments

Jiri Olsa May 28, 2024, 8:30 p.m. UTC | #1
On Wed, Apr 24, 2024 at 02:52:14PM -0700, Andrii Nakryiko wrote:
> Profiling shows that calling nr_possible_cpus() in objpool_pop() takes
> a noticeable amount of CPU (when profiled on 80-core machine), as we
> need to recalculate number of set bits in a CPU bit mask. This number
> can't change, so there is no point in paying the price for recalculating
> it. As such, cache this value in struct objpool_head and use it in
> objpool_pop().
> 
> On the other hand, cached pool->nr_cpus isn't necessary, as it's not
> used in hot path and is also a pretty trivial value to retrieve. So drop
> pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the size
> of struct objpool_head remains the same, which is a nice bonus.
> 
> Same BPF selftests benchmarks were used to evaluate the effect. Using
> changes in previous patch (inlining of objpool_pop/objpool_push) as
> baseline, here are the differences:
> 
> BASELINE
> ========
> kretprobe      :    9.937 ± 0.174M/s
> kretprobe-multi:   10.440 ± 0.108M/s
> 
> AFTER
> =====
> kretprobe      :   10.106 ± 0.120M/s (+1.7%)
> kretprobe-multi:   10.515 ± 0.180M/s (+0.7%)

nice, overall lgtm

jirka

> 
> Cc: Matt (Qiang) Wu <wuqiang.matt@bytedance.com>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  include/linux/objpool.h |  6 +++---
>  lib/objpool.c           | 12 ++++++------
>  2 files changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/objpool.h b/include/linux/objpool.h
> index d8b1f7b91128..cb1758eaa2d3 100644
> --- a/include/linux/objpool.h
> +++ b/include/linux/objpool.h
> @@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
>   * struct objpool_head - object pooling metadata
>   * @obj_size:   object size, aligned to sizeof(void *)
>   * @nr_objs:    total objs (to be pre-allocated with objpool)
> - * @nr_cpus:    local copy of nr_cpu_ids
> + * @nr_possible_cpus: cached value of num_possible_cpus()
>   * @capacity:   max objs can be managed by one objpool_slot
>   * @gfp:        gfp flags for kmalloc & vmalloc
>   * @ref:        refcount of objpool
> @@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
>  struct objpool_head {
>  	int                     obj_size;
>  	int                     nr_objs;
> -	int                     nr_cpus;
> +	int                     nr_possible_cpus;
>  	int                     capacity;
>  	gfp_t                   gfp;
>  	refcount_t              ref;
> @@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool)
>  	raw_local_irq_save(flags);
>  
>  	cpu = raw_smp_processor_id();
> -	for (i = 0; i < num_possible_cpus(); i++) {
> +	for (i = 0; i < pool->nr_possible_cpus; i++) {
>  		obj = __objpool_try_get_slot(pool, cpu);
>  		if (obj)
>  			break;
> diff --git a/lib/objpool.c b/lib/objpool.c
> index f696308fc026..234f9d0bd081 100644
> --- a/lib/objpool.c
> +++ b/lib/objpool.c
> @@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
>  {
>  	int i, cpu_count = 0;
>  
> -	for (i = 0; i < pool->nr_cpus; i++) {
> +	for (i = 0; i < nr_cpu_ids; i++) {
>  
>  		struct objpool_slot *slot;
>  		int nodes, size, rc;
> @@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
>  			continue;
>  
>  		/* compute how many objects to be allocated with this slot */
> -		nodes = nr_objs / num_possible_cpus();
> -		if (cpu_count < (nr_objs % num_possible_cpus()))
> +		nodes = nr_objs / pool->nr_possible_cpus;
> +		if (cpu_count < (nr_objs % pool->nr_possible_cpus))
>  			nodes++;
>  		cpu_count++;
>  
> @@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool)
>  	if (!pool->cpu_slots)
>  		return;
>  
> -	for (i = 0; i < pool->nr_cpus; i++)
> +	for (i = 0; i < nr_cpu_ids; i++)
>  		kvfree(pool->cpu_slots[i]);
>  	kfree(pool->cpu_slots);
>  }
> @@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
>  
>  	/* initialize objpool pool */
>  	memset(pool, 0, sizeof(struct objpool_head));
> -	pool->nr_cpus = nr_cpu_ids;
> +	pool->nr_possible_cpus = num_possible_cpus();
>  	pool->obj_size = object_size;
>  	pool->capacity = capacity;
>  	pool->gfp = gfp & ~__GFP_ZERO;
>  	pool->context = context;
>  	pool->release = release;
> -	slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
> +	slot_size = nr_cpu_ids * sizeof(struct objpool_slot);
>  	pool->cpu_slots = kzalloc(slot_size, pool->gfp);
>  	if (!pool->cpu_slots)
>  		return -ENOMEM;
> -- 
> 2.43.0
> 
>
diff mbox series

Patch

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index d8b1f7b91128..cb1758eaa2d3 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -73,7 +73,7 @@  typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
  * struct objpool_head - object pooling metadata
  * @obj_size:   object size, aligned to sizeof(void *)
  * @nr_objs:    total objs (to be pre-allocated with objpool)
- * @nr_cpus:    local copy of nr_cpu_ids
+ * @nr_possible_cpus: cached value of num_possible_cpus()
  * @capacity:   max objs can be managed by one objpool_slot
  * @gfp:        gfp flags for kmalloc & vmalloc
  * @ref:        refcount of objpool
@@ -85,7 +85,7 @@  typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
 struct objpool_head {
 	int                     obj_size;
 	int                     nr_objs;
-	int                     nr_cpus;
+	int                     nr_possible_cpus;
 	int                     capacity;
 	gfp_t                   gfp;
 	refcount_t              ref;
@@ -176,7 +176,7 @@  static inline void *objpool_pop(struct objpool_head *pool)
 	raw_local_irq_save(flags);
 
 	cpu = raw_smp_processor_id();
-	for (i = 0; i < num_possible_cpus(); i++) {
+	for (i = 0; i < pool->nr_possible_cpus; i++) {
 		obj = __objpool_try_get_slot(pool, cpu);
 		if (obj)
 			break;
diff --git a/lib/objpool.c b/lib/objpool.c
index f696308fc026..234f9d0bd081 100644
--- a/lib/objpool.c
+++ b/lib/objpool.c
@@ -50,7 +50,7 @@  objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
 {
 	int i, cpu_count = 0;
 
-	for (i = 0; i < pool->nr_cpus; i++) {
+	for (i = 0; i < nr_cpu_ids; i++) {
 
 		struct objpool_slot *slot;
 		int nodes, size, rc;
@@ -60,8 +60,8 @@  objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
 			continue;
 
 		/* compute how many objects to be allocated with this slot */
-		nodes = nr_objs / num_possible_cpus();
-		if (cpu_count < (nr_objs % num_possible_cpus()))
+		nodes = nr_objs / pool->nr_possible_cpus;
+		if (cpu_count < (nr_objs % pool->nr_possible_cpus))
 			nodes++;
 		cpu_count++;
 
@@ -103,7 +103,7 @@  static void objpool_fini_percpu_slots(struct objpool_head *pool)
 	if (!pool->cpu_slots)
 		return;
 
-	for (i = 0; i < pool->nr_cpus; i++)
+	for (i = 0; i < nr_cpu_ids; i++)
 		kvfree(pool->cpu_slots[i]);
 	kfree(pool->cpu_slots);
 }
@@ -130,13 +130,13 @@  int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
 
 	/* initialize objpool pool */
 	memset(pool, 0, sizeof(struct objpool_head));
-	pool->nr_cpus = nr_cpu_ids;
+	pool->nr_possible_cpus = num_possible_cpus();
 	pool->obj_size = object_size;
 	pool->capacity = capacity;
 	pool->gfp = gfp & ~__GFP_ZERO;
 	pool->context = context;
 	pool->release = release;
-	slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
+	slot_size = nr_cpu_ids * sizeof(struct objpool_slot);
 	pool->cpu_slots = kzalloc(slot_size, pool->gfp);
 	if (!pool->cpu_slots)
 		return -ENOMEM;