Message ID | 20240424215214.3956041-3-andrii@kernel.org (mailing list archive)
---|---
State | Handled Elsewhere
Series | Objpool performance improvements

Context | Check | Description
---|---|---
netdev/tree_selection | success | Not a local patch
On Wed, Apr 24, 2024 at 02:52:14PM -0700, Andrii Nakryiko wrote:
> Profiling shows that calling nr_possible_cpus() in objpool_pop() takes
> a noticeable amount of CPU (when profiled on 80-core machine), as we
> need to recalculate number of set bits in a CPU bit mask. This number
> can't change, so there is no point in paying the price for recalculating
> it. As such, cache this value in struct objpool_head and use it in
> objpool_pop().
>
> On the other hand, cached pool->nr_cpus isn't necessary, as it's not
> used in hot path and is also a pretty trivial value to retrieve. So drop
> pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the size
> of struct objpool_head remains the same, which is a nice bonus.
>
> Same BPF selftests benchmarks were used to evaluate the effect. Using
> changes in previous patch (inlining of objpool_pop/objpool_push) as
> baseline, here are the differences:
>
> BASELINE
> ========
> kretprobe      :    9.937 ± 0.174M/s
> kretprobe-multi:   10.440 ± 0.108M/s
>
> AFTER
> =====
> kretprobe      :   10.106 ± 0.120M/s (+1.7%)
> kretprobe-multi:   10.515 ± 0.180M/s (+0.7%)

nice, overall lgtm

jirka

>
> Cc: Matt (Qiang) Wu <wuqiang.matt@bytedance.com>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  include/linux/objpool.h |  6 +++---
>  lib/objpool.c           | 12 ++++++------
>  2 files changed, 9 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/objpool.h b/include/linux/objpool.h
> index d8b1f7b91128..cb1758eaa2d3 100644
> --- a/include/linux/objpool.h
> +++ b/include/linux/objpool.h
> @@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
>   * struct objpool_head - object pooling metadata
>   * @obj_size:   object size, aligned to sizeof(void *)
>   * @nr_objs:    total objs (to be pre-allocated with objpool)
> - * @nr_cpus:    local copy of nr_cpu_ids
> + * @nr_possible_cpus: cached value of num_possible_cpus()
>   * @capacity:   max objs can be managed by one objpool_slot
>   * @gfp:        gfp flags for kmalloc & vmalloc
>   * @ref:        refcount of objpool
> @@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
>  struct objpool_head {
>          int obj_size;
>          int nr_objs;
> -        int nr_cpus;
> +        int nr_possible_cpus;
>          int capacity;
>          gfp_t gfp;
>          refcount_t ref;
> @@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool)
>          raw_local_irq_save(flags);
>
>          cpu = raw_smp_processor_id();
> -        for (i = 0; i < num_possible_cpus(); i++) {
> +        for (i = 0; i < pool->nr_possible_cpus; i++) {
>                  obj = __objpool_try_get_slot(pool, cpu);
>                  if (obj)
>                          break;
> diff --git a/lib/objpool.c b/lib/objpool.c
> index f696308fc026..234f9d0bd081 100644
> --- a/lib/objpool.c
> +++ b/lib/objpool.c
> @@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
>  {
>          int i, cpu_count = 0;
>
> -        for (i = 0; i < pool->nr_cpus; i++) {
> +        for (i = 0; i < nr_cpu_ids; i++) {
>
>                  struct objpool_slot *slot;
>                  int nodes, size, rc;
> @@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
>                          continue;
>
>                  /* compute how many objects to be allocated with this slot */
> -                nodes = nr_objs / num_possible_cpus();
> -                if (cpu_count < (nr_objs % num_possible_cpus()))
> +                nodes = nr_objs / pool->nr_possible_cpus;
> +                if (cpu_count < (nr_objs % pool->nr_possible_cpus))
>                          nodes++;
>                  cpu_count++;
>
> @@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool)
>          if (!pool->cpu_slots)
>                  return;
>
> -        for (i = 0; i < pool->nr_cpus; i++)
> +        for (i = 0; i < nr_cpu_ids; i++)
>                  kvfree(pool->cpu_slots[i]);
>          kfree(pool->cpu_slots);
>  }
> @@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
>
>          /* initialize objpool pool */
>          memset(pool, 0, sizeof(struct objpool_head));
> -        pool->nr_cpus = nr_cpu_ids;
> +        pool->nr_possible_cpus = num_possible_cpus();
>          pool->obj_size = object_size;
>          pool->capacity = capacity;
>          pool->gfp = gfp & ~__GFP_ZERO;
>          pool->context = context;
>          pool->release = release;
> -        slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
> +        slot_size = nr_cpu_ids * sizeof(struct objpool_slot);
>          pool->cpu_slots = kzalloc(slot_size, pool->gfp);
>          if (!pool->cpu_slots)
>                  return -ENOMEM;
> --
> 2.43.0
>
>
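The cost Jiri is acknowledging comes from num_possible_cpus() being a popcount of the possible-CPU mask (in the kernel it boils down to a bitmap_weight() over cpu_possible_mask), recomputed on every objpool_pop() call even though the mask never changes after boot. Below is a minimal userspace sketch of the before/after pattern; the mask contents and helper names are invented for illustration (assuming 64-bit unsigned long, the made-up mask has 80 possible CPUs), not the kernel implementation:

```c
#include <stdio.h>

/* Invented stand-in for the kernel's possible-CPU mask: fixed at boot. */
#define MASK_WORDS 4
static const unsigned long possible_mask[MASK_WORDS] = { ~0UL, 0xffffUL, 0, 0 };

/* Rough analogue of num_possible_cpus(): a popcount recomputed on every call. */
static int count_possible_cpus(void)
{
	int i, n = 0;

	for (i = 0; i < MASK_WORDS; i++)
		n += __builtin_popcountl(possible_mask[i]);
	return n;
}

struct pool {
	int nr_possible_cpus;	/* cached once at init, as the patch does */
};

int main(void)
{
	/*
	 * Before the patch, every pop would effectively redo the popcount to
	 * bound its scan; after it, the value is computed once here and only
	 * read from the pool in the hot path.
	 */
	struct pool p = { .nr_possible_cpus = count_possible_cpus() };

	printf("possible CPUs (cached): %d\n", p.nr_possible_cpus);
	return 0;
}
```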
diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index d8b1f7b91128..cb1758eaa2d3 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
  * struct objpool_head - object pooling metadata
  * @obj_size:   object size, aligned to sizeof(void *)
  * @nr_objs:    total objs (to be pre-allocated with objpool)
- * @nr_cpus:    local copy of nr_cpu_ids
+ * @nr_possible_cpus: cached value of num_possible_cpus()
  * @capacity:   max objs can be managed by one objpool_slot
  * @gfp:        gfp flags for kmalloc & vmalloc
  * @ref:        refcount of objpool
@@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
 struct objpool_head {
         int obj_size;
         int nr_objs;
-        int nr_cpus;
+        int nr_possible_cpus;
         int capacity;
         gfp_t gfp;
         refcount_t ref;
@@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool)
         raw_local_irq_save(flags);

         cpu = raw_smp_processor_id();
-        for (i = 0; i < num_possible_cpus(); i++) {
+        for (i = 0; i < pool->nr_possible_cpus; i++) {
                 obj = __objpool_try_get_slot(pool, cpu);
                 if (obj)
                         break;
diff --git a/lib/objpool.c b/lib/objpool.c
index f696308fc026..234f9d0bd081 100644
--- a/lib/objpool.c
+++ b/lib/objpool.c
@@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
 {
         int i, cpu_count = 0;

-        for (i = 0; i < pool->nr_cpus; i++) {
+        for (i = 0; i < nr_cpu_ids; i++) {

                 struct objpool_slot *slot;
                 int nodes, size, rc;
@@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
                         continue;

                 /* compute how many objects to be allocated with this slot */
-                nodes = nr_objs / num_possible_cpus();
-                if (cpu_count < (nr_objs % num_possible_cpus()))
+                nodes = nr_objs / pool->nr_possible_cpus;
+                if (cpu_count < (nr_objs % pool->nr_possible_cpus))
                         nodes++;
                 cpu_count++;

@@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool)
         if (!pool->cpu_slots)
                 return;

-        for (i = 0; i < pool->nr_cpus; i++)
+        for (i = 0; i < nr_cpu_ids; i++)
                 kvfree(pool->cpu_slots[i]);
         kfree(pool->cpu_slots);
 }
@@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,

         /* initialize objpool pool */
         memset(pool, 0, sizeof(struct objpool_head));
-        pool->nr_cpus = nr_cpu_ids;
+        pool->nr_possible_cpus = num_possible_cpus();
         pool->obj_size = object_size;
         pool->capacity = capacity;
         pool->gfp = gfp & ~__GFP_ZERO;
         pool->context = context;
         pool->release = release;
-        slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
+        slot_size = nr_cpu_ids * sizeof(struct objpool_slot);
         pool->cpu_slots = kzalloc(slot_size, pool->gfp);
         if (!pool->cpu_slots)
                 return -ENOMEM;
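With the objpool_pop() hunk above, the per-pop scan is bounded by a plain field read instead of a recomputed CPU count. The following is a hedged userspace sketch of that scan shape only: try_get_slot() and next_cpu_wrap() are invented stand-ins for __objpool_try_get_slot() and the kernel's cpumask iteration, and the real function additionally disables IRQs around the scan.

```c
#include <stdio.h>

#define NR_CPU_IDS 8	/* made-up CPU id space for the sketch */

/* Hypothetical per-CPU slot holding at most one object, unlike the real ring. */
struct slot { void *obj; };

struct pool {
	int nr_possible_cpus;		/* cached at init, as in the patch */
	struct slot slots[NR_CPU_IDS];
};

/* Invented stand-in for __objpool_try_get_slot(): take the slot's object, if any. */
static void *try_get_slot(struct pool *p, int cpu)
{
	void *obj = p->slots[cpu].obj;

	p->slots[cpu].obj = NULL;
	return obj;
}

/* Invented stand-in for advancing to the next possible CPU with wrap-around. */
static int next_cpu_wrap(int cpu)
{
	return (cpu + 1) % NR_CPU_IDS;
}

/* Pop-side scan: start at the current CPU, try at most nr_possible_cpus slots. */
static void *pool_pop(struct pool *p, int this_cpu)
{
	void *obj = NULL;
	int i, cpu = this_cpu;

	for (i = 0; i < p->nr_possible_cpus; i++) {	/* cached bound, no popcount */
		obj = try_get_slot(p, cpu);
		if (obj)
			break;
		cpu = next_cpu_wrap(cpu);
	}
	return obj;
}

int main(void)
{
	static int token = 42;
	struct pool p = { .nr_possible_cpus = NR_CPU_IDS };

	p.slots[5].obj = &token;	/* only CPU 5's slot has anything to pop */
	printf("popped %p (expected non-NULL)\n", pool_pop(&p, 2));
	return 0;
}
```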
Profiling shows that calling num_possible_cpus() in objpool_pop() takes a
noticeable amount of CPU (when profiled on an 80-core machine), as we need
to recalculate the number of set bits in a CPU bit mask. This number can't
change, so there is no point in paying the price of recalculating it. As
such, cache this value in struct objpool_head and use it in objpool_pop().

On the other hand, the cached pool->nr_cpus isn't necessary, as it's not
used in the hot path and is also a pretty trivial value to retrieve. So drop
pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the size of
struct objpool_head remains the same, which is a nice bonus.

The same BPF selftests benchmarks were used to evaluate the effect. Using
the changes in the previous patch (inlining of objpool_pop/objpool_push) as
the baseline, here are the differences:

BASELINE
========
kretprobe      :    9.937 ± 0.174M/s
kretprobe-multi:   10.440 ± 0.108M/s

AFTER
=====
kretprobe      :   10.106 ± 0.120M/s (+1.7%)
kretprobe-multi:   10.515 ± 0.180M/s (+0.7%)

Cc: Matt (Qiang) Wu <wuqiang.matt@bytedance.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/objpool.h |  6 +++---
 lib/objpool.c           | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)
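The commit message's other point, dropping pool->nr_cpus in favor of nr_cpu_ids, rests on the distinction between the two counts: nr_cpu_ids bounds the CPU id space (which can be sparse), so it is what sizes and walks the per-CPU slot array, while the cached num_possible_cpus() counts how many CPUs actually exist and is what the objects are divided across. Here is a small userspace sketch of that split with a made-up sparse "possible" mask; the names are invented for illustration and this is not the objpool code itself:

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Made-up sparse topology: CPU ids 0-7 exist, but only even ids are "possible". */
#define NR_CPU_IDS 8
static bool cpu_possible_sim(int cpu)
{
	return (cpu % 2) == 0;
}

int main(void)
{
	int nr_objs = 10, nr_possible = 0, cpu_count = 0, i;
	void **cpu_slots;

	/* Counted once up front, mirroring the cached pool->nr_possible_cpus. */
	for (i = 0; i < NR_CPU_IDS; i++)
		if (cpu_possible_sim(i))
			nr_possible++;

	/* The slot array is indexed by CPU id, so it is sized by NR_CPU_IDS. */
	cpu_slots = calloc(NR_CPU_IDS, sizeof(*cpu_slots));
	if (!cpu_slots)
		return 1;

	for (i = 0; i < NR_CPU_IDS; i++) {
		int nodes;

		if (!cpu_possible_sim(i))
			continue;	/* holes in the id space get no slot */

		/* Objects are split only across the CPUs that actually exist. */
		nodes = nr_objs / nr_possible;
		if (cpu_count < (nr_objs % nr_possible))
			nodes++;
		cpu_count++;

		printf("cpu %d gets %d objects\n", i, nodes);
	}

	free(cpu_slots);
	return 0;
}
```

With 4 possible CPUs and 10 objects, the sketch assigns 3, 3, 2, 2 objects to ids 0, 2, 4, 6, while ids 1, 3, 5, 7 never get a slot, which is why the array must still be nr_cpu_ids entries long.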