Message ID: 20220704150514.48816-5-elver@google.com (mailing list archive)
State:      New, archived
Series:     perf/hw_breakpoint: Optimize for thousands of tasks
On Mon, Jul 4, 2022 at 8:06 AM Marco Elver <elver@google.com> wrote:
>
> On a machine with 256 CPUs, running the recently added perf breakpoint
> benchmark results in:
>
> | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
> | # Running 'breakpoint/thread' benchmark:
> | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
> | Total time: 236.418 [sec]
> |
> |   123134.794271 usecs/op
> |  7880626.833333 usecs/op/cpu
>
> The benchmark tests inherited breakpoint perf events across many
> threads.
>
> Looking at a perf profile, we can see that the majority of the time is
> spent in various hw_breakpoint.c functions, which execute within the
> 'nr_bp_mutex' critical sections which then results in contention on that
> mutex as well:
>
>     37.27%  [kernel]  [k] osq_lock
>     34.92%  [kernel]  [k] mutex_spin_on_owner
>     12.15%  [kernel]  [k] toggle_bp_slot
>     11.90%  [kernel]  [k] __reserve_bp_slot
>
> The culprit here is task_bp_pinned(), which has a runtime complexity of
> O(#tasks) due to storing all task breakpoints in the same list and
> iterating through that list looking for a matching task. Clearly, this
> does not scale to thousands of tasks.
>
> Instead, make use of the "rhashtable" variant "rhltable" which stores
> multiple items with the same key in a list. This results in average
> runtime complexity of O(1) for task_bp_pinned().
>
> With the optimization, the benchmark shows:
>
> | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
> | # Running 'breakpoint/thread' benchmark:
> | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
> | Total time: 0.208 [sec]
> |
> |     108.422396 usecs/op
> |    6939.033333 usecs/op/cpu
>
> On this particular setup that's a speedup of ~1135x.
>
> While one option would be to make task_struct a breakpoint list node,
> this would only further bloat task_struct for infrequently used data.
> Furthermore, after all optimizations in this series, there's no evidence
> it would result in better performance: later optimizations make the time
> spent looking up entries in the hash table negligible (we'll reach the
> theoretical ideal performance i.e. no constraints).
>
> Signed-off-by: Marco Elver <elver@google.com>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
> ---
> v2:
> * Commit message tweaks.
> ---
>  include/linux/perf_event.h    |  3 +-
>  kernel/events/hw_breakpoint.c | 56 ++++++++++++++++++++++-------------
>  2 files changed, 37 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 01231f1d976c..e27360436dc6 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks {
>  };
>
>  #ifdef CONFIG_HAVE_HW_BREAKPOINT
> +#include <linux/rhashtable-types.h>
>  #include <asm/hw_breakpoint.h>
>  #endif
>
> @@ -178,7 +179,7 @@ struct hw_perf_event {
>  			 * creation and event initalization.
>  			 */
>  			struct arch_hw_breakpoint	info;
> -			struct list_head		bp_list;
> +			struct rhlist_head		bp_list;

nit: perhaps it would be more intention revealing here to rename this
to bp_hashtable?

Acked-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

> [...]
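For readers unfamiliar with the API referenced above: rhltable is the rhashtable variant whose buckets hold lists of entries sharing one key, which is what gives task_bp_pinned() its O(1) average lookup. The following is only a minimal sketch of that pattern, not the hw_breakpoint code itself; the names bp_entry, bp_ht and bp_weight_for_task are made up for illustration.

/*
 * Illustrative sketch of the rhltable pattern the patch adopts:
 * entries keyed by a pointer-sized field, with multiple entries
 * allowed per key. Names here are hypothetical.
 */
#include <linux/rhashtable.h>

struct bp_entry {
	struct task_struct	*target;	/* key: owning task */
	int			weight;
	struct rhlist_head	node;		/* linkage in the bucket's list */
};

static struct rhltable bp_ht;
static const struct rhashtable_params bp_ht_params = {
	.head_offset = offsetof(struct bp_entry, node),
	.key_offset  = offsetof(struct bp_entry, target),
	.key_len     = sizeof_field(struct bp_entry, target),
	.automatic_shrinking = true,
};

/* O(1) on average: only the bucket list for @tsk is walked. */
static int bp_weight_for_task(struct task_struct *tsk)
{
	struct rhlist_head *head, *pos;
	struct bp_entry *e;
	int sum = 0;

	rcu_read_lock();
	head = rhltable_lookup(&bp_ht, &tsk, bp_ht_params);
	if (head) {
		rhl_for_each_entry_rcu(e, pos, head, node)
			sum += e->weight;
	}
	rcu_read_unlock();
	return sum;
}

The patch itself avoids a separate entry allocation by embedding the rhlist_head directly in struct hw_perf_event and keying on hw.target, as shown in the diff below.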
On Wed, 20 Jul 2022 at 17:29, Ian Rogers <irogers@google.com> wrote:
> On Mon, Jul 4, 2022 at 8:06 AM Marco Elver <elver@google.com> wrote:
> >
> > [...]
> >
> > @@ -178,7 +179,7 @@ struct hw_perf_event {
> >  			 * creation and event initalization.
> >  			 */
> >  			struct arch_hw_breakpoint	info;
> > -			struct list_head		bp_list;
> > +			struct rhlist_head		bp_list;
>
> nit: perhaps it would be more intention revealing here to rename this
> to bp_hashtable?

The naming convention for uses of rhlist_head appears to be either
'list' or 'node' (also inside lib/rhashtable.c). I think this makes
sense because internally this struct is used to just append to the
bucket's list.

> Acked-by: Ian Rogers <irogers@google.com>

Thanks!

-- Marco
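For context on that naming point: rhlist_head is itself only a per-bucket list node (shown here roughly as defined in <linux/rhashtable-types.h> around the time of this series), so a 'list'/'node' style name describes the member's role; the hash table proper is the separate struct rhltable (task_bps_ht in the patch).

/* Per-object hash-table linkage: */
struct rhash_head {
	struct rhash_head __rcu		*next;
};

/* Chains objects that share a key within one bucket: */
struct rhlist_head {
	struct rhash_head		rhead;
	struct rhlist_head __rcu	*next;
};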
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 01231f1d976c..e27360436dc6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -36,6 +36,7 @@ struct perf_guest_info_callbacks {
 };
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
+#include <linux/rhashtable-types.h>
 #include <asm/hw_breakpoint.h>
 #endif
 
@@ -178,7 +179,7 @@ struct hw_perf_event {
 			 * creation and event initalization.
 			 */
 			struct arch_hw_breakpoint	info;
-			struct list_head		bp_list;
+			struct rhlist_head		bp_list;
 		};
 #endif
 		struct { /* amd_iommu */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 6076c6346291..6d09edc80d19 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -26,10 +26,10 @@
 #include <linux/irqflags.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
-#include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/percpu.h>
+#include <linux/rhashtable.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -54,7 +54,13 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
 }
 
 /* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+	.head_offset = offsetof(struct hw_perf_event, bp_list),
+	.key_offset = offsetof(struct hw_perf_event, target),
+	.key_len = sizeof_field(struct hw_perf_event, target),
+	.automatic_shrinking = true,
+};
 
 static int constraints_initialized;
 
@@ -103,17 +109,23 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 {
-	struct task_struct *tsk = bp->hw.target;
+	struct rhlist_head *head, *pos;
 	struct perf_event *iter;
 	int count = 0;
 
-	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->hw.target == tsk &&
-		    find_slot_idx(iter->attr.bp_type) == type &&
+	rcu_read_lock();
+	head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+	if (!head)
+		goto out;
+
+	rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+		if (find_slot_idx(iter->attr.bp_type) == type &&
 		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
 	}
 
+out:
+	rcu_read_unlock();
 	return count;
 }
 
@@ -186,7 +198,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
 /*
  * Add/remove the given breakpoint in our constraint table
  */
-static void
+static int
 toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	       int weight)
 {
@@ -199,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	/* Pinned counter cpu profiling */
 	if (!bp->hw.target) {
 		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
-		return;
+		return 0;
 	}
 
 	/* Pinned counter task profiling */
@@ -207,9 +219,9 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 		toggle_bp_task_slot(bp, cpu, type, weight);
 
 	if (enable)
-		list_add_tail(&bp->hw.bp_list, &bp_task_head);
+		return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
 	else
-		list_del(&bp->hw.bp_list);
+		return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
 }
 
 __weak int arch_reserve_bp_slot(struct perf_event *bp)
@@ -307,9 +319,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
 	if (ret)
 		return ret;
 
-	toggle_bp_slot(bp, true, type, weight);
-
-	return 0;
+	return toggle_bp_slot(bp, true, type, weight);
 }
 
 int reserve_bp_slot(struct perf_event *bp)
@@ -334,7 +344,7 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
 
 	type = find_slot_idx(bp_type);
 	weight = hw_breakpoint_weight(bp);
-	toggle_bp_slot(bp, false, type, weight);
+	WARN_ON(toggle_bp_slot(bp, false, type, weight));
 }
 
 void release_bp_slot(struct perf_event *bp)
@@ -707,7 +717,7 @@ static struct pmu perf_breakpoint = {
 int __init init_hw_breakpoint(void)
 {
 	int cpu, err_cpu;
-	int i;
+	int i, ret;
 
 	for (i = 0; i < TYPE_MAX; i++)
 		nr_slots[i] = hw_breakpoint_slots(i);
@@ -718,18 +728,24 @@ int __init init_hw_breakpoint(void)
 
 			info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
 						   GFP_KERNEL);
-			if (!info->tsk_pinned)
-				goto err_alloc;
+			if (!info->tsk_pinned) {
+				ret = -ENOMEM;
+				goto err;
+			}
 		}
 	}
 
+	ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+	if (ret)
+		goto err;
+
 	constraints_initialized = 1;
 
 	perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
 
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
 
- err_alloc:
+err:
 	for_each_possible_cpu(err_cpu) {
 		for (i = 0; i < TYPE_MAX; i++)
 			kfree(get_bp_info(err_cpu, i)->tsk_pinned);
@@ -737,7 +753,5 @@ int __init init_hw_breakpoint(void)
 		break;
 	}
 
-	return -ENOMEM;
+	return ret;
 }
-
-