| Message ID | 20220704150514.48816-7-elver@google.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | perf/hw_breakpoint: Optimize for thousands of tasks |
On Mon, Jul 4, 2022 at 8:06 AM Marco Elver <elver@google.com> wrote:
>
> Optimize internal hw_breakpoint state if the architecture's number of
> breakpoint slots is constant. This avoids several kmalloc() calls and
> potentially unnecessary failures if the allocations fail, as well as
> subtly improves code generation and cache locality.
>
> The protocol is that if an architecture defines hw_breakpoint_slots via
> the preprocessor, it must be constant and the same for all types.
>
> Signed-off-by: Marco Elver <elver@google.com>
> Acked-by: Dmitry Vyukov <dvyukov@google.com>

Acked-by: Ian Rogers <irogers@google.com>

Thanks,
Ian

> ---
>  arch/sh/include/asm/hw_breakpoint.h  |  5 +-
>  arch/x86/include/asm/hw_breakpoint.h |  5 +-
>  kernel/events/hw_breakpoint.c        | 94 ++++++++++++++++++----------
>  3 files changed, 63 insertions(+), 41 deletions(-)
>
> diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h
> index 199d17b765f2..361a0f57bdeb 100644
> --- a/arch/sh/include/asm/hw_breakpoint.h
> +++ b/arch/sh/include/asm/hw_breakpoint.h
> @@ -48,10 +48,7 @@ struct pmu;
>  /* Maximum number of UBC channels */
>  #define HBP_NUM 2
>
> -static inline int hw_breakpoint_slots(int type)
> -{
> -        return HBP_NUM;
> -}
> +#define hw_breakpoint_slots(type) (HBP_NUM)
>
>  /* arch/sh/kernel/hw_breakpoint.c */
>  extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
> diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
> index a1f0e90d0818..0bc931cd0698 100644
> --- a/arch/x86/include/asm/hw_breakpoint.h
> +++ b/arch/x86/include/asm/hw_breakpoint.h
> @@ -44,10 +44,7 @@ struct arch_hw_breakpoint {
>  /* Total number of available HW breakpoint registers */
>  #define HBP_NUM 4
>
> -static inline int hw_breakpoint_slots(int type)
> -{
> -        return HBP_NUM;
> -}
> +#define hw_breakpoint_slots(type) (HBP_NUM)
>
>  struct perf_event_attr;
>  struct perf_event;
> diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
> index 7df46b276452..9fb66d358d81 100644
> --- a/kernel/events/hw_breakpoint.c
> +++ b/kernel/events/hw_breakpoint.c
> @@ -40,13 +40,16 @@ struct bp_cpuinfo {
>          /* Number of pinned cpu breakpoints in a cpu */
>          unsigned int cpu_pinned;
>          /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
> +#ifdef hw_breakpoint_slots
> +        unsigned int tsk_pinned[hw_breakpoint_slots(0)];
> +#else
>          unsigned int *tsk_pinned;
> +#endif
>          /* Number of non-pinned cpu/task breakpoints in a cpu */
>          unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
>  };
>
>  static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
> -static int nr_slots[TYPE_MAX] __ro_after_init;
>
>  static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
>  {
> @@ -73,6 +76,54 @@ struct bp_busy_slots {
>  /* Serialize accesses to the above constraints */
>  static DEFINE_MUTEX(nr_bp_mutex);
>
> +#ifdef hw_breakpoint_slots
> +/*
> + * Number of breakpoint slots is constant, and the same for all types.
> + */
> +static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
> +static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
> +static inline int init_breakpoint_slots(void) { return 0; }
> +#else
> +/*
> + * Dynamic number of breakpoint slots.
> + */
> +static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
> +
> +static inline int hw_breakpoint_slots_cached(int type)
> +{
> +        return __nr_bp_slots[type];
> +}
> +
> +static __init int init_breakpoint_slots(void)
> +{
> +        int i, cpu, err_cpu;
> +
> +        for (i = 0; i < TYPE_MAX; i++)
> +                __nr_bp_slots[i] = hw_breakpoint_slots(i);
> +
> +        for_each_possible_cpu(cpu) {
> +                for (i = 0; i < TYPE_MAX; i++) {
> +                        struct bp_cpuinfo *info = get_bp_info(cpu, i);
> +
> +                        info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL);
> +                        if (!info->tsk_pinned)
> +                                goto err;
> +                }
> +        }
> +
> +        return 0;
> +err:
> +        for_each_possible_cpu(err_cpu) {
> +                for (i = 0; i < TYPE_MAX; i++)
> +                        kfree(get_bp_info(err_cpu, i)->tsk_pinned);
> +                if (err_cpu == cpu)
> +                        break;
> +        }
> +
> +        return -ENOMEM;
> +}
> +#endif
> +
>  __weak int hw_breakpoint_weight(struct perf_event *bp)
>  {
>          return 1;
> @@ -95,7 +146,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
>          unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
>          int i;
>
> -        for (i = nr_slots[type] - 1; i >= 0; i--) {
> +        for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
>                  if (tsk_pinned[i] > 0)
>                          return i + 1;
>          }
> @@ -312,7 +363,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
>          fetch_this_slot(&slots, weight);
>
>          /* Flexible counters need to keep at least one slot */
> -        if (slots.pinned + (!!slots.flexible) > nr_slots[type])
> +        if (slots.pinned + (!!slots.flexible) > hw_breakpoint_slots_cached(type))
>                  return -ENOSPC;
>
>          ret = arch_reserve_bp_slot(bp);
> @@ -632,7 +683,7 @@ bool hw_breakpoint_is_used(void)
>                          if (info->cpu_pinned)
>                                  return true;
>
> -                        for (int slot = 0; slot < nr_slots[type]; ++slot) {
> +                        for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
>                                  if (info->tsk_pinned[slot])
>                                          return true;
>                          }
> @@ -716,42 +767,19 @@ static struct pmu perf_breakpoint = {
>
>  int __init init_hw_breakpoint(void)
>  {
> -        int cpu, err_cpu;
> -        int i, ret;
> -
> -        for (i = 0; i < TYPE_MAX; i++)
> -                nr_slots[i] = hw_breakpoint_slots(i);
> -
> -        for_each_possible_cpu(cpu) {
> -                for (i = 0; i < TYPE_MAX; i++) {
> -                        struct bp_cpuinfo *info = get_bp_info(cpu, i);
> -
> -                        info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
> -                                                   GFP_KERNEL);
> -                        if (!info->tsk_pinned) {
> -                                ret = -ENOMEM;
> -                                goto err;
> -                        }
> -                }
> -        }
> +        int ret;
>
>          ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
>          if (ret)
> -                goto err;
> +                return ret;
> +
> +        ret = init_breakpoint_slots();
> +        if (ret)
> +                return ret;
>
>          constraints_initialized = true;
>
>          perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
>
>          return register_die_notifier(&hw_breakpoint_exceptions_nb);
> -
> -err:
> -        for_each_possible_cpu(err_cpu) {
> -                for (i = 0; i < TYPE_MAX; i++)
> -                        kfree(get_bp_info(err_cpu, i)->tsk_pinned);
> -                if (err_cpu == cpu)
> -                        break;
> -        }
> -
> -        return ret;
>  }
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
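The opt-in protocol described in the commit message — a compile-time constant slot count selected with #ifdef, with an allocation-backed fallback when the count is only known at boot — can be sketched as a small standalone C program. The demo_* names below are hypothetical and only illustrate the shape of the pattern; they are not part of the kernel patch shown below.

```c
#include <stdio.h>
#include <stdlib.h>

/* Comment this out to exercise the dynamic fallback path. */
#define demo_slots(type) (4)

struct demo_cpuinfo {
#ifdef demo_slots
	unsigned int tsk_pinned[demo_slots(0)]; /* fixed size, no allocation */
#else
	unsigned int *tsk_pinned;               /* sized at init time */
#endif
};

#ifdef demo_slots
/* Constant number of slots: everything resolves at compile time. */
static inline int demo_slots_cached(int type) { return demo_slots(type); }
static inline int demo_init(struct demo_cpuinfo *info) { (void)info; return 0; }
#else
/* Dynamic number of slots: cache the count and allocate per instance. */
static int demo_nr_slots = 4; /* would be queried from the "arch" at boot */
static inline int demo_slots_cached(int type) { (void)type; return demo_nr_slots; }
static inline int demo_init(struct demo_cpuinfo *info)
{
	info->tsk_pinned = calloc(demo_nr_slots, sizeof(unsigned int));
	return info->tsk_pinned ? 0 : -1;
}
#endif

int main(void)
{
	struct demo_cpuinfo info = { 0 };

	if (demo_init(&info))
		return 1;
	printf("breakpoint slots: %d\n", demo_slots_cached(0));
	return 0;
}
```

Because the macro must expand to an integer constant expression in the constant case, it can both size the in-struct array and feed the static_assert() seen in the patch.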
diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h
index 199d17b765f2..361a0f57bdeb 100644
--- a/arch/sh/include/asm/hw_breakpoint.h
+++ b/arch/sh/include/asm/hw_breakpoint.h
@@ -48,10 +48,7 @@ struct pmu;
 /* Maximum number of UBC channels */
 #define HBP_NUM 2
 
-static inline int hw_breakpoint_slots(int type)
-{
-        return HBP_NUM;
-}
+#define hw_breakpoint_slots(type) (HBP_NUM)
 
 /* arch/sh/kernel/hw_breakpoint.c */
 extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index a1f0e90d0818..0bc931cd0698 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -44,10 +44,7 @@ struct arch_hw_breakpoint {
 /* Total number of available HW breakpoint registers */
 #define HBP_NUM 4
 
-static inline int hw_breakpoint_slots(int type)
-{
-        return HBP_NUM;
-}
+#define hw_breakpoint_slots(type) (HBP_NUM)
 
 struct perf_event_attr;
 struct perf_event;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 7df46b276452..9fb66d358d81 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -40,13 +40,16 @@ struct bp_cpuinfo {
         /* Number of pinned cpu breakpoints in a cpu */
         unsigned int cpu_pinned;
         /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
+#ifdef hw_breakpoint_slots
+        unsigned int tsk_pinned[hw_breakpoint_slots(0)];
+#else
         unsigned int *tsk_pinned;
+#endif
         /* Number of non-pinned cpu/task breakpoints in a cpu */
         unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
 };
 
 static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
-static int nr_slots[TYPE_MAX] __ro_after_init;
 
 static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
 {
@@ -73,6 +76,54 @@ struct bp_busy_slots {
 /* Serialize accesses to the above constraints */
 static DEFINE_MUTEX(nr_bp_mutex);
 
+#ifdef hw_breakpoint_slots
+/*
+ * Number of breakpoint slots is constant, and the same for all types.
+ */
+static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
+static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
+static inline int init_breakpoint_slots(void) { return 0; }
+#else
+/*
+ * Dynamic number of breakpoint slots.
+ */
+static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
+
+static inline int hw_breakpoint_slots_cached(int type)
+{
+        return __nr_bp_slots[type];
+}
+
+static __init int init_breakpoint_slots(void)
+{
+        int i, cpu, err_cpu;
+
+        for (i = 0; i < TYPE_MAX; i++)
+                __nr_bp_slots[i] = hw_breakpoint_slots(i);
+
+        for_each_possible_cpu(cpu) {
+                for (i = 0; i < TYPE_MAX; i++) {
+                        struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+                        info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL);
+                        if (!info->tsk_pinned)
+                                goto err;
+                }
+        }
+
+        return 0;
+err:
+        for_each_possible_cpu(err_cpu) {
+                for (i = 0; i < TYPE_MAX; i++)
+                        kfree(get_bp_info(err_cpu, i)->tsk_pinned);
+                if (err_cpu == cpu)
+                        break;
+        }
+
+        return -ENOMEM;
+}
+#endif
+
 __weak int hw_breakpoint_weight(struct perf_event *bp)
 {
         return 1;
@@ -95,7 +146,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
         unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
         int i;
 
-        for (i = nr_slots[type] - 1; i >= 0; i--) {
+        for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
                 if (tsk_pinned[i] > 0)
                         return i + 1;
         }
@@ -312,7 +363,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
         fetch_this_slot(&slots, weight);
 
         /* Flexible counters need to keep at least one slot */
-        if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+        if (slots.pinned + (!!slots.flexible) > hw_breakpoint_slots_cached(type))
                 return -ENOSPC;
 
         ret = arch_reserve_bp_slot(bp);
@@ -632,7 +683,7 @@ bool hw_breakpoint_is_used(void)
                         if (info->cpu_pinned)
                                 return true;
 
-                        for (int slot = 0; slot < nr_slots[type]; ++slot) {
+                        for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
                                 if (info->tsk_pinned[slot])
                                         return true;
                         }
@@ -716,42 +767,19 @@ static struct pmu perf_breakpoint = {
 
 int __init init_hw_breakpoint(void)
 {
-        int cpu, err_cpu;
-        int i, ret;
-
-        for (i = 0; i < TYPE_MAX; i++)
-                nr_slots[i] = hw_breakpoint_slots(i);
-
-        for_each_possible_cpu(cpu) {
-                for (i = 0; i < TYPE_MAX; i++) {
-                        struct bp_cpuinfo *info = get_bp_info(cpu, i);
-
-                        info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
-                                                   GFP_KERNEL);
-                        if (!info->tsk_pinned) {
-                                ret = -ENOMEM;
-                                goto err;
-                        }
-                }
-        }
+        int ret;
 
         ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
         if (ret)
-                goto err;
+                return ret;
+
+        ret = init_breakpoint_slots();
+        if (ret)
+                return ret;
 
         constraints_initialized = true;
 
         perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
 
         return register_die_notifier(&hw_breakpoint_exceptions_nb);
-
-err:
-        for_each_possible_cpu(err_cpu) {
-                for (i = 0; i < TYPE_MAX; i++)
-                        kfree(get_bp_info(err_cpu, i)->tsk_pinned);
-                if (err_cpu == cpu)
-                        break;
-        }
-
-        return ret;
 }
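A note on the error path in init_breakpoint_slots() above: the unwind loop revisits every CPU up to and including the one that failed, because the failing CPU may already have tsk_pinned allocated for earlier types; kfree() of the not-yet-allocated entries is a no-op since per-CPU data starts out zeroed. A minimal userspace sketch of the same idiom, with hypothetical names and calloc()/free() standing in for kcalloc()/kfree():

```c
#include <stdlib.h>

#define NR_CPUS  4
#define NR_TYPES 2
#define NR_SLOTS 4

/* Static storage starts zeroed, so free(NULL) in the unwind is always safe. */
static unsigned int *tsk_pinned[NR_CPUS][NR_TYPES];

static int demo_init_slots(void)
{
	int cpu, type, err_cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		for (type = 0; type < NR_TYPES; type++) {
			tsk_pinned[cpu][type] = calloc(NR_SLOTS, sizeof(unsigned int));
			if (!tsk_pinned[cpu][type])
				goto err;
		}
	}
	return 0;
err:
	/*
	 * Unwind: the failing CPU may have earlier types allocated, so only
	 * stop after its row has been processed as well.
	 */
	for (err_cpu = 0; err_cpu < NR_CPUS; err_cpu++) {
		for (type = 0; type < NR_TYPES; type++)
			free(tsk_pinned[err_cpu][type]);
		if (err_cpu == cpu)
			break;
	}
	return -1;
}

int main(void)
{
	return demo_init_slots() ? 1 : 0;
}
```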