diff mbox series

[v3,bpf-next,1/3] perf: enable branch record for software events

Message ID 20210830214106.4142056-2-songliubraving@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: introduce bpf_get_branch_snapshot | expand

Checks

Context Check Description
bpf/vmtest-bpf-next success Kernel LATEST + selftests
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 17 maintainers not CCed: linux-perf-users@vger.kernel.org hpa@zytor.com alexander.shishkin@linux.intel.com jolsa@redhat.com andrii@kernel.org daniel@iogearbox.net tglx@linutronix.de x86@kernel.org kpsingh@kernel.org netdev@vger.kernel.org yhs@fb.com ast@kernel.org bp@alien8.de mark.rutland@arm.com kafai@fb.com john.fastabend@gmail.com namhyung@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 2463 this patch: 2463
netdev/kdoc success Errors and warnings before: 181 this patch: 181
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 86 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 2390 this patch: 2390
netdev/header_inline success Link

Commit Message

Song Liu Aug. 30, 2021, 9:41 p.m. UTC
The typical way to access branch record (e.g. Intel LBR) is via hardware
perf_event. For CPUs with FREEZE_LBRS_ON_PMI support, PMI could capture
reliable LBR. On the other hand, LBR could also be useful in non-PMI
scenario. For example, in kretprobe or bpf fexit program, LBR could
provide a lot of information on what happened with the function. Add API
to use branch record for software use.

Note that, when the software event triggers, it is necessary to stop the
branch record hardware asap. Therefore, static_call is used to remove some
branch instructions in this process.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 arch/x86/events/intel/core.c | 24 ++++++++++++++++++++++--
 include/linux/perf_event.h   | 24 ++++++++++++++++++++++++
 kernel/events/core.c         |  3 +++
 3 files changed, 49 insertions(+), 2 deletions(-)

Comments

Andrii Nakryiko Aug. 30, 2021, 10:05 p.m. UTC | #1
On Mon, Aug 30, 2021 at 2:42 PM Song Liu <songliubraving@fb.com> wrote:
>
> The typical way to access branch record (e.g. Intel LBR) is via hardware
> perf_event. For CPUs with FREEZE_LBRS_ON_PMI support, PMI could capture
> reliable LBR. On the other hand, LBR could also be useful in non-PMI
> scenario. For example, in kretprobe or bpf fexit program, LBR could
> provide a lot of information on what happened with the function. Add API
> to use branch record for software use.
>
> Note that, when the software event triggers, it is necessary to stop the
> branch record hardware asap. Therefore, static_call is used to remove some
> branch instructions in this process.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  arch/x86/events/intel/core.c | 24 ++++++++++++++++++++++--
>  include/linux/perf_event.h   | 24 ++++++++++++++++++++++++
>  kernel/events/core.c         |  3 +++
>  3 files changed, 49 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index ac6fd2dabf6a2..d28d0e12c112c 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -2155,9 +2155,9 @@ static void __intel_pmu_disable_all(void)
>
>  static void intel_pmu_disable_all(void)
>  {
> +       intel_pmu_lbr_disable_all();
>         __intel_pmu_disable_all();
>         intel_pmu_pebs_disable_all();
> -       intel_pmu_lbr_disable_all();
>  }
>
>  static void __intel_pmu_enable_all(int added, bool pmi)
> @@ -2186,6 +2186,20 @@ static void intel_pmu_enable_all(int added)
>         __intel_pmu_enable_all(added, false);
>  }
>
> +static int
> +intel_pmu_snapshot_branch_stack(struct perf_branch_snapshot *br_snapshot)
> +{
> +       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +
> +       intel_pmu_disable_all();
> +       intel_pmu_lbr_read();
> +       memcpy(br_snapshot->entries, cpuc->lbr_entries,
> +              sizeof(struct perf_branch_entry) * x86_pmu.lbr_nr);
> +       br_snapshot->nr = x86_pmu.lbr_nr;
> +       intel_pmu_enable_all(0);
> +       return 0;
> +}
> +
>  /*
>   * Workaround for:
>   *   Intel Errata AAK100 (model 26)
> @@ -6283,9 +6297,15 @@ __init int intel_pmu_init(void)
>                         x86_pmu.lbr_nr = 0;
>         }
>
> -       if (x86_pmu.lbr_nr)
> +       if (x86_pmu.lbr_nr) {
>                 pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
>
> +               /* only support branch_stack snapshot for perfmon >= v2 */
> +               if (x86_pmu.disable_all == intel_pmu_disable_all)
> +                       static_call_update(perf_snapshot_branch_stack,
> +                                          intel_pmu_snapshot_branch_stack);
> +       }
> +
>         intel_pmu_check_extra_regs(x86_pmu.extra_regs);
>
>         /* Support full width counters using alternative MSR range */
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index fe156a8170aa3..1f42e91668024 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -57,6 +57,7 @@ struct perf_guest_info_callbacks {
>  #include <linux/cgroup.h>
>  #include <linux/refcount.h>
>  #include <linux/security.h>
> +#include <linux/static_call.h>
>  #include <asm/local.h>
>
>  struct perf_callchain_entry {
> @@ -1612,4 +1613,27 @@ extern void __weak arch_perf_update_userpage(struct perf_event *event,
>  extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr);
>  #endif
>
> +/*
> + * Snapshot branch stack on software events.
> + *
> + * Branch stack can be very useful in understanding software events. For
> + * example, when a long function, e.g. sys_perf_event_open, returns an
> + * errno, it is not obvious why the function failed. Branch stack could
> + * provide very helpful information in this type of scenarios.
> + *
> + * On software event, it is necessary to stop the hardware branch recorder
> + * fast. Otherwise, the hardware register/buffer will be flushed with
> + * entries af the triggering event. Therefore, static call is used to
> + * stop the hardware recorder.
> + */
> +#define MAX_BRANCH_SNAPSHOT 32

Can you please make it an enum instead? It will make this available as
a constant in vmlinux.h nicely, without users having to #define it
every time.

> +
> +struct perf_branch_snapshot {
> +       unsigned int nr;
> +       struct perf_branch_entry entries[MAX_BRANCH_SNAPSHOT];
> +};
> +
> +typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_snapshot *);
> +DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
> +
>  #endif /* _LINUX_PERF_EVENT_H */
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 011cc5069b7ba..22807864e913b 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -13437,3 +13437,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
>         .threaded       = true,
>  };
>  #endif /* CONFIG_CGROUP_PERF */
> +
> +DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack,
> +                       perf_snapshot_branch_stack_t);
> --
> 2.30.2
>
Peter Zijlstra Aug. 31, 2021, 3:24 p.m. UTC | #2
On Mon, Aug 30, 2021 at 02:41:04PM -0700, Song Liu wrote:

> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index ac6fd2dabf6a2..d28d0e12c112c 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -2155,9 +2155,9 @@ static void __intel_pmu_disable_all(void)
>  
>  static void intel_pmu_disable_all(void)
>  {
> +	intel_pmu_lbr_disable_all();
>  	__intel_pmu_disable_all();
>  	intel_pmu_pebs_disable_all();
> -	intel_pmu_lbr_disable_all();
>  }

Hurmph... I'm not sure about that, I'd rather you sprinkle a few
__always_inline to ensure no actual function is called while you disable
things in the correct order.

You now still have a hole vs PMI.

> +static int
> +intel_pmu_snapshot_branch_stack(struct perf_branch_snapshot *br_snapshot)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

Note that this requires preemption is disabled, then look at the
call-sites in your next patch and spot the problem...

> +
> +	intel_pmu_disable_all();
> +	intel_pmu_lbr_read();
> +	memcpy(br_snapshot->entries, cpuc->lbr_entries,
> +	       sizeof(struct perf_branch_entry) * x86_pmu.lbr_nr);
> +	br_snapshot->nr = x86_pmu.lbr_nr;
> +	intel_pmu_enable_all(0);
> +	return 0;
> +}
> +
>  /*
>   * Workaround for:
>   *   Intel Errata AAK100 (model 26)
> @@ -6283,9 +6297,15 @@ __init int intel_pmu_init(void)
>  			x86_pmu.lbr_nr = 0;
>  	}
>  
> -	if (x86_pmu.lbr_nr)
> +	if (x86_pmu.lbr_nr) {
>  		pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
>  
> +		/* only support branch_stack snapshot for perfmon >= v2 */
> +		if (x86_pmu.disable_all == intel_pmu_disable_all)
								  {
> +			static_call_update(perf_snapshot_branch_stack,
> +					   intel_pmu_snapshot_branch_stack);

		}

> +	}
> +
>  	intel_pmu_check_extra_regs(x86_pmu.extra_regs);
>  
>  	/* Support full width counters using alternative MSR range */

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 011cc5069b7ba..22807864e913b 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -13437,3 +13437,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
>  	.threaded	= true,
>  };
>  #endif /* CONFIG_CGROUP_PERF */
> +
> +DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack,
> +			perf_snapshot_branch_stack_t);

I'll squint and accept 82 characters :-)
Song Liu Aug. 31, 2021, 4:12 p.m. UTC | #3
> On Aug 31, 2021, at 8:24 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Mon, Aug 30, 2021 at 02:41:04PM -0700, Song Liu wrote:
> 
>> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
>> index ac6fd2dabf6a2..d28d0e12c112c 100644
>> --- a/arch/x86/events/intel/core.c
>> +++ b/arch/x86/events/intel/core.c
>> @@ -2155,9 +2155,9 @@ static void __intel_pmu_disable_all(void)
>> 
>> static void intel_pmu_disable_all(void)
>> {
>> +	intel_pmu_lbr_disable_all();
>> 	__intel_pmu_disable_all();
>> 	intel_pmu_pebs_disable_all();
>> -	intel_pmu_lbr_disable_all();
>> }
> 
> Hurmph... I'm not sure about that, I'd rather you sprinkle a few
> __always_inline to ensure no actual function is called while you disable
> things in the correct order.
> 
> You now still have a hole vs PMI.

Hmm... I will move this back and try some inlining. It may require moving
some functions from ds.c/lbr.c to arch/x86/events/perf_event.h. But I guess
that is OK, as there are similar functions in the header. 

Thanks,
Song
diff mbox series

Patch

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ac6fd2dabf6a2..d28d0e12c112c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2155,9 +2155,9 @@  static void __intel_pmu_disable_all(void)
 
 static void intel_pmu_disable_all(void)
 {
+	intel_pmu_lbr_disable_all();
 	__intel_pmu_disable_all();
 	intel_pmu_pebs_disable_all();
-	intel_pmu_lbr_disable_all();
 }
 
 static void __intel_pmu_enable_all(int added, bool pmi)
@@ -2186,6 +2186,20 @@  static void intel_pmu_enable_all(int added)
 	__intel_pmu_enable_all(added, false);
 }
 
+static int
+intel_pmu_snapshot_branch_stack(struct perf_branch_snapshot *br_snapshot)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	intel_pmu_disable_all();
+	intel_pmu_lbr_read();
+	memcpy(br_snapshot->entries, cpuc->lbr_entries,
+	       sizeof(struct perf_branch_entry) * x86_pmu.lbr_nr);
+	br_snapshot->nr = x86_pmu.lbr_nr;
+	intel_pmu_enable_all(0);
+	return 0;
+}
+
 /*
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
@@ -6283,9 +6297,15 @@  __init int intel_pmu_init(void)
 			x86_pmu.lbr_nr = 0;
 	}
 
-	if (x86_pmu.lbr_nr)
+	if (x86_pmu.lbr_nr) {
 		pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
 
+		/* only support branch_stack snapshot for perfmon >= v2 */
+		if (x86_pmu.disable_all == intel_pmu_disable_all)
+			static_call_update(perf_snapshot_branch_stack,
+					   intel_pmu_snapshot_branch_stack);
+	}
+
 	intel_pmu_check_extra_regs(x86_pmu.extra_regs);
 
 	/* Support full width counters using alternative MSR range */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fe156a8170aa3..1f42e91668024 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -57,6 +57,7 @@  struct perf_guest_info_callbacks {
 #include <linux/cgroup.h>
 #include <linux/refcount.h>
 #include <linux/security.h>
+#include <linux/static_call.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -1612,4 +1613,27 @@  extern void __weak arch_perf_update_userpage(struct perf_event *event,
 extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr);
 #endif
 
+/*
+ * Snapshot branch stack on software events.
+ *
+ * Branch stack can be very useful in understanding software events. For
+ * example, when a long function, e.g. sys_perf_event_open, returns an
+ * errno, it is not obvious why the function failed. Branch stack could
+ * provide very helpful information in this type of scenarios.
+ *
+ * On software event, it is necessary to stop the hardware branch recorder
+ * fast. Otherwise, the hardware register/buffer will be flushed with
+ * entries af the triggering event. Therefore, static call is used to
+ * stop the hardware recorder.
+ */
+#define MAX_BRANCH_SNAPSHOT 32
+
+struct perf_branch_snapshot {
+	unsigned int nr;
+	struct perf_branch_entry entries[MAX_BRANCH_SNAPSHOT];
+};
+
+typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_snapshot *);
+DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
+
 #endif /* _LINUX_PERF_EVENT_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 011cc5069b7ba..22807864e913b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13437,3 +13437,6 @@  struct cgroup_subsys perf_event_cgrp_subsys = {
 	.threaded	= true,
 };
 #endif /* CONFIG_CGROUP_PERF */
+
+DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack,
+			perf_snapshot_branch_stack_t);