ARM: perf: Implement perf_arch_fetch_caller_regs

Message ID 1373685434-1581-1-git-send-email-jld@mozilla.com (mailing list archive)
State New, archived

Commit Message

Jed Davis July 13, 2013, 3:17 a.m. UTC
We need a perf_arch_fetch_caller_regs for at least some software events
to be able to get a callchain; even user stacks won't work without
at least the CPSR bits for non-user-mode (see perf_callchain).  In
particular, profiling context switches needs this.

This records the state of the point at which perf_arch_fetch_caller_regs
is expanded, instead of that function activation's call site, because we
need SP and PC to be consistent for EHABI unwinding; hopefully nothing
will be inconvenienced by the extra stack frame.

Signed-off-by: Jed Davis <jld@mozilla.com>
---
 arch/arm/include/asm/perf_event.h |   43 +++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
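
For context: perf_arch_fetch_caller_regs() is the arch backend of the
generic perf_fetch_caller_regs() hook, and the CPSR requirement comes
from the mode check used by the callchain code.  Roughly, paraphrased
from include/linux/perf_event.h and arch/arm/include/asm/ptrace.h of
this era (not part of the patch):

	/* include/linux/perf_event.h: without an arch-specific
	 * perf_arch_fetch_caller_regs, this hook is an empty stub and
	 * the zeroed pt_regs yields no usable callchain. */
	static inline void perf_fetch_caller_regs(struct pt_regs *regs)
	{
		perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
	}

	/* arch/arm/include/asm/ptrace.h: the callchain code keys off
	 * the CPSR mode bits to pick kernel vs. user unwinding, which
	 * is why ARM_cpsr must be filled in even for user stacks. */
	#define user_mode(regs)	(((regs)->ARM_cpsr & 0xf) == 0)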

Comments

Will Deacon July 15, 2013, 1:53 p.m. UTC | #1
Hi Jed,

On Sat, Jul 13, 2013 at 04:17:14AM +0100, Jed Davis wrote:
> We need a perf_arch_fetch_caller_regs for at least some software events
> to be able to get a callchain; even user stacks won't work without
> at least the CPSR bits for non-user-mode (see perf_callchain).  In
> particular, profiling context switches needs this.
> 
> This records the state of the point at which perf_arch_fetch_caller_regs
> is expanded, instead of that function activation's call site, because we
> need SP and PC to be consistent for EHABI unwinding; hopefully nothing
> will be inconvenienced by the extra stack frame.

[...]

> +#ifdef CONFIG_THUMB2_KERNEL
> +#define perf_arch_fetch_caller_regs(regs, ip)				\
> +	do {								\
> +		__u32 _cpsr, _pc;					\
> +		__asm__ __volatile__("str r7, [%[_regs], #(7 * 4)]\n\t" \
> +				     "str r13, [%[_regs], #(13 * 4)]\n\t" \
> +				     "str r14, [%[_regs], #(14 * 4)]\n\t" \

Is this safe? How can we be sure that the registers haven't been clobbered
by the callee before this macro is expanded? For example, you can end up
with the following code:

00000058 <perf_ftrace_function_call>:
  58:   e92d 43f0       stmdb   sp!, {r4, r5, r6, r7, r8, r9, lr}
  5c:   b09b            sub     sp, #108        ; 0x6c
  5e:   ac08            add     r4, sp, #32
  60:   4681            mov     r9, r0
  62:   4688            mov     r8, r1
  64:   4620            mov     r0, r4
  66:   2148            movs    r1, #72 ; 0x48
  68:   f7ff fffe       bl      0 <__memzero>
  6c:   61e7            str     r7, [r4, #28]
  6e:   f8c4 d034       str.w   sp, [r4, #52]   ; 0x34
  72:   f8c4 e038       str.w   lr, [r4, #56]   ; 0x38

but the call to __memzero will have corrupted the lr.

> +				     "mov %[_pc],  r15\n\t"		\
> +				     "mrs %[_cpsr], cpsr\n\t"		\
> +				     : [_cpsr] "=r" (_cpsr),		\
> +				       [_pc] "=r" (_pc)			\
> +				     : [_regs] "r" (&(regs)->uregs)	\

It would be cleaner to pass a separate argument for each register, using the
ARM_rN macros rather than calculating the offset by hand.
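
Something like the following, say (an untested sketch; "m" operands let
the compiler derive each offset from the ARM_rN field macros instead of
the hand-coded #(N * 4) immediates):

	__asm__ __volatile__("str r7, %[_r7]\n\t"
			     "str r13, %[_sp]\n\t"
			     "str r14, %[_lr]\n\t"
			     "mov %[_pc],  r15\n\t"
			     "mrs %[_cpsr], cpsr\n\t"
			     : [_r7] "=m" ((regs)->ARM_r7),
			       [_sp] "=m" ((regs)->ARM_sp),
			       [_lr] "=m" ((regs)->ARM_lr),
			       [_pc] "=r" (_pc),
			       [_cpsr] "=r" (_cpsr));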

Will
Jed Davis July 20, 2013, 3:43 a.m. UTC | #2
On Mon, Jul 15, 2013 at 02:53:42PM +0100, Will Deacon wrote:
> On Sat, Jul 13, 2013 at 04:17:14AM +0100, Jed Davis wrote:
[...]
> > +#ifdef CONFIG_THUMB2_KERNEL
> > +#define perf_arch_fetch_caller_regs(regs, ip)				\
> > +	do {								\
> > +		__u32 _cpsr, _pc;					\
> > +		__asm__ __volatile__("str r7, [%[_regs], #(7 * 4)]\n\t" \
> > +				     "str r13, [%[_regs], #(13 * 4)]\n\t" \
> > +				     "str r14, [%[_regs], #(14 * 4)]\n\t" \
> 
> Is this safe? How can we be sure that the registers haven't been clobbered
> by the callee before this macro is expanded? For example, you can end up
> with the following code:

They might be clobbered, but if they are, then...

> 00000058 <perf_ftrace_function_call>:
>   58:   e92d 43f0       stmdb   sp!, {r4, r5, r6, r7, r8, r9, lr}
>   5c:   b09b            sub     sp, #108        ; 0x6c
>   5e:   ac08            add     r4, sp, #32
>   60:   4681            mov     r9, r0
>   62:   4688            mov     r8, r1
>   64:   4620            mov     r0, r4
>   66:   2148            movs    r1, #72 ; 0x48
>   68:   f7ff fffe       bl      0 <__memzero>
>   6c:   61e7            str     r7, [r4, #28]
>   6e:   f8c4 d034       str.w   sp, [r4, #52]   ; 0x34
>   72:   f8c4 e038       str.w   lr, [r4, #56]   ; 0x38
> 
> but the call to __memzero will have corrupted the lr.

...the function's unwinding entry will restore them from the stack, and
can't assume anything about their values before then.  It's the same
problem as if the code was interrupted by a profiler timer at that point
and we tried unwinding from the trap state.

Hopefully.  It's hard to be as rigorous about this as I'd like, because
the Exception Handling ABI was designed for exception handling: as
specified, it only has to work at the points where C++ exceptions can
be thrown, as I understand it.  GCC isn't limited to that in practice,
but it isn't airtight either: -fasynchronous-unwind-tables doesn't
actually make unwinding work from *any* instruction, as its
documentation suggests (which is perhaps just as well, since doing that
correctly would significantly increase the size of .extab/.exidx).

All that said, the unwinding program for a function should work at any
point after the prologue and before the epilogue.
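
(For reference, the register state our unwinder tracks is struct
stackframe from arch/arm/include/asm/stacktrace.h, which is why the
patch records only the frame pointer, sp, lr and pc, plus CPSR:)

	struct stackframe {
		unsigned long fp;	/* r7 under CONFIG_THUMB2_KERNEL,
					 * r11 otherwise */
		unsigned long sp;
		unsigned long lr;
		unsigned long pc;
	};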

> > +				     "mov %[_pc],  r15\n\t"		\
> > +				     "mrs %[_cpsr], cpsr\n\t"		\
> > +				     : [_cpsr] "=r" (_cpsr),		\
> > +				       [_pc] "=r" (_pc)			\
> > +				     : [_regs] "r" (&(regs)->uregs)	\
> 
> It would be cleaner to pass a separate argument for each register, using the
> ARM_rN macros rather than calculating the offset by hand.

I'll do that.  If there were more arguments there might be a problem
at -O0, because the naive translation can run out of registers in
some cases, but that shouldn't be a problem here.  (Nor if someone
later extends this to all the core registers, because {r0-r13} can and
presumably should use a store-multiple.)
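
(A sketch of that hypothetical extension for the ARM-mode case, not
part of this patch; pc would still want the separate mov/mrs handling,
since stm of r15 stores it with an implementation-defined offset:)

	__u32 _cpsr;
	__asm__ __volatile__("stmia %[_regs], {r0 - r14}\n\t"
			     "mrs %[_cpsr], cpsr\n\t"
			     : [_cpsr] "=r" (_cpsr)
			     : [_regs] "r" (&(regs)->uregs[0])
			     : "memory");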

Thanks for the quick review, and sorry for the delay in responding.

--Jed
Will Deacon July 21, 2013, 9:39 p.m. UTC | #3
On Sat, Jul 20, 2013 at 04:43:21AM +0100, Jed Davis wrote:
> On Mon, Jul 15, 2013 at 02:53:42PM +0100, Will Deacon wrote:
> > > +				     "mov %[_pc],  r15\n\t"		\
> > > +				     "mrs %[_cpsr], cpsr\n\t"		\
> > > +				     : [_cpsr] "=r" (_cpsr),		\
> > > +				       [_pc] "=r" (_pc)			\
> > > +				     : [_regs] "r" (&(regs)->uregs)	\
> > 
> > It would be cleaner to pass a separate argument for each register, using the
> > ARM_rN macros rather than calculating the offset by hand.
> 
> I'll do that.  If there were more arguments there might be a problem
> at -O0, because the naive translation can run out of registers in
> some cases, but that shouldn't be a problem here.  (Nor if someone
> later extends this to all the core registers, because {r0-r13} can and
> presumably should use a store-multiple.)

We already rely on compiler optimisation for things like tlbflush.h, so we
don't need to worry about building with -O0.

Please send a v2 when you get a chance!

Will

Patch

diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index 7558775..2cc7255 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -12,6 +12,8 @@ 
 #ifndef __ARM_PERF_EVENT_H__
 #define __ARM_PERF_EVENT_H__
 
+#include <asm/ptrace.h>
+
 /*
  * The ARMv7 CPU PMU supports up to 32 event counters.
  */
@@ -28,4 +30,45 @@  extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)	perf_misc_flags(regs)
 #endif
 
+/*
+ * We can't actually get the caller's registers here; the saved PC and
+ * SP values have to be consistent or else EHABI unwinding won't work,
+ * and the only way to find the matching SP for the return address is
+ * to unwind the current function.  So we save the current state
+ * instead.
+ *
+ * Note that the ARM Exception Handling ABI allows unwinding to depend
+ * on the contents of any core register, but our unwinder is limited
+ * to the ones in struct stackframe (which are the only ones we expect
+ * GCC to need for kernel code), so we just record those.
+ */
+#ifdef CONFIG_THUMB2_KERNEL
+#define perf_arch_fetch_caller_regs(regs, ip)				\
+	do {								\
+		__u32 _cpsr, _pc;					\
+		__asm__ __volatile__("str r7, [%[_regs], #(7 * 4)]\n\t" \
+				     "str r13, [%[_regs], #(13 * 4)]\n\t" \
+				     "str r14, [%[_regs], #(14 * 4)]\n\t" \
+				     "mov %[_pc],  r15\n\t"		\
+				     "mrs %[_cpsr], cpsr\n\t"		\
+				     : [_cpsr] "=r" (_cpsr),		\
+				       [_pc] "=r" (_pc)			\
+				     : [_regs] "r" (&(regs)->uregs)	\
+				     : "memory");			\
+		(regs)->ARM_pc = _pc;					\
+		(regs)->ARM_cpsr = _cpsr;				\
+	} while (0)
+#else
+#define perf_arch_fetch_caller_regs(regs, ip)				\
+	do {								\
+		__u32 _cpsr;						\
+		__asm__ __volatile__("stmia %[_regs11], {r11 - r15}\n\t" \
+				     "mrs %[_cpsr], cpsr\n\t"		\
+				     : [_cpsr] "=r" (_cpsr)		\
+				     : [_regs11] "r" (&(regs)->uregs[11]) \
+				     : "memory");			\
+		(regs)->ARM_cpsr = _cpsr;				\
+	} while (0)
+#endif
+
 #endif /* __ARM_PERF_EVENT_H__ */
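
For context, the caller path that motivated the patch, paraphrased from
include/linux/perf_event.h of this era: context switches reach it with
regs == NULL via perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, ...).

	static __always_inline void
	perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
	{
		struct pt_regs hot_regs;

		if (static_key_false(&perf_swevent_enabled[event_id])) {
			if (!regs) {
				/* No trap state to hand: capture the
				 * caller's state via the arch hook. */
				perf_fetch_caller_regs(&hot_regs);
				regs = &hot_regs;
			}
			__perf_sw_event(event_id, nr, regs, addr);
		}
	}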