mbox series

[0/6] A few cpuidle vs rcu fixes

Message ID 20230123205009.790550642@infradead.org (mailing list archive)
Headers show
Series A few cpuidle vs rcu fixes | expand

Message

Peter Zijlstra Jan. 23, 2023, 8:50 p.m. UTC
0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.

These patches appear to cure this, the ftrace selftest now runs to completion
without spamming scary messages to dmesg.

---
 arch/x86/include/asm/atomic64_32.h | 44 +++++++++++++++++++-------------------
 arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++----------------
 arch/x86/include/asm/kvmclock.h    |  2 +-
 arch/x86/include/asm/paravirt.h    |  2 +-
 arch/x86/include/asm/pvclock.h     |  3 ++-
 arch/x86/kernel/cpu/vmware.c       |  2 +-
 arch/x86/kernel/ftrace.c           |  3 +++
 arch/x86/kernel/kvmclock.c         |  6 +++---
 arch/x86/kernel/pvclock.c          | 22 +++++++++++++------
 arch/x86/kernel/tsc.c              |  7 +++---
 arch/x86/xen/time.c                | 12 +++++++++--
 drivers/cpuidle/cpuidle.c          |  2 +-
 drivers/cpuidle/poll_state.c       |  2 --
 include/linux/math64.h             |  4 ++--
 include/linux/sched/clock.h        |  8 +++----
 kernel/sched/clock.c               | 27 +++++++++++++++++------
 16 files changed, 107 insertions(+), 75 deletions(-)

Comments

Mark Rutland Jan. 24, 2023, 4:34 p.m. UTC | #1
Hi Peter,

On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.

Do you have a link to the splat somewhere?

I'm assuming that this is partially generic, and I'd like to make sure I test
the right thing on arm64. I'll throw my usual lockdep options at the ftrace
selftests...

Thanks,
Mark.

> 
> These patches appear to cure this, the ftrace selftest now runs to completion
> without spamming scary messages to dmesg.
> 
> ---
>  arch/x86/include/asm/atomic64_32.h | 44 +++++++++++++++++++-------------------
>  arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++----------------
>  arch/x86/include/asm/kvmclock.h    |  2 +-
>  arch/x86/include/asm/paravirt.h    |  2 +-
>  arch/x86/include/asm/pvclock.h     |  3 ++-
>  arch/x86/kernel/cpu/vmware.c       |  2 +-
>  arch/x86/kernel/ftrace.c           |  3 +++
>  arch/x86/kernel/kvmclock.c         |  6 +++---
>  arch/x86/kernel/pvclock.c          | 22 +++++++++++++------
>  arch/x86/kernel/tsc.c              |  7 +++---
>  arch/x86/xen/time.c                | 12 +++++++++--
>  drivers/cpuidle/cpuidle.c          |  2 +-
>  drivers/cpuidle/poll_state.c       |  2 --
>  include/linux/math64.h             |  4 ++--
>  include/linux/sched/clock.h        |  8 +++----
>  kernel/sched/clock.c               | 27 +++++++++++++++++------
>  16 files changed, 107 insertions(+), 75 deletions(-)
> 
>
Mark Rutland Jan. 24, 2023, 5:30 p.m. UTC | #2
On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> Hi Peter,
> 
> On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> 
> Do you have a link toe the splat somewhere?
> 
> I'm assuming that this is partially generic, and I'd like to make sure I test
> the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> selftests...

Hmm... with the tip sched/core branch, with or without this series applied atop
I see a couple of splats which I don't see with v6.2-rc1 (which seems to be
entirely clean). I'm not seeing any other splats.

I can trigger those reliably with the 'toplevel-enable.tc' ftrace test:

  ./ftracetest test.d/event/toplevel-enable.tc

Splats below; I'll dig into this a bit more tomorrow.

[   65.729252] ------------[ cut here ]------------
[   65.730397] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70
[   65.732450] Modules linked in:
[   65.733204] CPU: 3 PID: 1162 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #2
[   65.735165] Hardware name: linux,dummy-virt (DT)
[   65.736278] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   65.737929] pc : trace_preempt_on+0x68/0x70
[   65.738962] lr : preempt_count_sub+0xb4/0xf0
[   65.739998] sp : ffff80000e03ba70
[   65.740818] x29: ffff80000e03ba70 x28: ffff80000add07e8 x27: ffff800009d0b548
[   65.742531] x26: ffff00000742dd10 x25: ffff00000742dd00 x24: ffff80000ade11d0
[   65.744246] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff8000080a5cf4
[   65.745957] x20: ffff8000080a5cf4 x19: 0000000000000001 x18: 0000000000000000
[   65.747677] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
[   65.749388] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
[   65.751105] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5cf4
[   65.752820] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000001
[   65.754526] x5 : ffff80000a8e14e8 x4 : 0000000000000003 x3 : 0000000000000000
[   65.756244] x2 : 0000000000000001 x1 : ffff8000080a5cf4 x0 : ffff8000080a5cf4
[   65.757957] Call trace:
[   65.758572]  trace_preempt_on+0x68/0x70
[   65.759520]  preempt_count_sub+0xb4/0xf0
[   65.760477]  percpu_up_read.constprop.0+0xc4/0x180
[   65.761639]  cpus_read_unlock+0x18/0x24
[   65.762579]  static_key_enable+0x2c/0x40
[   65.763572]  tracepoint_add_func+0x330/0x3dc
[   65.764611]  tracepoint_probe_register+0x74/0xc0
[   65.765725]  trace_event_reg+0x8c/0xa0
[   65.766642]  __ftrace_event_enable_disable+0x174/0x4d0
[   65.767884]  __ftrace_set_clr_event_nolock+0xe0/0x150
[   65.769109]  ftrace_set_clr_event+0x90/0x13c
[   65.770143]  ftrace_event_write+0xd4/0x120
[   65.771145]  vfs_write+0xcc/0x2f0
[   65.771964]  ksys_write+0x78/0x110
[   65.772803]  __arm64_sys_write+0x24/0x30
[   65.773763]  invoke_syscall+0x50/0x120
[   65.774681]  el0_svc_common.constprop.0+0x68/0x124
[   65.775848]  do_el0_svc+0x40/0xbc
[   65.776669]  el0_svc+0x48/0xc0
[   65.777426]  el0t_64_sync_handler+0xf4/0x120
[   65.778459]  el0t_64_sync+0x190/0x194
[   65.779365] irq event stamp: 69686
[   65.780199] hardirqs last  enabled at (69685): [<ffff8000092d5664>] _raw_spin_unlock_irqrestore+0x80/0xa0
[   65.782457] hardirqs last disabled at (69686): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
[   65.784315] softirqs last  enabled at (69622): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
[   65.786309] softirqs last disabled at (69613): [<ffff800008017288>] ____do_softirq+0x18/0x24
[   65.788332] ---[ end trace 0000000000000000 ]---
[   65.789588] ------------[ cut here ]------------
[   65.790622] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x68/0xb0
[   65.792698] Modules linked in:
[   65.793465] CPU: 3 PID: 1162 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #2
[   65.795780] Hardware name: linux,dummy-virt (DT)
[   65.796898] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   65.798555] pc : trace_preempt_off+0x68/0xb0
[   65.799602] lr : preempt_count_add+0xa0/0xc0
[   65.800646] sp : ffff80000e03ba80
[   65.801465] x29: ffff80000e03ba80 x28: ffff80000add07e8 x27: ffff800009d0b558
[   65.803185] x26: ffff00000742dd90 x25: ffff00000742dd80 x24: ffff80000ade1188
[   65.804900] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff80000b8b7d18
[   65.806612] x20: ffff8000080a5c68 x19: ffff8000080a5c68 x18: 0000000000000000
[   65.808334] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
[   65.810041] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
[   65.811755] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5c68
[   65.813460] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000003
[   65.815174] x5 : 0000000030b5c3ca x4 : 0000000000000003 x3 : 0000000000000000
[   65.816886] x2 : 0000000000000001 x1 : ffff8000080a5c68 x0 : ffff8000080a5c68
[   65.818592] Call trace:
[   65.819216]  trace_preempt_off+0x68/0xb0
[   65.820171]  preempt_count_add+0xa0/0xc0
[   65.821131]  percpu_up_read.constprop.0+0x38/0x180
[   65.822288]  cpus_read_unlock+0x18/0x24
[   65.823236]  static_key_enable+0x2c/0x40
[   65.824194]  tracepoint_add_func+0x330/0x3dc
[   65.825236]  tracepoint_probe_register+0x74/0xc0
[   65.826351]  trace_event_reg+0x8c/0xa0
[   65.827276]  __ftrace_event_enable_disable+0x174/0x4d0
[   65.828506]  __ftrace_set_clr_event_nolock+0xe0/0x150
[   65.829721]  ftrace_set_clr_event+0x90/0x13c
[   65.830769]  ftrace_event_write+0xd4/0x120
[   65.831766]  vfs_write+0xcc/0x2f0
[   65.832581]  ksys_write+0x78/0x110
[   65.833422]  __arm64_sys_write+0x24/0x30
[   65.834376]  invoke_syscall+0x50/0x120
[   65.835300]  el0_svc_common.constprop.0+0x68/0x124
[   65.836451]  do_el0_svc+0x40/0xbc
[   65.837290]  el0_svc+0x48/0xc0
[   65.838054]  el0t_64_sync_handler+0xf4/0x120
[   65.839102]  el0t_64_sync+0x190/0x194
[   65.840006] irq event stamp: 69710
[   65.840845] hardirqs last  enabled at (69709): [<ffff8000092c4028>] el1_dbg+0x78/0x90
[   65.842699] hardirqs last disabled at (69710): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
[   65.844568] softirqs last  enabled at (69694): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
[   65.846573] softirqs last disabled at (69689): [<ffff800008017288>] ____do_softirq+0x18/0x24
[   65.848578] ---[ end trace 0000000000000000 ]---

Thanks,
Mark.
Mark Rutland Jan. 24, 2023, 6:39 p.m. UTC | #3
On Tue, Jan 24, 2023 at 05:30:29PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > Hi Peter,
> > 
> > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > 
> > Do you have a link toe the splat somewhere?
> > 
> > I'm assuming that this is partially generic, and I'd like to make sure I test
> > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > selftests...
> 
> Hmm... with the tip sched/core branch, with or without this series applied atop
> I see a couple of splats which I don't see with v6.2-rc1 (which seems to be
> entirely clean). I'm not seeing any other splats.
> 
> I can trigger those reliably with the 'toplevel-enable.tc' ftrace test:
> 
>   ./ftracetest test.d/event/toplevel-enable.tc
> 
> Splats below; I'll dig into this a bit more tomorrow.
> 
> [   65.729252] ------------[ cut here ]------------
> [   65.730397] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70

The line number here is a bit inscrutable, but a bisect led me down to commit

  408b961146be4c1a ("tracing: WARN on rcuidle")

... and it appears this must be the RCUIDLE_COND() warning that commit adds, and
that seems to be because trace_preempt_on() calls trace_preempt_enable_rcuidle():

| void trace_preempt_on(unsigned long a0, unsigned long a1)
| {
|         if (!in_nmi())
|                 trace_preempt_enable_rcuidle(a0, a1);
|         tracer_preempt_on(a0, a1);
| }

It looks like that tracing is dependent upon CONFIG_TRACE_PREEMPT_TOGGLE, and I
have that because I enabled CONFIG_PREEMPT_TRACER. I reckon the same should
happen on x86 with CONFIG_PREEMPT_TRACER=y.

IIUC we'll need to clean up that trace_.*_rcuidle() usage too, but I'm not
entirely sure how to do that.

Thanks,
Mark.

> [   65.732450] Modules linked in:
> [   65.733204] CPU: 3 PID: 1162 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #2
> [   65.735165] Hardware name: linux,dummy-virt (DT)
> [   65.736278] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   65.737929] pc : trace_preempt_on+0x68/0x70
> [   65.738962] lr : preempt_count_sub+0xb4/0xf0
> [   65.739998] sp : ffff80000e03ba70
> [   65.740818] x29: ffff80000e03ba70 x28: ffff80000add07e8 x27: ffff800009d0b548
> [   65.742531] x26: ffff00000742dd10 x25: ffff00000742dd00 x24: ffff80000ade11d0
> [   65.744246] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff8000080a5cf4
> [   65.745957] x20: ffff8000080a5cf4 x19: 0000000000000001 x18: 0000000000000000
> [   65.747677] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> [   65.749388] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
> [   65.751105] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5cf4
> [   65.752820] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000001
> [   65.754526] x5 : ffff80000a8e14e8 x4 : 0000000000000003 x3 : 0000000000000000
> [   65.756244] x2 : 0000000000000001 x1 : ffff8000080a5cf4 x0 : ffff8000080a5cf4
> [   65.757957] Call trace:
> [   65.758572]  trace_preempt_on+0x68/0x70
> [   65.759520]  preempt_count_sub+0xb4/0xf0
> [   65.760477]  percpu_up_read.constprop.0+0xc4/0x180
> [   65.761639]  cpus_read_unlock+0x18/0x24
> [   65.762579]  static_key_enable+0x2c/0x40
> [   65.763572]  tracepoint_add_func+0x330/0x3dc
> [   65.764611]  tracepoint_probe_register+0x74/0xc0
> [   65.765725]  trace_event_reg+0x8c/0xa0
> [   65.766642]  __ftrace_event_enable_disable+0x174/0x4d0
> [   65.767884]  __ftrace_set_clr_event_nolock+0xe0/0x150
> [   65.769109]  ftrace_set_clr_event+0x90/0x13c
> [   65.770143]  ftrace_event_write+0xd4/0x120
> [   65.771145]  vfs_write+0xcc/0x2f0
> [   65.771964]  ksys_write+0x78/0x110
> [   65.772803]  __arm64_sys_write+0x24/0x30
> [   65.773763]  invoke_syscall+0x50/0x120
> [   65.774681]  el0_svc_common.constprop.0+0x68/0x124
> [   65.775848]  do_el0_svc+0x40/0xbc
> [   65.776669]  el0_svc+0x48/0xc0
> [   65.777426]  el0t_64_sync_handler+0xf4/0x120
> [   65.778459]  el0t_64_sync+0x190/0x194
> [   65.779365] irq event stamp: 69686
> [   65.780199] hardirqs last  enabled at (69685): [<ffff8000092d5664>] _raw_spin_unlock_irqrestore+0x80/0xa0
> [   65.782457] hardirqs last disabled at (69686): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
> [   65.784315] softirqs last  enabled at (69622): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
> [   65.786309] softirqs last disabled at (69613): [<ffff800008017288>] ____do_softirq+0x18/0x24
> [   65.788332] ---[ end trace 0000000000000000 ]---
> [   65.789588] ------------[ cut here ]------------
> [   65.790622] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x68/0xb0
> [   65.792698] Modules linked in:
> [   65.793465] CPU: 3 PID: 1162 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #2
> [   65.795780] Hardware name: linux,dummy-virt (DT)
> [   65.796898] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   65.798555] pc : trace_preempt_off+0x68/0xb0
> [   65.799602] lr : preempt_count_add+0xa0/0xc0
> [   65.800646] sp : ffff80000e03ba80
> [   65.801465] x29: ffff80000e03ba80 x28: ffff80000add07e8 x27: ffff800009d0b558
> [   65.803185] x26: ffff00000742dd90 x25: ffff00000742dd80 x24: ffff80000ade1188
> [   65.804900] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff80000b8b7d18
> [   65.806612] x20: ffff8000080a5c68 x19: ffff8000080a5c68 x18: 0000000000000000
> [   65.808334] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> [   65.810041] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
> [   65.811755] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5c68
> [   65.813460] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000003
> [   65.815174] x5 : 0000000030b5c3ca x4 : 0000000000000003 x3 : 0000000000000000
> [   65.816886] x2 : 0000000000000001 x1 : ffff8000080a5c68 x0 : ffff8000080a5c68
> [   65.818592] Call trace:
> [   65.819216]  trace_preempt_off+0x68/0xb0
> [   65.820171]  preempt_count_add+0xa0/0xc0
> [   65.821131]  percpu_up_read.constprop.0+0x38/0x180
> [   65.822288]  cpus_read_unlock+0x18/0x24
> [   65.823236]  static_key_enable+0x2c/0x40
> [   65.824194]  tracepoint_add_func+0x330/0x3dc
> [   65.825236]  tracepoint_probe_register+0x74/0xc0
> [   65.826351]  trace_event_reg+0x8c/0xa0
> [   65.827276]  __ftrace_event_enable_disable+0x174/0x4d0
> [   65.828506]  __ftrace_set_clr_event_nolock+0xe0/0x150
> [   65.829721]  ftrace_set_clr_event+0x90/0x13c
> [   65.830769]  ftrace_event_write+0xd4/0x120
> [   65.831766]  vfs_write+0xcc/0x2f0
> [   65.832581]  ksys_write+0x78/0x110
> [   65.833422]  __arm64_sys_write+0x24/0x30
> [   65.834376]  invoke_syscall+0x50/0x120
> [   65.835300]  el0_svc_common.constprop.0+0x68/0x124
> [   65.836451]  do_el0_svc+0x40/0xbc
> [   65.837290]  el0_svc+0x48/0xc0
> [   65.838054]  el0t_64_sync_handler+0xf4/0x120
> [   65.839102]  el0t_64_sync+0x190/0x194
> [   65.840006] irq event stamp: 69710
> [   65.840845] hardirqs last  enabled at (69709): [<ffff8000092c4028>] el1_dbg+0x78/0x90
> [   65.842699] hardirqs last disabled at (69710): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
> [   65.844568] softirqs last  enabled at (69694): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
> [   65.846573] softirqs last disabled at (69689): [<ffff800008017288>] ____do_softirq+0x18/0x24
> [   65.848578] ---[ end trace 0000000000000000 ]---
> 
> Thanks,
> Mark.
Peter Zijlstra Jan. 25, 2023, 9:31 a.m. UTC | #4
On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> Hi Peter,
> 
> On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> 
> Do you have a link toe the splat somewhere?
> 
> I'm assuming that this is partially generic, and I'd like to make sure I test
> the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> selftests...

0-day triggered this by running tools/testing/selftests/ftrace/ftracetest,
which is what I've been using to reproduce.

If that doesn't work for you I can try to dig out the 0day email to see
if it has more details.
Peter Zijlstra Jan. 25, 2023, 9:35 a.m. UTC | #5
On Tue, Jan 24, 2023 at 06:39:12PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 05:30:29PM +0000, Mark Rutland wrote:
> > On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > > Hi Peter,
> > > 
> > > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > > 
> > > Do you have a link toe the splat somewhere?
> > > 
> > > I'm assuming that this is partially generic, and I'd like to make sure I test
> > > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > > selftests...
> > 
> > Hmm... with the tip sched/core branch, with or without this series applied atop
> > I see a couple of splats which I don't see with v6.2-rc1 (which seems to be
> > entirely clean). I'm not seeing any other splats.
> > 
> > I can trigger those reliably with the 'toplevel-enable.tc' ftrace test:
> > 
> >   ./ftracetest test.d/event/toplevel-enable.tc
> > 
> > Splats below; I'll dig into this a bit more tomorrow.
> > 
> > [   65.729252] ------------[ cut here ]------------
> > [   65.730397] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70
> 
> The line number here is a bit inscrutible, but a bisect led me down to commit
> 
>   408b961146be4c1a ("tracing: WARN on rcuidle")
> 
> ... and it appears this must be the RCUIDLE_COND() warning that adds, and that
> seems to be because trace_preempt_on() calls trace_preempt_enable_rcuidle():
> 
> | void trace_preempt_on(unsigned long a0, unsigned long a1)
> | {
> |         if (!in_nmi())
> |                 trace_preempt_enable_rcuidle(a0, a1);
> |         tracer_preempt_on(a0, a1);
> | }
> 
> It looks like that tracing is dependend upon CONFIG_TRACE_PREEMPT_TOGGLE, and I
> have that because I enabled CONFIG_PREEMPT_TRACER. I reckon the same should
> happen on x86 with CONFIG_PREEMPT_TRACER=y.
> 
> IIUC we'll need to clean up that trace_.*_rcuidle() usage too, but I'm not
> entirely sure how to do that.

tip/sched/core contains the following patch addressing this:

---
commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
Author: Peter Zijlstra <peterz@infradead.org>
Date:   Thu Jan 12 20:43:49 2023 +0100

    tracing, hardirq: No moar _rcuidle() tracing
    
    Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
    _rcuidle() tracepoint through local_irq_{en,dis}able().
    
    For 'sane' configs, these calls will only happen with RCU enabled and
    as such can use the regular tracepoint. This also means it's possible
    to trace them from NMI context again.
    
    Reported-by: kernel test robot <lkp@intel.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Signed-off-by: Ingo Molnar <mingo@kernel.org>
    Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org

diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 629f2854e12b..f992444a0b1f 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -19,6 +19,20 @@
 /* Per-cpu variable to prevent redundant calls when IRQs already off */
 static DEFINE_PER_CPU(int, tracing_irq_cpu);
 
+/*
+ * Use regular trace points on architectures that implement noinstr
+ * tooling: these calls will only happen with RCU enabled, which can
+ * use a regular tracepoint.
+ *
+ * On older architectures, use the rcuidle tracing methods (which
+ * aren't NMI-safe - so exclude NMI contexts):
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define trace(point)	trace_##point
+#else
+#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
+#endif
+
 /*
  * Like trace_hardirqs_on() but without the lockdep invocation. This is
  * used in the low level entry code where the ordering vs. RCU is important
@@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
 void trace_hardirqs_on_prepare(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
 void trace_hardirqs_on(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		if (!in_nmi())
-			trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
 	}
 
 }
@@ -78,8 +89,7 @@ void trace_hardirqs_off(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		if (!in_nmi())
-			trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
 	}
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
Mark Rutland Jan. 25, 2023, 9:36 a.m. UTC | #6
On Wed, Jan 25, 2023 at 10:31:41AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > Hi Peter,
> > 
> > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > 
> > Do you have a link toe the splat somewhere?
> > 
> > I'm assuming that this is partially generic, and I'd like to make sure I test
> > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > selftests...
> 
> 0-day triggered this by running tools/testing/selftests/ftrace/ftracetest,
> which is what I've been using to reproduce.
> 
> If that don't work for you I can try and dig out the 0day email to see
> if it has more details on.

I had a go running those on arm64, but got different splats (as per my other
replies), so I just wanted to see the backtraces and/or config to check I
wasn't missing something due to CONFIG_* or arch differences.

If you have the email to hand, that'd be great, but don't worry too much about
it!

Thanks,
Mark.
Peter Zijlstra Jan. 25, 2023, 9:40 a.m. UTC | #7
On Wed, Jan 25, 2023 at 10:35:16AM +0100, Peter Zijlstra wrote:
> tip/sched/core contains the following patch addressing this:
> 
> ---
> commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
> Author: Peter Zijlstra <peterz@infradead.org>
> Date:   Thu Jan 12 20:43:49 2023 +0100
> 
>     tracing, hardirq: No moar _rcuidle() tracing
>     
>     Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
>     _rcuidle() tracepoint through local_irq_{en,dis}able().
>     
>     For 'sane' configs, these calls will only happen with RCU enabled and
>     as such can use the regular tracepoint. This also means it's possible
>     to trace them from NMI context again.
>     
>     Reported-by: kernel test robot <lkp@intel.com>
>     Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>     Signed-off-by: Ingo Molnar <mingo@kernel.org>
>     Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org
> 
> diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> index 629f2854e12b..f992444a0b1f 100644
> --- a/kernel/trace/trace_preemptirq.c
> +++ b/kernel/trace/trace_preemptirq.c
> @@ -19,6 +19,20 @@
>  /* Per-cpu variable to prevent redundant calls when IRQs already off */
>  static DEFINE_PER_CPU(int, tracing_irq_cpu);
>  
> +/*
> + * Use regular trace points on architectures that implement noinstr
> + * tooling: these calls will only happen with RCU enabled, which can
> + * use a regular tracepoint.
> + *
> + * On older architectures, use the rcuidle tracing methods (which
> + * aren't NMI-safe - so exclude NMI contexts):
> + */
> +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> +#define trace(point)	trace_##point
> +#else
> +#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
> +#endif
> +
>  /*
>   * Like trace_hardirqs_on() but without the lockdep invocation. This is
>   * used in the low level entry code where the ordering vs. RCU is important

For some reason I missed the trace_preempt_{on,off} things, so that then
gets the below on top or so.

diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f992444a0b1f..ea96b41c8838 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -100,15 +100,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
 
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_enable_rcuidle(a0, a1);
+	trace(preempt_enable)(a0, a1);
 	tracer_preempt_on(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_disable_rcuidle(a0, a1);
+	trace(preempt_disable)(a0, a1);
 	tracer_preempt_off(a0, a1);
 }
 #endif
Mark Rutland Jan. 25, 2023, 10:23 a.m. UTC | #8
On Wed, Jan 25, 2023 at 10:40:17AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 25, 2023 at 10:35:16AM +0100, Peter Zijlstra wrote:
> > tip/sched/core contains the following patch addressing this:
> > 
> > ---
> > commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
> > Author: Peter Zijlstra <peterz@infradead.org>
> > Date:   Thu Jan 12 20:43:49 2023 +0100
> > 
> >     tracing, hardirq: No moar _rcuidle() tracing
> >     
> >     Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
> >     _rcuidle() tracepoint through local_irq_{en,dis}able().
> >     
> >     For 'sane' configs, these calls will only happen with RCU enabled and
> >     as such can use the regular tracepoint. This also means it's possible
> >     to trace them from NMI context again.
> >     
> >     Reported-by: kernel test robot <lkp@intel.com>
> >     Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> >     Signed-off-by: Ingo Molnar <mingo@kernel.org>
> >     Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org
> > 
> > diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> > index 629f2854e12b..f992444a0b1f 100644
> > --- a/kernel/trace/trace_preemptirq.c
> > +++ b/kernel/trace/trace_preemptirq.c
> > @@ -19,6 +19,20 @@
> >  /* Per-cpu variable to prevent redundant calls when IRQs already off */
> >  static DEFINE_PER_CPU(int, tracing_irq_cpu);
> >  
> > +/*
> > + * Use regular trace points on architectures that implement noinstr
> > + * tooling: these calls will only happen with RCU enabled, which can
> > + * use a regular tracepoint.
> > + *
> > + * On older architectures, use the rcuidle tracing methods (which
> > + * aren't NMI-safe - so exclude NMI contexts):
> > + */
> > +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> > +#define trace(point)	trace_##point
> > +#else
> > +#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
> > +#endif
> > +
> >  /*
> >   * Like trace_hardirqs_on() but without the lockdep invocation. This is
> >   * used in the low level entry code where the ordering vs. RCU is important
> 
> For some reason I missed the trace_preempt_{on,off} things, so that then
> gets the below on top or so.
> 
> diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> index f992444a0b1f..ea96b41c8838 100644
> --- a/kernel/trace/trace_preemptirq.c
> +++ b/kernel/trace/trace_preemptirq.c
> @@ -100,15 +100,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
>  
>  void trace_preempt_on(unsigned long a0, unsigned long a1)
>  {
> -	if (!in_nmi())
> -		trace_preempt_enable_rcuidle(a0, a1);
> +	trace(preempt_enable)(a0, a1);
>  	tracer_preempt_on(a0, a1);
>  }
>  
>  void trace_preempt_off(unsigned long a0, unsigned long a1)
>  {
> -	if (!in_nmi())
> -		trace_preempt_disable_rcuidle(a0, a1);
> +	trace(preempt_disable)(a0, a1);
>  	tracer_preempt_off(a0, a1);
>  }
>  #endif

I've tested this fixlet atop this series (itself atop tip/sched/core) with a
full-fat ftrace config and the ftrace selftests, and that all runs cleanly.

FWIW, if you spin this as a patch:

Tested-by: Mark Rutland <mark.rutland@arm.com>

Without the fixlet I get splats on both arm64 and x86_64, e.g.

On arm64:

| ------------[ cut here ]------------
| WARNING: CPU: 1 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70
| Modules linked in:
| CPU: 1 PID: 1162 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #1
| Hardware name: linux,dummy-virt (DT)
| pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : trace_preempt_on+0x68/0x70
| lr : preempt_count_sub+0xb4/0xf0
| sp : ffff80000e04ba70
| x29: ffff80000e04ba70 x28: ffff80000ade09e8 x27: ffff800009d0c960
| x26: ffff000007c97e10 x25: ffff000007c97e00 x24: ffff80000adf1410
| x23: ffff80000e04bb80 x22: ffff80000a9aabb0 x21: ffff8000080a5cf4
| x20: ffff8000080a5cf4 x19: 0000000000000001 x18: 0000000000000000
| x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
| x14: 0000000000000028 x13: 0000000000004320 x12: 0000000000000361
| x11: 0000000000000361 x10: 0000000000040361 x9 : ffff8000080a5cf4
| x8 : ffff80000ae42a18 x7 : 0000000000000000 x6 : 0000000000000001
| x5 : ffff80000a8f14e8 x4 : 0000000000000001 x3 : 0000000000000000
| x2 : 0000000000000007 x1 : ffff8000080a5cf4 x0 : ffff8000080a5cf4
| Call trace:
|  trace_preempt_on+0x68/0x70
|  preempt_count_sub+0xb4/0xf0
|  percpu_up_read.constprop.0+0xc4/0x180
|  cpus_read_unlock+0x18/0x24
|  static_key_enable+0x2c/0x40
|  tracepoint_add_func+0x330/0x3dc
|  tracepoint_probe_register+0x74/0xc0
|  trace_event_reg+0x8c/0xa0
|  __ftrace_event_enable_disable+0x174/0x4d0
|  __ftrace_set_clr_event_nolock+0xe0/0x150
|  ftrace_set_clr_event+0x90/0x13c
|  ftrace_event_write+0xd4/0x120
|  vfs_write+0xcc/0x2f0
|  ksys_write+0x78/0x110
|  __arm64_sys_write+0x24/0x30
|  invoke_syscall+0x50/0x120
|  el0_svc_common.constprop.0+0x68/0x124
|  do_el0_svc+0x40/0xbc
|  el0_svc+0x48/0xc0
|  el0t_64_sync_handler+0xf4/0x120
|  el0t_64_sync+0x190/0x194
| irq event stamp: 69662
| hardirqs last  enabled at (69661): [<ffff8000092d63f4>] _raw_spin_unlock_irqrestore+0x80/0xa0
| hardirqs last disabled at (69662): [<ffff8000092c4d64>] el1_dbg+0x24/0x90
| softirqs last  enabled at (69564): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
| softirqs last disabled at (69555): [<ffff800008017288>] ____do_softirq+0x18/0x24
| ---[ end trace 0000000000000000 ]---
| ------------[ cut here ]------------
| WARNING: CPU: 1 PID: 1162 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x68/0xb0
| Modules linked in:
| CPU: 1 PID: 1162 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #1
| Hardware name: linux,dummy-virt (DT)
| pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : trace_preempt_off+0x68/0xb0
| lr : preempt_count_add+0xa0/0xc0
| sp : ffff80000e04ba80
| x29: ffff80000e04ba80 x28: ffff80000ade09e8 x27: ffff800009d0c970
| x26: ffff000007c97e90 x25: ffff000007c97e80 x24: ffff80000adf13c8
| x23: ffff80000e04bb80 x22: ffff80000a9aabb0 x21: ffff80000b8c8d18
| x20: ffff8000080a5c68 x19: ffff8000080a5c68 x18: 0000000000000000
| x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
| x14: 0000000000000028 x13: 0000000000004320 x12: 0000000000000361
| x11: 0000000000000361 x10: 0000000000040361 x9 : ffff8000080a5c68
| x8 : ffff80000ae42a18 x7 : 0000000000000000 x6 : 0000000000000003
| x5 : 000000003a9e6115 x4 : 0000000000000001 x3 : 0000000000000000
| x2 : 0000000000000007 x1 : ffff8000080a5c68 x0 : ffff8000080a5c68
| Call trace:
|  trace_preempt_off+0x68/0xb0
|  preempt_count_add+0xa0/0xc0
|  percpu_up_read.constprop.0+0x38/0x180
|  cpus_read_unlock+0x18/0x24
|  static_key_enable+0x2c/0x40
|  tracepoint_add_func+0x330/0x3dc
|  tracepoint_probe_register+0x74/0xc0
|  trace_event_reg+0x8c/0xa0
|  __ftrace_event_enable_disable+0x174/0x4d0
|  __ftrace_set_clr_event_nolock+0xe0/0x150
|  ftrace_set_clr_event+0x90/0x13c
|  ftrace_event_write+0xd4/0x120
|  vfs_write+0xcc/0x2f0
|  ksys_write+0x78/0x110
|  __arm64_sys_write+0x24/0x30
|  invoke_syscall+0x50/0x120
|  el0_svc_common.constprop.0+0x68/0x124
|  do_el0_svc+0x40/0xbc
|  el0_svc+0x48/0xc0
|  el0t_64_sync_handler+0xf4/0x120
|  el0t_64_sync+0x190/0x194
| irq event stamp: 69686
| hardirqs last  enabled at (69685): [<ffff8000092c4db8>] el1_dbg+0x78/0x90
| hardirqs last disabled at (69686): [<ffff8000092c4d64>] el1_dbg+0x24/0x90
| softirqs last  enabled at (69670): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
| softirqs last disabled at (69665): [<ffff800008017288>] ____do_softirq+0x18/0x24
| ---[ end trace 0000000000000000 ]---

On x86_64:

| ------------[ cut here ]------------
| WARNING: CPU: 0 PID: 1083 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x31/0x40
| Modules linked in:
| CPU: 0 PID: 1083 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #3
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
| RIP: 0010:trace_preempt_on+0x31/0x40
| Code: 1d d6 45 4c a9 00 00 f0 00 74 05 e9 29 11 00 00 cc 90 e9 22 11 00 00 65 8b 05 07 d6 45 4c 89 c0 48 0f a3 05 c1 af b1 01 73 e1 <0f> 0b eb dd 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 0
| RSP: 0018:ffffb374c0ce7ba8 EFLAGS: 00010247
| RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000
| RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffffb49e9c1d
| RBP: 0000000000000000 R08: 000000004ea10906 R09: 000000005e02388f
| R10: 00000000f5e02388 R11: 00000000d269a5f5 R12: ffff98620005c780
| R13: ffffffffb3a363d0 R14: 0000000000000a37 R15: 0000000000000a38
| FS:  00007fc812d836a0(0000) GS:ffff98623bc00000(0000) knlGS:0000000000000000
| CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: 0000000000642e58 CR3: 00000001039c6005 CR4: 0000000000370ef0
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
| DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
| Call Trace:
|  <TASK>
|  preempt_count_sub+0xa3/0xe0
|  _raw_spin_unlock+0x2d/0x50
|  ? trace_preempt_on+0x17/0x40
|  __text_poke+0x326/0x4c0
|  ? trace_preempt_on+0x17/0x40
|  text_poke_bp_batch+0x7f/0x360
|  text_poke_finish+0x1f/0x30
|  arch_jump_label_transform_apply+0x1c/0x30
|  static_key_enable_cpuslocked+0x65/0xa0
|  static_key_enable+0x1a/0x20
|  tracepoint_add_func+0x32c/0x430
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  tracepoint_probe_register+0x78/0xb0
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  __ftrace_event_enable_disable+0x180/0x250
|  __ftrace_set_clr_event_nolock+0xe3/0x130
|  ftrace_set_clr_event+0x74/0xf0
|  ftrace_event_write+0xdd/0x110
|  vfs_write+0xee/0x510
|  ksys_write+0x75/0x100
|  do_syscall_64+0x3e/0x90
|  entry_SYSCALL_64_after_hwframe+0x72/0xdc
| RIP: 0033:0x7fc812d09103
| Code: 8b 7c 24 08 89 c5 e8 c5 ff ff ff 89 ef 89 44 24 08 e8 81 bc 02 00 8b 44 24 08 48 83 c4 10 5d c3 48 63 ff b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 76 10 48 8b 15 4e fd 05 00 f7 d8 64 3
| RSP: 002b:00007ffcb12972f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
| RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc812d09103
| RDX: 0000000000000004 RSI: 0000000000645050 RDI: 0000000000000001
| RBP: 0000000000645050 R08: fefefefefefefeff R09: ffffffff00000000
| R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000004
| R13: 00007fc812d83690 R14: 0000000000000001 R15: 0000000000000000
|  </TASK>
| irq event stamp: 57597
| hardirqs last  enabled at (57609): [<ffffffffb3b1384e>] __up_console_sem+0x5e/0x80
| hardirqs last disabled at (57620): [<ffffffffb3b13833>] __up_console_sem+0x43/0x80
| softirqs last  enabled at (57556): [<ffffffffb49eae94>] __do_softirq+0x354/0x4d7
| softirqs last disabled at (57543): [<ffffffffb3a92ad7>] irq_exit_rcu+0xc7/0x140
| ---[ end trace 0000000000000000 ]---
| ------------[ cut here ]------------
| WARNING: CPU: 0 PID: 1083 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x31/0x40
| Modules linked in:
| CPU: 0 PID: 1083 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #3
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
| RIP: 0010:trace_preempt_off+0x31/0x40
| Code: cd d5 45 4c a9 00 00 f0 00 74 05 e9 f9 11 00 00 cc 90 e9 f2 11 00 00 65 8b 05 b7 d5 45 4c 89 c0 48 0f a3 05 71 af b1 01 73 e1 <0f> 0b eb dd 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 0
| RSP: 0018:ffffb374c0ce7c18 EFLAGS: 00010247
| RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001
| RDX: 0000000000000000 RSI: ffffffffb3b74639 RDI: ffffffffb3b74639
| RBP: ffffffffb56faa10 R08: ffffffffb56faa10 R09: 000000005e02388f
| R10: 00000000f5e02388 R11: 00000000d269a5f5 R12: ffffffffb3a36d20
| R13: 0000000000000000 R14: 0000000000000000 R15: ffff9862038a0780
| FS:  00007fc812d836a0(0000) GS:ffff98623bc00000(0000) knlGS:0000000000000000
| CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: 0000000000642e58 CR3: 00000001039c6005 CR4: 0000000000370ef0
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
| DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
| Call Trace:
|  <TASK>
|  on_each_cpu_cond_mask+0x29/0x80
|  text_poke_bp_batch+0xe7/0x360
|  text_poke_finish+0x1f/0x30
|  arch_jump_label_transform_apply+0x1c/0x30
|  static_key_enable_cpuslocked+0x65/0xa0
|  static_key_enable+0x1a/0x20
|  tracepoint_add_func+0x32c/0x430
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  tracepoint_probe_register+0x78/0xb0
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  __ftrace_event_enable_disable+0x180/0x250
|  __ftrace_set_clr_event_nolock+0xe3/0x130
|  ftrace_set_clr_event+0x74/0xf0
|  ftrace_event_write+0xdd/0x110
|  vfs_write+0xee/0x510
|  ksys_write+0x75/0x100
|  do_syscall_64+0x3e/0x90
|  entry_SYSCALL_64_after_hwframe+0x72/0xdc
| RIP: 0033:0x7fc812d09103
| Code: 8b 7c 24 08 89 c5 e8 c5 ff ff ff 89 ef 89 44 24 08 e8 81 bc 02 00 8b 44 24 08 48 83 c4 10 5d c3 48 63 ff b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 76 10 48 8b 15 4e fd 05 00 f7 d8 64 3
| RSP: 002b:00007ffcb12972f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
| RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc812d09103
| RDX: 0000000000000004 RSI: 0000000000645050 RDI: 0000000000000001
| RBP: 0000000000645050 R08: fefefefefefefeff R09: ffffffff00000000
| R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000004
| R13: 00007fc812d83690 R14: 0000000000000001 R15: 0000000000000000
|  </TASK>
| irq event stamp: 58293
| hardirqs last  enabled at (58303): [<ffffffffb3b1384e>] __up_console_sem+0x5e/0x80
| hardirqs last disabled at (58314): [<ffffffffb3b13833>] __up_console_sem+0x43/0x80
| softirqs last  enabled at (57820): [<ffffffffb49eae94>] __do_softirq+0x354/0x4d7
| softirqs last disabled at (57811): [<ffffffffb3a92ad7>] irq_exit_rcu+0xc7/0x140
| ---[ end trace 0000000000000000 ]---

Thanks,
Mark.
Mark Rutland Jan. 25, 2023, 3:20 p.m. UTC | #9
Hi Peter,

On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> 
> These patches appear to cure this, the ftrace selftest now runs to completion
> without spamming scary messages to dmesg.

In addition to the other bits for arm64, we'll need the following patch. Are
you happy to add that to the start of this series?

I've tested this on an arm64 Juno board with a full-fat ftrace config,
CONFIG_PROVE_LOCKING + CONFIG_DEBUG_LOCKDEP, and CONFIG_DEBUG_VIRTUAL=y, and
build tested for 32-bit arm.

Thanks,
Mark.

---->8----
From 30ab9eba19e952cb51c9f599d2ac9b8a302cb63d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 25 Jan 2023 14:20:49 +0000
Subject: [PATCH] drivers: firmware: psci: don't instrument suspend code

The PSCI suspend code is currently instrumentable, which is not safe as
instrumentation (e.g. ftrace) may try to make use of RCU during idle
periods when RCU is not watching.

To fix this we need to ensure that psci_suspend_finisher() and anything
it calls are not instrumented. We can do this fairly simply by marking
psci_suspend_finisher() and the psci*_cpu_suspend() functions as
noinstr, and the underlying helper functions as __always_inline.

When CONFIG_DEBUG_VIRTUAL=y, __pa_symbol() can expand to an out-of-line
instrumented function, so we must use __pa_symbol_nodebug() within
psci_suspend_finisher().

The raw SMCCC invocation functions are written in assembly, and are not
subject to compiler instrumentation.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Lorenzo Pieralisi <lpieralisi@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 drivers/firmware/psci/psci.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index f3a044fa4652a..c12847b4736de 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -108,9 +108,10 @@ bool psci_power_state_is_valid(u32 state)
 	return !(state & ~valid_mask);
 }
 
-static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
-			unsigned long arg0, unsigned long arg1,
-			unsigned long arg2)
+static __always_inline unsigned long
+__invoke_psci_fn_hvc(unsigned long function_id,
+		     unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2)
 {
 	struct arm_smccc_res res;
 
@@ -118,9 +119,10 @@ static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
 	return res.a0;
 }
 
-static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
-			unsigned long arg0, unsigned long arg1,
-			unsigned long arg2)
+static __always_inline unsigned long
+__invoke_psci_fn_smc(unsigned long function_id,
+		     unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2)
 {
 	struct arm_smccc_res res;
 
@@ -128,7 +130,7 @@ static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
 	return res.a0;
 }
 
-static int psci_to_linux_errno(int errno)
+static __always_inline int psci_to_linux_errno(int errno)
 {
 	switch (errno) {
 	case PSCI_RET_SUCCESS:
@@ -169,7 +171,8 @@ int psci_set_osi_mode(bool enable)
 	return psci_to_linux_errno(err);
 }
 
-static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
+static __always_inline int
+__psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
 {
 	int err;
 
@@ -177,13 +180,15 @@ static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
 	return psci_to_linux_errno(err);
 }
 
-static int psci_0_1_cpu_suspend(u32 state, unsigned long entry_point)
+static __always_inline int
+psci_0_1_cpu_suspend(u32 state, unsigned long entry_point)
 {
 	return __psci_cpu_suspend(psci_0_1_function_ids.cpu_suspend,
 				  state, entry_point);
 }
 
-static int psci_0_2_cpu_suspend(u32 state, unsigned long entry_point)
+static __always_inline int
+psci_0_2_cpu_suspend(u32 state, unsigned long entry_point)
 {
 	return __psci_cpu_suspend(PSCI_FN_NATIVE(0_2, CPU_SUSPEND),
 				  state, entry_point);
@@ -447,10 +452,12 @@ late_initcall(psci_debugfs_init)
 #endif
 
 #ifdef CONFIG_CPU_IDLE
-static int psci_suspend_finisher(unsigned long state)
+static noinstr int psci_suspend_finisher(unsigned long state)
 {
 	u32 power_state = state;
-	phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume);
+	phys_addr_t pa_cpu_resume;
+
+	pa_cpu_resume = __pa_symbol_nodebug((unsigned long)cpu_resume);
 
 	return psci_ops.cpu_suspend(power_state, pa_cpu_resume);
 }