diff mbox series

[RFC] perf: fix panic by mark recursion inside perf_log_throttle

Message ID ff979a43-045a-dc56-64d1-2c31dd4db381@linux.alibaba.com (mailing list archive)
State RFC
Headers show
Series [RFC] perf: fix panic by mark recursion inside perf_log_throttle | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

王贇 Sept. 9, 2021, 3:13 a.m. UTC
When running with ftrace function enabled, we observed panic
as below:

  traps: PANIC: double fault, error_code: 0x0
  [snip]
  RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
  [snip]
  Call Trace:
   <NMI>
   perf_trace_buf_alloc+0x26/0xd0
   perf_ftrace_function_call+0x18f/0x2e0
   kernelmode_fixup_or_oops+0x5/0x120
   __bad_area_nosemaphore+0x1b8/0x280
   do_user_addr_fault+0x410/0x920
   exc_page_fault+0x92/0x300
   asm_exc_page_fault+0x1e/0x30
  RIP: 0010:__get_user_nocheck_8+0x6/0x13
   perf_callchain_user+0x266/0x2f0
   get_perf_callchain+0x194/0x210
   perf_callchain+0xa3/0xc0
   perf_prepare_sample+0xa5/0xa60
   perf_event_output_forward+0x7b/0x1b0
   __perf_event_overflow+0x67/0x120
   perf_swevent_overflow+0xcb/0x110
   perf_swevent_event+0xb0/0xf0
   perf_tp_event+0x292/0x410
   perf_trace_run_bpf_submit+0x87/0xc0
   perf_trace_lock_acquire+0x12b/0x170
   lock_acquire+0x1bf/0x2e0
   perf_output_begin+0x70/0x4b0
   perf_log_throttle+0xe2/0x1a0
   perf_event_nmi_handler+0x30/0x50
   nmi_handle+0xba/0x2a0
   default_do_nmi+0x45/0xf0
   exc_nmi+0x155/0x170
   end_repeat_nmi+0x16/0x55

According to the trace we know the story is like this: the NMI
triggered perf IRQ throttling and called perf_log_throttle(),
which triggered the swevent overflow, and the overflow processing
did perf_callchain_user(), which triggered a user PF, and the PF
handling triggered perf ftrace, which finally led to a suspected
stack overflow.

This patch marks the context as in recursion during perf_log_throttle(),
so no more swevents occur during the process and no more panic.

Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
---
 kernel/events/core.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

Comments

王贇 Sept. 9, 2021, 6:10 a.m. UTC | #1
Reported-by: Abaci <abaci@linux.alibaba.com>

On 2021/9/9 上午11:13, 王贇 wrote:
> When running with ftrace function enabled, we observed panic
> as below:
> 
>   traps: PANIC: double fault, error_code: 0x0
>   [snip]
>   RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
>   [snip]
>   Call Trace:
>    <NMI>
>    perf_trace_buf_alloc+0x26/0xd0
>    perf_ftrace_function_call+0x18f/0x2e0
>    kernelmode_fixup_or_oops+0x5/0x120
>    __bad_area_nosemaphore+0x1b8/0x280
>    do_user_addr_fault+0x410/0x920
>    exc_page_fault+0x92/0x300
>    asm_exc_page_fault+0x1e/0x30
>   RIP: 0010:__get_user_nocheck_8+0x6/0x13
>    perf_callchain_user+0x266/0x2f0
>    get_perf_callchain+0x194/0x210
>    perf_callchain+0xa3/0xc0
>    perf_prepare_sample+0xa5/0xa60
>    perf_event_output_forward+0x7b/0x1b0
>    __perf_event_overflow+0x67/0x120
>    perf_swevent_overflow+0xcb/0x110
>    perf_swevent_event+0xb0/0xf0
>    perf_tp_event+0x292/0x410
>    perf_trace_run_bpf_submit+0x87/0xc0
>    perf_trace_lock_acquire+0x12b/0x170
>    lock_acquire+0x1bf/0x2e0
>    perf_output_begin+0x70/0x4b0
>    perf_log_throttle+0xe2/0x1a0
>    perf_event_nmi_handler+0x30/0x50
>    nmi_handle+0xba/0x2a0
>    default_do_nmi+0x45/0xf0
>    exc_nmi+0x155/0x170
>    end_repeat_nmi+0x16/0x55
> 
> According to the trace we know the story is like this, the NMI
> triggered perf IRQ throttling and call perf_log_throttle(),
> which triggered the swevent overflow, and the overflow process
> do perf_callchain_user() which triggered a user PF, and the PF
> process triggered perf ftrace which finally lead into a suspected
> stack overflow.
> 
> This patch marking the context as recursion during perf_log_throttle()
> , so no more swevent during the process and no more panic.
> 
> Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
> ---
>  kernel/events/core.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 744e872..6063443 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -8716,6 +8716,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
>  	struct perf_output_handle handle;
>  	struct perf_sample_data sample;
>  	int ret;
> +	int rctx;
> 
>  	struct {
>  		struct perf_event_header	header;
> @@ -8738,14 +8739,17 @@ static void perf_log_throttle(struct perf_event *event, int enable)
> 
>  	perf_event_header__init_id(&throttle_event.header, &sample, event);
> 
> +	rctx = perf_swevent_get_recursion_context();
>  	ret = perf_output_begin(&handle, &sample, event,
>  				throttle_event.header.size);
> -	if (ret)
> -		return;
> +	if (!ret) {
> +		perf_output_put(&handle, throttle_event);
> +		perf_event__output_id_sample(event, &handle, &sample);
> +		perf_output_end(&handle);
> +	}
> 
> -	perf_output_put(&handle, throttle_event);
> -	perf_event__output_id_sample(event, &handle, &sample);
> -	perf_output_end(&handle);
> +	if (rctx >= 0)
> +		perf_swevent_put_recursion_context(rctx);
>  }
> 
>  /*
>
Peter Zijlstra Sept. 10, 2021, 3:38 p.m. UTC | #2
On Thu, Sep 09, 2021 at 11:13:21AM +0800, 王贇 wrote:
> When running with ftrace function enabled, we observed panic
> as below:
> 
>   traps: PANIC: double fault, error_code: 0x0
>   [snip]
>   RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
>   [snip]
>   Call Trace:
>    <NMI>
>    perf_trace_buf_alloc+0x26/0xd0
>    perf_ftrace_function_call+0x18f/0x2e0
>    kernelmode_fixup_or_oops+0x5/0x120
>    __bad_area_nosemaphore+0x1b8/0x280
>    do_user_addr_fault+0x410/0x920
>    exc_page_fault+0x92/0x300
>    asm_exc_page_fault+0x1e/0x30
>   RIP: 0010:__get_user_nocheck_8+0x6/0x13
>    perf_callchain_user+0x266/0x2f0
>    get_perf_callchain+0x194/0x210
>    perf_callchain+0xa3/0xc0
>    perf_prepare_sample+0xa5/0xa60
>    perf_event_output_forward+0x7b/0x1b0
>    __perf_event_overflow+0x67/0x120
>    perf_swevent_overflow+0xcb/0x110
>    perf_swevent_event+0xb0/0xf0
>    perf_tp_event+0x292/0x410
>    perf_trace_run_bpf_submit+0x87/0xc0
>    perf_trace_lock_acquire+0x12b/0x170
>    lock_acquire+0x1bf/0x2e0
>    perf_output_begin+0x70/0x4b0
>    perf_log_throttle+0xe2/0x1a0
>    perf_event_nmi_handler+0x30/0x50
>    nmi_handle+0xba/0x2a0
>    default_do_nmi+0x45/0xf0
>    exc_nmi+0x155/0x170
>    end_repeat_nmi+0x16/0x55

kernel/events/Makefile has:

ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif

Which, afaict, should avoid the above, no?
王贇 Sept. 13, 2021, 3 a.m. UTC | #3
On 2021/9/10 下午11:38, Peter Zijlstra wrote:
> On Thu, Sep 09, 2021 at 11:13:21AM +0800, 王贇 wrote:
>> When running with ftrace function enabled, we observed panic
>> as below:
>>
>>   traps: PANIC: double fault, error_code: 0x0
>>   [snip]
>>   RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
>>   [snip]
>>   Call Trace:
>>    <NMI>
>>    perf_trace_buf_alloc+0x26/0xd0
>>    perf_ftrace_function_call+0x18f/0x2e0
>>    kernelmode_fixup_or_oops+0x5/0x120
>>    __bad_area_nosemaphore+0x1b8/0x280
>>    do_user_addr_fault+0x410/0x920
>>    exc_page_fault+0x92/0x300
>>    asm_exc_page_fault+0x1e/0x30
>>   RIP: 0010:__get_user_nocheck_8+0x6/0x13
>>    perf_callchain_user+0x266/0x2f0
>>    get_perf_callchain+0x194/0x210
>>    perf_callchain+0xa3/0xc0
>>    perf_prepare_sample+0xa5/0xa60
>>    perf_event_output_forward+0x7b/0x1b0
>>    __perf_event_overflow+0x67/0x120
>>    perf_swevent_overflow+0xcb/0x110
>>    perf_swevent_event+0xb0/0xf0
>>    perf_tp_event+0x292/0x410
>>    perf_trace_run_bpf_submit+0x87/0xc0
>>    perf_trace_lock_acquire+0x12b/0x170
>>    lock_acquire+0x1bf/0x2e0
>>    perf_output_begin+0x70/0x4b0
>>    perf_log_throttle+0xe2/0x1a0
>>    perf_event_nmi_handler+0x30/0x50
>>    nmi_handle+0xba/0x2a0
>>    default_do_nmi+0x45/0xf0
>>    exc_nmi+0x155/0x170
>>    end_repeat_nmi+0x16/0x55
> 
> kernel/events/Makefile has:
> 
> ifdef CONFIG_FUNCTION_TRACER
> CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
> endif
> 
> Which, afaict, should avoid the above, no?

I'm afraid that's not working for this case; the
start point of tracing is at lock_acquire(), which
is not from 'kernel/events/core', and the following
PF-related functions are also not from 'core', so
preventing ftrace on 'core' can't stop this from
happening...

Regards,
Michael Wang

>
王贇 Sept. 13, 2021, 3:21 a.m. UTC | #4
On 2021/9/13 上午11:00, 王贇 wrote:
> 
> 
> On 2021/9/10 下午11:38, Peter Zijlstra wrote:
>> On Thu, Sep 09, 2021 at 11:13:21AM +0800, 王贇 wrote:
>>> When running with ftrace function enabled, we observed panic
>>> as below:
>>>
>>>   traps: PANIC: double fault, error_code: 0x0
>>>   [snip]
>>>   RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
>>>   [snip]
>>>   Call Trace:
>>>    <NMI>
>>>    perf_trace_buf_alloc+0x26/0xd0
>>>    perf_ftrace_function_call+0x18f/0x2e0
>>>    kernelmode_fixup_or_oops+0x5/0x120
>>>    __bad_area_nosemaphore+0x1b8/0x280
>>>    do_user_addr_fault+0x410/0x920
>>>    exc_page_fault+0x92/0x300
>>>    asm_exc_page_fault+0x1e/0x30
>>>   RIP: 0010:__get_user_nocheck_8+0x6/0x13
>>>    perf_callchain_user+0x266/0x2f0
>>>    get_perf_callchain+0x194/0x210
>>>    perf_callchain+0xa3/0xc0
>>>    perf_prepare_sample+0xa5/0xa60
>>>    perf_event_output_forward+0x7b/0x1b0
>>>    __perf_event_overflow+0x67/0x120
>>>    perf_swevent_overflow+0xcb/0x110
>>>    perf_swevent_event+0xb0/0xf0
>>>    perf_tp_event+0x292/0x410
>>>    perf_trace_run_bpf_submit+0x87/0xc0
>>>    perf_trace_lock_acquire+0x12b/0x170
>>>    lock_acquire+0x1bf/0x2e0
>>>    perf_output_begin+0x70/0x4b0
>>>    perf_log_throttle+0xe2/0x1a0
>>>    perf_event_nmi_handler+0x30/0x50
>>>    nmi_handle+0xba/0x2a0
>>>    default_do_nmi+0x45/0xf0
>>>    exc_nmi+0x155/0x170
>>>    end_repeat_nmi+0x16/0x55
>>
>> kernel/events/Makefile has:
>>
>> ifdef CONFIG_FUNCTION_TRACER
>> CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
>> endif
>>
>> Which, afaict, should avoid the above, no?
> 
> I'm afraid it's not working for this case, the
> start point of tracing is at lock_acquire() which
> is not from 'kernel/events/core', the following PF
> related function are also not from 'core', prevent
> ftrace on 'core' can't prevent this from happen...

On second thought, I think you're right about the
way it should be fixed, since disabling ftrace on
'arch/x86/mm/fault.c' could also fix the problem.

Will send a formal patch later :-)

Regards,
Michael Wang

> 
> Regards,
> Michael Wang
> 
>>
Peter Zijlstra Sept. 13, 2021, 10:24 a.m. UTC | #5
On Mon, Sep 13, 2021 at 11:00:47AM +0800, 王贇 wrote:
> 
> 
> On 2021/9/10 下午11:38, Peter Zijlstra wrote:
> > On Thu, Sep 09, 2021 at 11:13:21AM +0800, 王贇 wrote:
> >> When running with ftrace function enabled, we observed panic
> >> as below:
> >>
> >>   traps: PANIC: double fault, error_code: 0x0
> >>   [snip]
> >>   RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
> >>   [snip]
> >>   Call Trace:
> >>    <NMI>
> >>    perf_trace_buf_alloc+0x26/0xd0
> >>    perf_ftrace_function_call+0x18f/0x2e0
> >>    kernelmode_fixup_or_oops+0x5/0x120
> >>    __bad_area_nosemaphore+0x1b8/0x280
> >>    do_user_addr_fault+0x410/0x920
> >>    exc_page_fault+0x92/0x300
> >>    asm_exc_page_fault+0x1e/0x30
> >>   RIP: 0010:__get_user_nocheck_8+0x6/0x13
> >>    perf_callchain_user+0x266/0x2f0
> >>    get_perf_callchain+0x194/0x210
> >>    perf_callchain+0xa3/0xc0
> >>    perf_prepare_sample+0xa5/0xa60
> >>    perf_event_output_forward+0x7b/0x1b0
> >>    __perf_event_overflow+0x67/0x120
> >>    perf_swevent_overflow+0xcb/0x110
> >>    perf_swevent_event+0xb0/0xf0
> >>    perf_tp_event+0x292/0x410
> >>    perf_trace_run_bpf_submit+0x87/0xc0
> >>    perf_trace_lock_acquire+0x12b/0x170
> >>    lock_acquire+0x1bf/0x2e0
> >>    perf_output_begin+0x70/0x4b0
> >>    perf_log_throttle+0xe2/0x1a0
> >>    perf_event_nmi_handler+0x30/0x50
> >>    nmi_handle+0xba/0x2a0
> >>    default_do_nmi+0x45/0xf0
> >>    exc_nmi+0x155/0x170
> >>    end_repeat_nmi+0x16/0x55
> > 
> > kernel/events/Makefile has:
> > 
> > ifdef CONFIG_FUNCTION_TRACER
> > CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
> > endif
> > 
> > Which, afaict, should avoid the above, no?
> 
> I'm afraid it's not working for this case, the
> start point of tracing is at lock_acquire() which
> is not from 'kernel/events/core', the following PF
> related function are also not from 'core', prevent
> ftrace on 'core' can't prevent this from happen...

I'm confused tho; where does the #DF come from? Because taking a #PF
from NMI should be perfectly fine.

AFAICT that callchain is something like:

	NMI
	  perf_event_nmi_handler()
	    (part of the chain is missing here)
	      perf_log_throttle()
	        perf_output_begin() /* events/ring_buffer.c */
		  rcu_read_lock()
		    rcu_lock_acquire()
		      lock_acquire()
		        trace_lock_acquire() --> perf_trace_foo

			  ...
			    perf_callchain()
			      perf_callchain_user()
			        #PF (fully expected during a userspace callchain)
				  (some stuff, until the first __fentry)
				    perf_trace_function_call
				      perf_trace_buf_alloc()
				        perf_swevent_get_recursion_context()
					  *BOOM*

Now, supposedly we then take another #PF from get_recursion_context() or
something, but that doesn't make sense. That should just work...

Can you figure out what's going wrong there? going with the RIP, this
almost looks like 'swhash->recursion' goes splat, but again that makes
no sense, that's a per-cpu variable.
Peter Zijlstra Sept. 13, 2021, 10:36 a.m. UTC | #6
On Mon, Sep 13, 2021 at 12:24:24PM +0200, Peter Zijlstra wrote:

FWIW:

> I'm confused tho; where does the #DF come from? Because taking a #PF
> from NMI should be perfectly fine.
> 
> AFAICT that callchain is something like:
> 
> 	NMI
> 	  perf_event_nmi_handler()
> 	    (part of the chain is missing here)
> 	      perf_log_throttle()
> 	        perf_output_begin() /* events/ring_buffer.c */
> 		  rcu_read_lock()
> 		    rcu_lock_acquire()
> 		      lock_acquire()
> 		        trace_lock_acquire() --> perf_trace_foo

This function also calls perf_trace_buf_alloc(), and will have
incremented the recursion count, such that:

> 
> 			  ...
> 			    perf_callchain()
> 			      perf_callchain_user()
> 			        #PF (fully expected during a userspace callchain)
> 				  (some stuff, until the first __fentry)
> 				    perf_trace_function_call
> 				      perf_trace_buf_alloc()
> 				        perf_swevent_get_recursion_context()
> 					  *BOOM*

this one, if it wouldn't mysteriously explode, would find recursion and
terminate, except that seems to be going side-ways.

> Now, supposedly we then take another #PF from get_recursion_context() or
> something, but that doesn't make sense. That should just work...
> 
> Can you figure out what's going wrong there? going with the RIP, this
> almost looks like 'swhash->recursion' goes splat, but again that makes
> no sense, that's a per-cpu variable.
> 
>
王贇 Sept. 14, 2021, 1:58 a.m. UTC | #7
On 2021/9/13 下午6:24, Peter Zijlstra wrote:
> On Mon, Sep 13, 2021 at 11:00:47AM +0800, 王贇 wrote:
>>
>>
>> On 2021/9/10 下午11:38, Peter Zijlstra wrote:
>>> On Thu, Sep 09, 2021 at 11:13:21AM +0800, 王贇 wrote:
>>>> When running with ftrace function enabled, we observed panic
>>>> as below:
>>>>
>>>>   traps: PANIC: double fault, error_code: 0x0
>>>>   [snip]
>>>>   RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
>>>>   [snip]
>>>>   Call Trace:
>>>>    <NMI>
>>>>    perf_trace_buf_alloc+0x26/0xd0
>>>>    perf_ftrace_function_call+0x18f/0x2e0
>>>>    kernelmode_fixup_or_oops+0x5/0x120
>>>>    __bad_area_nosemaphore+0x1b8/0x280
>>>>    do_user_addr_fault+0x410/0x920
>>>>    exc_page_fault+0x92/0x300
>>>>    asm_exc_page_fault+0x1e/0x30
>>>>   RIP: 0010:__get_user_nocheck_8+0x6/0x13
>>>>    perf_callchain_user+0x266/0x2f0
>>>>    get_perf_callchain+0x194/0x210
>>>>    perf_callchain+0xa3/0xc0
>>>>    perf_prepare_sample+0xa5/0xa60
>>>>    perf_event_output_forward+0x7b/0x1b0
>>>>    __perf_event_overflow+0x67/0x120
>>>>    perf_swevent_overflow+0xcb/0x110
>>>>    perf_swevent_event+0xb0/0xf0
>>>>    perf_tp_event+0x292/0x410
>>>>    perf_trace_run_bpf_submit+0x87/0xc0
>>>>    perf_trace_lock_acquire+0x12b/0x170
>>>>    lock_acquire+0x1bf/0x2e0
>>>>    perf_output_begin+0x70/0x4b0
>>>>    perf_log_throttle+0xe2/0x1a0
>>>>    perf_event_nmi_handler+0x30/0x50
>>>>    nmi_handle+0xba/0x2a0
>>>>    default_do_nmi+0x45/0xf0
>>>>    exc_nmi+0x155/0x170
>>>>    end_repeat_nmi+0x16/0x55
>>>
>>> kernel/events/Makefile has:
>>>
>>> ifdef CONFIG_FUNCTION_TRACER
>>> CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
>>> endif
>>>
>>> Which, afaict, should avoid the above, no?
>>
>> I'm afraid it's not working for this case, the
>> start point of tracing is at lock_acquire() which
>> is not from 'kernel/events/core', the following PF
>> related function are also not from 'core', prevent
>> ftrace on 'core' can't prevent this from happen...
> 
> I'm confused tho; where does the #DF come from? Because taking a #PF
> from NMI should be perfectly fine.
> 
> AFAICT that callchain is something like:
> 
> 	NMI
> 	  perf_event_nmi_handler()
> 	    (part of the chain is missing here)
> 	      perf_log_throttle()
> 	        perf_output_begin() /* events/ring_buffer.c */
> 		  rcu_read_lock()
> 		    rcu_lock_acquire()
> 		      lock_acquire()
> 		        trace_lock_acquire() --> perf_trace_foo
> 
> 			  ...
> 			    perf_callchain()
> 			      perf_callchain_user()
> 			        #PF (fully expected during a userspace callchain)
> 				  (some stuff, until the first __fentry)
> 				    perf_trace_function_call
> 				      perf_trace_buf_alloc()
> 				        perf_swevent_get_recursion_context()
> 					  *BOOM*
> 
> Now, supposedly we then take another #PF from get_recursion_context() or
> something, but that doesn't make sense. That should just work...
> 
> Can you figure out what's going wrong there? going with the RIP, this
> almost looks like 'swhash->recursion' goes splat, but again that makes
> no sense, that's a per-cpu variable.

That's true. I have actually tried several approaches to avoid the issue,
but it triggers a panic as long as we access 'swhash->recursion'. The array
should be accessible but is somehow broken; that's why I consider this a
suspected stack overflow, since the NMI repeated and the trace seems very
long — but that is just a suspicion...

Regards,
Michael Wang

>
王贇 Sept. 14, 2021, 2:02 a.m. UTC | #8
On 2021/9/13 下午6:36, Peter Zijlstra wrote:
> On Mon, Sep 13, 2021 at 12:24:24PM +0200, Peter Zijlstra wrote:
> 
> FWIW:
> 
>> I'm confused tho; where does the #DF come from? Because taking a #PF
>> from NMI should be perfectly fine.
>>
>> AFAICT that callchain is something like:
>>
>> 	NMI
>> 	  perf_event_nmi_handler()
>> 	    (part of the chain is missing here)
>> 	      perf_log_throttle()
>> 	        perf_output_begin() /* events/ring_buffer.c */
>> 		  rcu_read_lock()
>> 		    rcu_lock_acquire()
>> 		      lock_acquire()
>> 		        trace_lock_acquire() --> perf_trace_foo
> 
> This function also calls perf_trace_buf_alloc(), and will have
> incremented the recursion count, such that:
> 
>>
>> 			  ...
>> 			    perf_callchain()
>> 			      perf_callchain_user()
>> 			        #PF (fully expected during a userspace callchain)
>> 				  (some stuff, until the first __fentry)
>> 				    perf_trace_function_call
>> 				      perf_trace_buf_alloc()
>> 				        perf_swevent_get_recursion_context()
>> 					  *BOOM*
> 
> this one, if it wouldn't mysteriously explode, would find recursion and
> terminate, except that seems to be going side-ways.

Yes, it is supposed to avoid recursion in the same context, but it never
got the chance to do that. The function and struct should all be fine;
any idea what, in such a situation, could trigger this kind of double fault?

Regards,
Michael Wang

> 
>> Now, supposedly we then take another #PF from get_recursion_context() or
>> something, but that doesn't make sense. That should just work...
>>
>> Can you figure out what's going wrong there? going with the RIP, this
>> almost looks like 'swhash->recursion' goes splat, but again that makes
>> no sense, that's a per-cpu variable.
>>
>>
Peter Zijlstra Sept. 14, 2021, 10:28 a.m. UTC | #9
On Tue, Sep 14, 2021 at 09:58:44AM +0800, 王贇 wrote:
> On 2021/9/13 下午6:24, Peter Zijlstra wrote:

> > I'm confused tho; where does the #DF come from? Because taking a #PF
> > from NMI should be perfectly fine.
> > 
> > AFAICT that callchain is something like:
> > 
> > 	NMI
> > 	  perf_event_nmi_handler()
> > 	    (part of the chain is missing here)
> > 	      perf_log_throttle()
> > 	        perf_output_begin() /* events/ring_buffer.c */
> > 		  rcu_read_lock()
> > 		    rcu_lock_acquire()
> > 		      lock_acquire()
> > 		        trace_lock_acquire() --> perf_trace_foo
> > 
> > 			  ...
> > 			    perf_callchain()
> > 			      perf_callchain_user()
> > 			        #PF (fully expected during a userspace callchain)
> > 				  (some stuff, until the first __fentry)
> > 				    perf_trace_function_call
> > 				      perf_trace_buf_alloc()
> > 				        perf_swevent_get_recursion_context()
> > 					  *BOOM*
> > 
> > Now, supposedly we then take another #PF from get_recursion_context() or
> > something, but that doesn't make sense. That should just work...
> > 
> > Can you figure out what's going wrong there? going with the RIP, this
> > almost looks like 'swhash->recursion' goes splat, but again that makes
> > no sense, that's a per-cpu variable.
> 
> That's true, I actually have tried several approach to avoid the issue, but
> it trigger panic as long as we access 'swhash->recursion', the array should
> be accessible but somehow broken, that's why I consider this a suspected
> stack overflow, since nmi repeated and trace seems very long, but just a
> suspect...

You can simply increase the exception stack size to test this:

diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index a8d4ad856568..e9e2c3ba5923 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -15,7 +15,7 @@
 #define THREAD_SIZE_ORDER	(2 + KASAN_STACK_ORDER)
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
 
-#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
+#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER)
 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
 
 #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)



Also, something like this might be useful:


diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f248eb2ac2d4..4dfdbb9395eb 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -33,6 +33,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
 
 bool in_entry_stack(unsigned long *stack, struct stack_info *info);
 
+bool in_exception_stack_guard(unsigned long *stack);
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info, unsigned long *visit_mask);
 bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5601b95944fa..056cf4f31599 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -126,6 +126,39 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
 	return true;
 }
 
+noinstr bool in_exception_stack_guard(unsigned long *stack)
+{
+	unsigned long begin, end, stk = (unsigned long)stack;
+	const struct estack_pages *ep;
+	unsigned int k;
+
+	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+
+	begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
+	/*
+	 * Handle the case where stack trace is collected _before_
+	 * cea_exception_stacks had been initialized.
+	 */
+	if (!begin)
+		return false;
+
+	end = begin + sizeof(struct cea_exception_stacks);
+	/* Bail if @stack is outside the exception stack area. */
+	if (stk < begin || stk >= end)
+		return false;
+
+	/* Calc page offset from start of exception stacks */
+	k = (stk - begin) >> PAGE_SHIFT;
+	/* Lookup the page descriptor */
+	ep = &estack_pages[k];
+	/* Guard page? */
+	if (!ep->size)
+		return true;
+
+	return false;
+}
+
+
 static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
 {
 	unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a58800973aed..8b043ed02c0d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -459,6 +459,9 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
 		handle_stack_overflow("kernel stack overflow (double-fault)",
 				      regs, address);
 	}
+
+	if (in_exception_stack_guard((void *)address))
+		pr_emerg("PANIC: exception stack guard: 0x%lx\n", address);
 #endif
 
 	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
王贇 Sept. 15, 2021, 1:51 a.m. UTC | #10
On 2021/9/14 下午6:28, Peter Zijlstra wrote:
[snip]
> 
> You can simply increase the exception stack size to test this:
> 
> diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
> index a8d4ad856568..e9e2c3ba5923 100644
> --- a/arch/x86/include/asm/page_64_types.h
> +++ b/arch/x86/include/asm/page_64_types.h
> @@ -15,7 +15,7 @@
>  #define THREAD_SIZE_ORDER	(2 + KASAN_STACK_ORDER)
>  #define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
>  
> -#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
> +#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER)
>  #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
>  
>  #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)

It's working in this case, no more panic.

> 
> 
> 
> Also, something like this might be useful:
> 
> 
> diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
> index f248eb2ac2d4..4dfdbb9395eb 100644
> --- a/arch/x86/include/asm/stacktrace.h
> +++ b/arch/x86/include/asm/stacktrace.h
> @@ -33,6 +33,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
>  
>  bool in_entry_stack(unsigned long *stack, struct stack_info *info);
>  
> +bool in_exception_stack_guard(unsigned long *stack);
> +
>  int get_stack_info(unsigned long *stack, struct task_struct *task,
>  		   struct stack_info *info, unsigned long *visit_mask);
>  bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
> index 5601b95944fa..056cf4f31599 100644
> --- a/arch/x86/kernel/dumpstack_64.c
> +++ b/arch/x86/kernel/dumpstack_64.c
> @@ -126,6 +126,39 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
>  	return true;
>  }
>  
> +noinstr bool in_exception_stack_guard(unsigned long *stack)
> +{
> +	unsigned long begin, end, stk = (unsigned long)stack;
> +	const struct estack_pages *ep;
> +	unsigned int k;
> +
> +	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
> +
> +	begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
> +	/*
> +	 * Handle the case where stack trace is collected _before_
> +	 * cea_exception_stacks had been initialized.
> +	 */
> +	if (!begin)
> +		return false;
> +
> +	end = begin + sizeof(struct cea_exception_stacks);
> +	/* Bail if @stack is outside the exception stack area. */
> +	if (stk < begin || stk >= end)
> +		return false;
> +
> +	/* Calc page offset from start of exception stacks */
> +	k = (stk - begin) >> PAGE_SHIFT;
> +	/* Lookup the page descriptor */
> +	ep = &estack_pages[k];
> +	/* Guard page? */
> +	if (!ep->size)
> +		return true;
> +
> +	return false;
> +}
> +
> +
>  static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
>  {
>  	unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index a58800973aed..8b043ed02c0d 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -459,6 +459,9 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  		handle_stack_overflow("kernel stack overflow (double-fault)",
>  				      regs, address);
>  	}
> +
> +	if (in_exception_stack_guard((void *)address))
> +		pr_emerg("PANIC: exception stack guard: 0x%lx\n", address);
>  #endif
>  
>  	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
> 

The panic is triggered as below after the stack size was restored. I found
this info could be helpful — maybe we should keep it?

Regards,
Michael Wang

[   30.515200][    C0] traps: PANIC: exception stack guard: 0xfffffe000000aff8
[   30.515206][    C0] traps: PANIC: double fault, error_code: 0x0
[   30.515216][    C0] double fault: 0000 [#1] SMP PTI
[   30.515223][    C0] CPU: 0 PID: 702 Comm: a.out Not tainted 5.14.0-next-20210913+ #524
[   30.515230][    C0] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[   30.515233][    C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
[   30.515246][    C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e
[   30.515253][    C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046
[   30.515259][    C0] RAX: 0000000080120008 RBX: fffffe000000b050 RCX: 0000000000000000
[   30.515264][    C0] RDX: ffff88810cbf2180 RSI: ffffffff81269031 RDI: 000000000000001c
[   30.515268][    C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000
[   30.515272][    C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[   30.515275][    C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000001
[   30.515280][    C0] FS:  00007fa1b01f4740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000
[   30.515286][    C0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   30.515290][    C0] CR2: fffffe000000aff8 CR3: 000000010e26a003 CR4: 00000000003606f0
[   30.515294][    C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   30.515298][    C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   30.515302][    C0] Call Trace:
[   30.515305][    C0]  <NMI>
[   30.515308][    C0]  perf_trace_buf_alloc+0x26/0xd0
[   30.515321][    C0]  ? is_prefetch.isra.25+0x260/0x260
[   30.515329][    C0]  ? __bad_area_nosemaphore+0x1b8/0x280
[   30.515336][    C0]  perf_ftrace_function_call+0x18f/0x2e0
[   30.515347][    C0]  ? perf_trace_buf_alloc+0xbf/0xd0
[   30.515385][    C0]  ? 0xffffffffa0106083
[   30.515412][    C0]  0xffffffffa0106083
[   30.515431][    C0]  ? 0xffffffffa0106083
[   30.515452][    C0]  ? kernelmode_fixup_or_oops+0x5/0x120
[   30.515465][    C0]  kernelmode_fixup_or_oops+0x5/0x120
[   30.515472][    C0]  __bad_area_nosemaphore+0x1b8/0x280
[   30.515492][    C0]  do_user_addr_fault+0x410/0x920
[   30.515508][    C0]  ? 0xffffffffa0106083
[   30.515525][    C0]  exc_page_fault+0x92/0x300
[   30.515542][    C0]  asm_exc_page_fault+0x1e/0x30
[   30.515551][    C0] RIP: 0010:__get_user_nocheck_8+0x6/0x13
diff mbox series

Patch

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 744e872..6063443 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8716,6 +8716,7 @@  static void perf_log_throttle(struct perf_event *event, int enable)
 	struct perf_output_handle handle;
 	struct perf_sample_data sample;
 	int ret;
+	int rctx;

 	struct {
 		struct perf_event_header	header;
@@ -8738,14 +8739,17 @@  static void perf_log_throttle(struct perf_event *event, int enable)

 	perf_event_header__init_id(&throttle_event.header, &sample, event);

+	rctx = perf_swevent_get_recursion_context();
 	ret = perf_output_begin(&handle, &sample, event,
 				throttle_event.header.size);
-	if (ret)
-		return;
+	if (!ret) {
+		perf_output_put(&handle, throttle_event);
+		perf_event__output_id_sample(event, &handle, &sample);
+		perf_output_end(&handle);
+	}

-	perf_output_put(&handle, throttle_event);
-	perf_event__output_id_sample(event, &handle, &sample);
-	perf_output_end(&handle);
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
 }

 /*