diff mbox series

x86/dumpstack/64: Add guard pages to stack_info

Message ID YUIOgmOfnOqPrE+z@hirez.programming.kicks-ass.net (mailing list archive)
State Not Applicable
Headers show
Series x86/dumpstack/64: Add guard pages to stack_info | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Peter Zijlstra Sept. 15, 2021, 3:17 p.m. UTC
On Wed, Sep 15, 2021 at 09:51:57AM +0800, 王贇 wrote:

> > +
> > +	if (in_exception_stack_guard((void *)address))
> > +		pr_emerg("PANIC: exception stack guard: 0x%lx\n", address);
> >  #endif
> >  
> >  	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
> > 
> 
> The panic triggered as below after the stack size recovered, I found this info
> could be helpful, maybe we should keep it?

Could you please test this?

---
Subject: x86/dumpstack/64: Add guard pages to stack_info
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed Sep 15 17:12:59 CEST 2021

Explicitly add the exception stack guard pages to stack_info and
report on them from #DF.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/cpu_entry_area.h |    3 +++
 arch/x86/include/asm/stacktrace.h     |    3 ++-
 arch/x86/kernel/dumpstack_64.c        |   17 ++++++++++++++++-
 arch/x86/kernel/traps.c               |   17 ++++++++++++++++-
 4 files changed, 37 insertions(+), 3 deletions(-)

Comments

王贇 Sept. 16, 2021, 3:34 a.m. UTC | #1
On 2021/9/15 下午11:17, Peter Zijlstra wrote:
> On Wed, Sep 15, 2021 at 09:51:57AM +0800, 王贇 wrote:
> 
>>> +
>>> +	if (in_exception_stack_guard((void *)address))
>>> +		pr_emerg("PANIC: exception stack guard: 0x%lx\n", address);
>>>  #endif
>>>  
>>>  	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
>>>
>>
>> The panic triggered as below after the stack size recovered, I found this info
>> could be helpful, maybe we should keep it?
> 
> Could you please test this?

It seems like not working properly, we get very long trace ending as below:

Regards,
Michael Wang

[   34.662432][    C0] BUG: unable to handle page fault for address: fffffe0000008ff0
[   34.662435][    C0] #PF: supervisor read access in kernel mode
[   34.662438][    C0] #PF: error_code(0x0000) - not-present page
[   34.662442][    C0] PGD 13ffef067 P4D 13ffef067 PUD 13ffed067 PMD 13ffec067 PTE 0
[   34.662455][    C0] Oops: 0000 [#11] SMP PTI
[   34.662459][    C0] CPU: 0 PID: 713 Comm: a.out Not tainted 5.14.0-next-20210913+ #530
[   34.662465][    C0] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[   34.662468][    C0] RIP: 0010:get_stack_info_noinstr+0x8d/0xf0
[   34.662474][    C0] Code: 40 82 66 85 c9 74 33 8b 34 d5 40 6d 40 82 0f b7 14 d5 46 6d 40 82 48 01 f0 41 89 14 24 48 01 c1 49 89 44 24 08 49 89 4c 24 10 <48> 8b 41 f0 49 89 44 24 18 b8 01 00 00 00 eb 87 65 48 8b 05 43 c5
[   34.662485][    C0] RSP: 0018:fffffe0000009bb0 EFLAGS: 00010086
[   34.662490][    C0] RAX: fffffe0000008000 RBX: ffff888107422180 RCX: fffffe0000009000
[   34.662494][    C0] RDX: 0000000000000085 RSI: 0000000000000000 RDI: fffffe0000008f30
[   34.662498][    C0] RBP: fffffe0000008f30 R08: ffffffff82754eff R09: fffffe0000009b78
[   34.662502][    C0] R10: 0000000000000000 R11: 0000000000000006 R12: fffffe0000009c28
[   34.662506][    C0] R13: 0000000000000000 R14: ffff888107422180 R15: fffffe0000009c48
[   34.662510][    C0] FS:  00007f5298fe2740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000
[   34.662516][    C0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   34.662520][    C0] CR2: fffffe0000008ff0 CR3: 0000000109b5a005 CR4: 00000000003606f0
[   34.662524][    C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   34.662528][    C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   34.662532][    C0] Call Trace:
[   34.662534][    C0]  <#DF>
[   34.662542][    C0]  get_stack_info+0x30/0xb0
[   34.662556][    C0]  show_trace_log_lvl+0xf9/0x410
[   34.662571][    C0]  ? sprint_symbol_build_id+0x30/0x30
[   34.662605][    C0]  ? 0xffffffffa0106083
[   34.662628][    C0]  __die_body+0x1a/0x60
[   34.662641][    C0]  page_fault_oops+0xe8/0x560
[   34.662671][    C0]  kernelmode_fixup_or_oops+0x107/0x120
[   34.662687][    C0]  __bad_area_nosemaphore+0x1b8/0x280
[   34.662707][    C0]  do_kern_addr_fault+0x57/0xc0
[   34.662719][    C0]  exc_page_fault+0x1c1/0x300
[   34.662735][    C0]  asm_exc_page_fault+0x1e/0x30
[   34.662742][    C0] RIP: 0010:get_stack_info_noinstr+0x8d/0xf0
[   34.662749][    C0] Code: 40 82 66 85 c9 74 33 8b 34 d5 40 6d 40 82 0f b7 14 d5 46 6d 40 82 48 01 f0 41 89 14 24 48 01 c1 49 89 44 24 08 49 89 4c 24 10 <48> 8b 41 f0 49 89 44 24 18 b8 01 00 00 00 eb 87 65 48 8b 05 43 c5
[   34.662754][    C0] RSP: 0018:fffffe0000009ee8 EFLAGS: 00010086
[   34.662759][    C0] RAX: fffffe0000008000 RBX: ffff888107422180 RCX: fffffe0000009000
[   34.662763][    C0] RDX: 0000000000000085 RSI: 0000000000000000 RDI: fffffe0000008f28
[   34.662767][    C0] RBP: fffffe0000008f28 R08: 0000000000000000 R09: 0000000000000000
[   34.662771][    C0] R10: 0000000000000000 R11: 0000000000000000 R12: fffffe0000009f08
[   34.662775][    C0] R13: fffffe0000008f28 R14: 0000000109b5a005 R15: 0000000000000000
[   34.662816][    C0]  ? get_stack_info_noinstr+0x12/0xf0
[   34.662828][    C0]  exc_double_fault+0x138/0x1a0
[   34.662851][    C0]  asm_exc_double_fault+0x1e/0x30
[   34.662858][    C0] RIP: 0010:perf_ftrace_function_call+0x26/0x2e0
[   34.662866][    C0] Code: 5b 5d c3 90 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 f5 53 49 89 fc 48 89 d3 48 81 ec d0 00 00 00 65 48 8b 04 25 28 00 00 00 <48> 89 45 d0 31 c0 e8 9f 69 fa ff e8 2a 4a f1 ff 84 c0 74 14 e8 91
[   34.662872][    C0] RSP: 0018:fffffe0000008f30 EFLAGS: 00010086
[   34.662877][    C0] RAX: 0c14fdf027d1e500 RBX: ffff8881002f99f0 RCX: fffffe0000009038
[   34.662881][    C0] RDX: ffff8881002f99f0 RSI: ffffffff817cd61f RDI: ffffffff811cc7b0
[   34.662884][    C0] RBP: fffffe0000009028 R08: ffffffff82754ed9 R09: fffffe00000095d0
[   34.662888][    C0] R10: fffffe00000095e8 R11: 0000000020455450 R12: ffffffff811cc7b0
[   34.662892][    C0] R13: ffffffff817cd61f R14: ffff0a00ffffff05 R15: ffffffff81edac2d
[   34.662896][    C0]  ? get_stack_info_noinstr+0x8d/0xf0
[   34.662904][    C0]  ? symbol_string+0xbf/0x160
[   34.662911][    C0]  ? sprint_symbol_build_id+0x30/0x30
[   34.662935][    C0]  ? symbol_string+0xbf/0x160
[   34.662942][    C0]  ? sprint_symbol_build_id+0x30/0x30
[   34.662959][    C0]  </#DF>
[   34.662964][    C0] BUG: unable to handle page fault for address: fffffe0000008ff0
[   34.662966][    C0] #PF: supervisor read access in kernel mode
[   34.662970][    C0] #PF: error_code(0x0000) - not-present page
[   34.662973][    C0] PGD 13ffef067 P4D 13ffef067 PUD 13ffed067 PMD 13ffec067 PTE 0
[   34.662986][    C0] Oops: 0000 [#12] SMP PTI
[   34.662991][    C0] CPU: 0 PID: 713 Comm: a.out Not tainted 5.14.0-next-20210913+ #530
[   34.662996][    C0] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011


> 
> ---
> Subject: x86/dumpstack/64: Add guard pages to stack_info
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Wed Sep 15 17:12:59 CEST 2021
> 
> Explicitly add the exception stack guard pages to stack_info and
> report on them from #DF.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/include/asm/cpu_entry_area.h |    3 +++
>  arch/x86/include/asm/stacktrace.h     |    3 ++-
>  arch/x86/kernel/dumpstack_64.c        |   17 ++++++++++++++++-
>  arch/x86/kernel/traps.c               |   17 ++++++++++++++++-
>  4 files changed, 37 insertions(+), 3 deletions(-)
> 
> --- a/arch/x86/include/asm/cpu_entry_area.h
> +++ b/arch/x86/include/asm/cpu_entry_area.h
> @@ -61,6 +61,9 @@ enum exception_stack_ordering {
>  #define CEA_ESTACK_OFFS(st)					\
>  	offsetof(struct cea_exception_stacks, st## _stack)
>  
> +#define CEA_EGUARD_OFFS(st)					\
> +	offsetof(struct cea_exception_stacks, st## _stack_guard)
> +
>  #define CEA_ESTACK_PAGES					\
>  	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
>  
> --- a/arch/x86/include/asm/stacktrace.h
> +++ b/arch/x86/include/asm/stacktrace.h
> @@ -14,13 +14,14 @@
>  #include <asm/switch_to.h>
>  
>  enum stack_type {
> -	STACK_TYPE_UNKNOWN,
> +	STACK_TYPE_UNKNOWN = 0,
>  	STACK_TYPE_TASK,
>  	STACK_TYPE_IRQ,
>  	STACK_TYPE_SOFTIRQ,
>  	STACK_TYPE_ENTRY,
>  	STACK_TYPE_EXCEPTION,
>  	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
> +	STACK_TYPE_GUARD = 0x80,
>  };
>  
>  struct stack_info {
> --- a/arch/x86/kernel/dumpstack_64.c
> +++ b/arch/x86/kernel/dumpstack_64.c
> @@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_t
>  {
>  	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
>  
> +	if (type == STACK_TYPE_TASK)
> +		return "TASK";
> +
>  	if (type == STACK_TYPE_IRQ)
>  		return "IRQ";
>  
> +	if (type == STACK_TYPE_SOFTIRQ)
> +		return "SOFTIRQ";
> +
>  	if (type == STACK_TYPE_ENTRY) {
>  		/*
>  		 * On 64-bit, we have a generic entry stack that we
> @@ -63,6 +69,11 @@ struct estack_pages {
>  };
>  
>  #define EPAGERANGE(st)							\
> +	[PFN_DOWN(CEA_EGUARD_OFFS(st))] = {				\
> +		.offs	= CEA_EGUARD_OFFS(st),				\
> +		.size	= PAGE_SIZE,					\
> +		.type	= STACK_TYPE_GUARD +				\
> +			  STACK_TYPE_EXCEPTION + ESTACK_ ##st, },	\
>  	[PFN_DOWN(CEA_ESTACK_OFFS(st)) ...				\
>  	 PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = {	\
>  		.offs	= CEA_ESTACK_OFFS(st),				\
> @@ -111,10 +122,11 @@ static __always_inline bool in_exception
>  	k = (stk - begin) >> PAGE_SHIFT;
>  	/* Lookup the page descriptor */
>  	ep = &estack_pages[k];
> -	/* Guard page? */
> +	/* unknown entry */
>  	if (!ep->size)
>  		return false;
>  
> +
>  	begin += (unsigned long)ep->offs;
>  	end = begin + (unsigned long)ep->size;
>  	regs = (struct pt_regs *)end - 1;
> @@ -193,6 +205,9 @@ int get_stack_info(unsigned long *stack,
>  	if (!get_stack_info_noinstr(stack, task, info))
>  		goto unknown;
>  
> +	if (info->type & STACK_TYPE_GUARD)
> +		goto unknown;
> +
>  	/*
>  	 * Make sure we don't iterate through any given stack more than once.
>  	 * If it comes up a second time then there's something wrong going on:
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -461,6 +461,19 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  	}
>  #endif
>  
> +#ifdef CONFIG_X86_64
> +	{
> +		struct stack_info info;
> +
> +		if (get_stack_info_noinstr((void *)address, current, &info) &&
> +		    info.type & STACK_TYPE_GUARD) {
> +			const char *name = stack_type_name(info.type & ~STACK_TYPE_GUARD);
> +			pr_emerg("BUG: %s stack guard hit at %p (stack is %p..%p)\n",
> +				 name, (void *)address, info.begin, info.end);
> +		}
> +	}
> +#endif
> +
>  	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
>  	die("double fault", regs, error_code);
>  	panic("Machine halted.");
> @@ -708,7 +721,9 @@ asmlinkage __visible noinstr struct pt_r
>  	sp    = regs->sp;
>  	stack = (unsigned long *)sp;
>  
> -	if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
> +	if (!get_stack_info_noinstr(stack, current, &info) ||
> +	    info.type & STACK_TYPE_GUARD ||
> +	    info.type == STACK_TYPE_ENTRY ||
>  	    info.type >= STACK_TYPE_EXCEPTION_LAST)
>  		sp = __this_cpu_ist_top_va(VC2);
>  
>
王贇 Sept. 16, 2021, 3:47 a.m. UTC | #2
On 2021/9/15 下午11:17, Peter Zijlstra wrote:
> On Wed, Sep 15, 2021 at 09:51:57AM +0800, 王贇 wrote:
> 
>>> +
>>> +	if (in_exception_stack_guard((void *)address))
>>> +		pr_emerg("PANIC: exception stack guard: 0x%lx\n", address);
>>>  #endif
>>>  
>>>  	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
>>>
>>
>> The panic triggered as below after the stack size recovered, I found this info
>> could be helpful, maybe we should keep it?
> 
> Could you please test this?

I did some debug and found the issue, we are missing:

@@ -122,7 +137,10 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
        info->type      = ep->type;
        info->begin     = (unsigned long *)begin;
        info->end       = (unsigned long *)end;
-       info->next_sp   = (unsigned long *)regs->sp;
+
+       if (!(ep->type & STACK_TYPE_GUARD))
+               info->next_sp   = (unsigned long *)regs->sp;
+
        return true;
 }

as the guard page are not working as real stack I guess?

With that one things going on correctly, and some trivials below.

> 
> ---
> Subject: x86/dumpstack/64: Add guard pages to stack_info
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Wed Sep 15 17:12:59 CEST 2021
> 
> Explicitly add the exception stack guard pages to stack_info and
> report on them from #DF.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/include/asm/cpu_entry_area.h |    3 +++
>  arch/x86/include/asm/stacktrace.h     |    3 ++-
>  arch/x86/kernel/dumpstack_64.c        |   17 ++++++++++++++++-
>  arch/x86/kernel/traps.c               |   17 ++++++++++++++++-
>  4 files changed, 37 insertions(+), 3 deletions(-)
> 
> --- a/arch/x86/include/asm/cpu_entry_area.h
> +++ b/arch/x86/include/asm/cpu_entry_area.h
> @@ -61,6 +61,9 @@ enum exception_stack_ordering {
>  #define CEA_ESTACK_OFFS(st)					\
>  	offsetof(struct cea_exception_stacks, st## _stack)
>  
> +#define CEA_EGUARD_OFFS(st)					\
> +	offsetof(struct cea_exception_stacks, st## _stack_guard)
> +
>  #define CEA_ESTACK_PAGES					\
>  	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
>  
> --- a/arch/x86/include/asm/stacktrace.h
> +++ b/arch/x86/include/asm/stacktrace.h
> @@ -14,13 +14,14 @@
>  #include <asm/switch_to.h>
>  
>  enum stack_type {
> -	STACK_TYPE_UNKNOWN,
> +	STACK_TYPE_UNKNOWN = 0,

Is this necessary?

>  	STACK_TYPE_TASK,
>  	STACK_TYPE_IRQ,
>  	STACK_TYPE_SOFTIRQ,
>  	STACK_TYPE_ENTRY,
>  	STACK_TYPE_EXCEPTION,
>  	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
> +	STACK_TYPE_GUARD = 0x80,
>  };
>  
>  struct stack_info {
> --- a/arch/x86/kernel/dumpstack_64.c
> +++ b/arch/x86/kernel/dumpstack_64.c
> @@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_t
>  {
>  	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
>  
> +	if (type == STACK_TYPE_TASK)
> +		return "TASK";
> +
>  	if (type == STACK_TYPE_IRQ)
>  		return "IRQ";
>  
> +	if (type == STACK_TYPE_SOFTIRQ)
> +		return "SOFTIRQ";
> +

Do we need one for GUARD too?

>  	if (type == STACK_TYPE_ENTRY) {
>  		/*
>  		 * On 64-bit, we have a generic entry stack that we
> @@ -63,6 +69,11 @@ struct estack_pages {
>  };
>  
>  #define EPAGERANGE(st)							\
> +	[PFN_DOWN(CEA_EGUARD_OFFS(st))] = {				\
> +		.offs	= CEA_EGUARD_OFFS(st),				\
> +		.size	= PAGE_SIZE,					\
> +		.type	= STACK_TYPE_GUARD +				\
> +			  STACK_TYPE_EXCEPTION + ESTACK_ ##st, },	\
>  	[PFN_DOWN(CEA_ESTACK_OFFS(st)) ...				\
>  	 PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = {	\
>  		.offs	= CEA_ESTACK_OFFS(st),				\
> @@ -111,10 +122,11 @@ static __always_inline bool in_exception
>  	k = (stk - begin) >> PAGE_SHIFT;
>  	/* Lookup the page descriptor */
>  	ep = &estack_pages[k];
> -	/* Guard page? */
> +	/* unknown entry */
>  	if (!ep->size)
>  		return false;
>  
> +

Extra line?

Regards,
Michael Wang

>  	begin += (unsigned long)ep->offs;
>  	end = begin + (unsigned long)ep->size;
>  	regs = (struct pt_regs *)end - 1;
> @@ -193,6 +205,9 @@ int get_stack_info(unsigned long *stack,
>  	if (!get_stack_info_noinstr(stack, task, info))
>  		goto unknown;
>  
> +	if (info->type & STACK_TYPE_GUARD)
> +		goto unknown;
> +
>  	/*
>  	 * Make sure we don't iterate through any given stack more than once.
>  	 * If it comes up a second time then there's something wrong going on:
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -461,6 +461,19 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  	}
>  #endif
>  
> +#ifdef CONFIG_X86_64
> +	{
> +		struct stack_info info;
> +
> +		if (get_stack_info_noinstr((void *)address, current, &info) &&
> +		    info.type & STACK_TYPE_GUARD) {
> +			const char *name = stack_type_name(info.type & ~STACK_TYPE_GUARD);
> +			pr_emerg("BUG: %s stack guard hit at %p (stack is %p..%p)\n",
> +				 name, (void *)address, info.begin, info.end);
> +		}
> +	}
> +#endif
> +
>  	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
>  	die("double fault", regs, error_code);
>  	panic("Machine halted.");
> @@ -708,7 +721,9 @@ asmlinkage __visible noinstr struct pt_r
>  	sp    = regs->sp;
>  	stack = (unsigned long *)sp;
>  
> -	if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
> +	if (!get_stack_info_noinstr(stack, current, &info) ||
> +	    info.type & STACK_TYPE_GUARD ||
> +	    info.type == STACK_TYPE_ENTRY ||
>  	    info.type >= STACK_TYPE_EXCEPTION_LAST)
>  		sp = __this_cpu_ist_top_va(VC2);
>  
>
Peter Zijlstra Sept. 16, 2021, 8 a.m. UTC | #3
On Thu, Sep 16, 2021 at 11:47:49AM +0800, 王贇 wrote:

> I did some debug and found the issue, we are missing:
> 
> @@ -122,7 +137,10 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
>         info->type      = ep->type;
>         info->begin     = (unsigned long *)begin;
>         info->end       = (unsigned long *)end;
> -       info->next_sp   = (unsigned long *)regs->sp;
> +
> +       if (!(ep->type & STACK_TYPE_GUARD))
> +               info->next_sp   = (unsigned long *)regs->sp;
> +
>         return true;
>  }
> 
> as the guard page are not working as real stack I guess?

Correct, but I thought I put if (type & GUARD) terminators in all paths
that ended up caring about ->next_sp. Clearly I seem to have missed one
:/

Let me try and figure out where that happens.

> With that one things going on correctly, and some trivials below.

> >  enum stack_type {
> > -	STACK_TYPE_UNKNOWN,
> > +	STACK_TYPE_UNKNOWN = 0,
> 
> Is this necessary?

No, but it makes it more explicit we care about the value.

> >  	STACK_TYPE_TASK,
> >  	STACK_TYPE_IRQ,
> >  	STACK_TYPE_SOFTIRQ,
> >  	STACK_TYPE_ENTRY,
> >  	STACK_TYPE_EXCEPTION,
> >  	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
> > +	STACK_TYPE_GUARD = 0x80,

Note that this is a flag.

> >  };
> >  
> >  struct stack_info {
> > --- a/arch/x86/kernel/dumpstack_64.c
> > +++ b/arch/x86/kernel/dumpstack_64.c
> > @@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_t
> >  {
> >  	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
> >  
> > +	if (type == STACK_TYPE_TASK)
> > +		return "TASK";
> > +
> >  	if (type == STACK_TYPE_IRQ)
> >  		return "IRQ";
> >  
> > +	if (type == STACK_TYPE_SOFTIRQ)
> > +		return "SOFTIRQ";
> > +
> 
> Do we need one for GUARD too?

No, GUARD is not a single type but a flag. The caller can trivially do
something like:

	"%s %s", stack_type_name(type & ~GUARD),
	         (type & GUARD) ?  "GUARD" : ""

> >  	if (type == STACK_TYPE_ENTRY) {
> >  		/*
> >  		 * On 64-bit, we have a generic entry stack that we

> > @@ -111,10 +122,11 @@ static __always_inline bool in_exception
> >  	k = (stk - begin) >> PAGE_SHIFT;
> >  	/* Lookup the page descriptor */
> >  	ep = &estack_pages[k];
> > -	/* Guard page? */
> > +	/* unknown entry */
> >  	if (!ep->size)
> >  		return false;
> >  
> > +
> 
> Extra line?

Gone now, thanks!
Peter Zijlstra Sept. 16, 2021, 8:03 a.m. UTC | #4
On Thu, Sep 16, 2021 at 10:00:15AM +0200, Peter Zijlstra wrote:
> On Thu, Sep 16, 2021 at 11:47:49AM +0800, 王贇 wrote:
> 
> > I did some debug and found the issue, we are missing:
> > 
> > @@ -122,7 +137,10 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
> >         info->type      = ep->type;
> >         info->begin     = (unsigned long *)begin;
> >         info->end       = (unsigned long *)end;
> > -       info->next_sp   = (unsigned long *)regs->sp;
> > +
> > +       if (!(ep->type & STACK_TYPE_GUARD))
> > +               info->next_sp   = (unsigned long *)regs->sp;
> > +
> >         return true;
> >  }
> > 
> > as the guard page are not working as real stack I guess?
> 
> Correct, but I thought I put if (type & GUARD) terminators in all paths
> that ended up caring about ->next_sp. Clearly I seem to have missed one
> :/
> 
> Let me try and figure out where that happens.

Oh, I'm an idiot... yes it tries to read regs the stack, but clearly
that won't work for the guard page.
Peter Zijlstra Sept. 16, 2021, 10:02 a.m. UTC | #5
On Thu, Sep 16, 2021 at 10:03:19AM +0200, Peter Zijlstra wrote:

> Oh, I'm an idiot... yes it tries to read regs the stack, but clearly
> that won't work for the guard page.

OK, extended it to also cover task and IRQ stacks. get_stack_info()
doesn't seem to know about SOFTIRQ stacks on 64bit, might have to look
into that next.

Andy, what's the story with page_fault_oops(), according to the comment
in exc_double_fault() actual stack overflows will always hit #DF.

---
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 3d52b094850a..c4e92462c2b4 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -61,6 +61,9 @@ enum exception_stack_ordering {
 #define CEA_ESTACK_OFFS(st)					\
 	offsetof(struct cea_exception_stacks, st## _stack)
 
+#define CEA_EGUARD_OFFS(st)					\
+	offsetof(struct cea_exception_stacks, st## _stack_guard)
+
 #define CEA_ESTACK_PAGES					\
 	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
 
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f248eb2ac2d4..8ff346579330 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -14,13 +14,14 @@
 #include <asm/switch_to.h>
 
 enum stack_type {
-	STACK_TYPE_UNKNOWN,
+	STACK_TYPE_UNKNOWN = 0,
 	STACK_TYPE_TASK,
 	STACK_TYPE_IRQ,
 	STACK_TYPE_SOFTIRQ,
 	STACK_TYPE_ENTRY,
 	STACK_TYPE_EXCEPTION,
 	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
+	STACK_TYPE_GUARD = 0x80,
 };
 
 struct stack_info {
@@ -31,6 +32,15 @@ struct stack_info {
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info);
 
+static __always_inline bool in_stack_guard(void *addr, void *begin, void *end)
+{
+#ifdef CONFIG_VMAP_STACK
+	if (addr > (begin - PAGE_SIZE))
+		return true;
+#endif
+	return false;
+}
+
 bool in_entry_stack(unsigned long *stack, struct stack_info *info);
 
 int get_stack_info(unsigned long *stack, struct task_struct *task,
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ea4fe192189d..91b406fe2a39 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -32,12 +32,19 @@ static struct pt_regs exec_summary_regs;
 bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
 			   struct stack_info *info)
 {
-	unsigned long *begin = task_stack_page(task);
-	unsigned long *end   = task_stack_page(task) + THREAD_SIZE;
-
-	if (stack < begin || stack >= end)
+	void *begin = task_stack_page(task);
+	void *end   = begin + THREAD_SIZE;
+	int type    = STACK_TYPE_TASK;
+
+	if ((void *)stack < begin || (void *)stack >= end) {
+		if (in_stack_guard(stack, begin, end)) {
+			type |= STACK_TYPE_GUARD;
+			goto fill_info;
+		}
 		return false;
+	}
 
+fill_info:
 	info->type	= STACK_TYPE_TASK;
 	info->begin	= begin;
 	info->end	= end;
@@ -50,14 +57,20 @@ bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
 bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)
 {
 	struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
-
+	int type = STACK_TYPE_ENTRY;
 	void *begin = ss;
 	void *end = ss + 1;
 
-	if ((void *)stack < begin || (void *)stack >= end)
+	if ((void *)stack < begin || (void *)stack >= end) {
+		if (in_stack_guard(stack, begin, end)) {
+			type |= STACK_TYPE_GUARD;
+			goto fill_info;
+		}
 		return false;
+	}
 
-	info->type	= STACK_TYPE_ENTRY;
+fill_info:
+	info->type	= type;
 	info->begin	= begin;
 	info->end	= end;
 	info->next_sp	= NULL;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5601b95944fa..3634bdf9ab36 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type)
 {
 	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
 
+	if (type == STACK_TYPE_TASK)
+		return "TASK";
+
 	if (type == STACK_TYPE_IRQ)
 		return "IRQ";
 
+	if (type == STACK_TYPE_SOFTIRQ)
+		return "SOFTIRQ";
+
 	if (type == STACK_TYPE_ENTRY) {
 		/*
 		 * On 64-bit, we have a generic entry stack that we
@@ -63,6 +69,11 @@ struct estack_pages {
 };
 
 #define EPAGERANGE(st)							\
+	[PFN_DOWN(CEA_EGUARD_OFFS(st))] = {				\
+		.offs	= CEA_EGUARD_OFFS(st),				\
+		.size	= PAGE_SIZE,					\
+		.type	= STACK_TYPE_GUARD +				\
+			  STACK_TYPE_EXCEPTION + ESTACK_ ##st, },	\
 	[PFN_DOWN(CEA_ESTACK_OFFS(st)) ...				\
 	 PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = {	\
 		.offs	= CEA_ESTACK_OFFS(st),				\
@@ -111,7 +122,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
 	k = (stk - begin) >> PAGE_SHIFT;
 	/* Lookup the page descriptor */
 	ep = &estack_pages[k];
-	/* Guard page? */
+	/* unknown entry */
 	if (!ep->size)
 		return false;
 
@@ -122,7 +133,12 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
 	info->type	= ep->type;
 	info->begin	= (unsigned long *)begin;
 	info->end	= (unsigned long *)end;
-	info->next_sp	= (unsigned long *)regs->sp;
+	info->next_sp	= NULL;
+
+	/* Can't read regs from a guard page. */
+	if (!(ep->type & STACK_TYPE_GUARD))
+		info->next_sp = (unsigned long *)regs->sp;
+
 	return true;
 }
 
@@ -130,6 +146,7 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info
 {
 	unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
 	unsigned long *begin;
+	int type = STACK_TYPE_IRQ;
 
 	/*
 	 * @end points directly to the top most stack entry to avoid a -8
@@ -144,19 +161,27 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info
 	 * final operation is 'popq %rsp' which means after that RSP points
 	 * to the original stack and not to @end.
 	 */
-	if (stack < begin || stack >= end)
+	if (stack < begin || stack >= end) {
+		if (in_stack_guard(stack, begin, end)) {
+			type |= STACK_TYPE_GUARD;
+			goto fill_info;
+		}
 		return false;
+	}
 
-	info->type	= STACK_TYPE_IRQ;
+fill_info:
+	info->type	= type;
 	info->begin	= begin;
 	info->end	= end;
+	info->next_sp	= NULL;
 
 	/*
 	 * The next stack pointer is stored at the top of the irq stack
 	 * before switching to the irq stack. Actual stack entries are all
 	 * below that.
 	 */
-	info->next_sp = (unsigned long *)*(end - 1);
+	if (!(type & STACK_TYPE_GUARD))
+		info->next_sp = (unsigned long *)*(end - 1);
 
 	return true;
 }
@@ -193,6 +218,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (!get_stack_info_noinstr(stack, task, info))
 		goto unknown;
 
+	if (info->type & STACK_TYPE_GUARD)
+		goto unknown;
+
 	/*
 	 * Make sure we don't iterate through any given stack more than once.
 	 * If it comes up a second time then there's something wrong going on:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a58800973aed..80f6d8d735eb 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -353,6 +353,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
 
 #ifdef CONFIG_VMAP_STACK
 	unsigned long address = read_cr2();
+	struct stack_info info;
 #endif
 
 #ifdef CONFIG_X86_ESPFIX64
@@ -455,9 +456,11 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
 	 * stack even if the actual trigger for the double fault was
 	 * something else.
 	 */
-	if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
-		handle_stack_overflow("kernel stack overflow (double-fault)",
-				      regs, address);
+	if (get_stack_info_noinstr((void *)address, current, &info) &&
+	    info.type & STACK_TYPE_GUARD) {
+		const char *name = stack_type_name(info.type & ~STACK_TYPE_GUARD);
+		pr_emerg("BUG: %s stack guard hit at %p (stack is %p..%p)\n",
+			 name, (void *)address, info.begin, info.end);
 	}
 #endif
 
@@ -708,7 +711,9 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
 	sp    = regs->sp;
 	stack = (unsigned long *)sp;
 
-	if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
+	if (!get_stack_info_noinstr(stack, current, &info) ||
+	    info.type & STACK_TYPE_GUARD ||
+	    info.type == STACK_TYPE_ENTRY ||
 	    info.type >= STACK_TYPE_EXCEPTION_LAST)
 		sp = __this_cpu_ist_top_va(VC2);
王贇 Sept. 17, 2021, 2:15 a.m. UTC | #6
On 2021/9/16 下午6:02, Peter Zijlstra wrote:
> On Thu, Sep 16, 2021 at 10:03:19AM +0200, Peter Zijlstra wrote:
> 
>> Oh, I'm an idiot... yes it tries to read regs the stack, but clearly
>> that won't work for the guard page.
> 
> OK, extended it to also cover task and IRQ stacks. get_stack_info()
> doesn't seem to know about SOFTIRQ stacks on 64bit, might have to look
> into that next.
> 
> Andy, what's the story with page_fault_oops(), according to the comment
> in exc_double_fault() actual stack overflows will always hit #DF.

Just give this one a test, still not working properly...

[   51.016033][    C0] traps: PANIC: double fault, error_code: 0x0
[   51.016047][    C0] double fault: 0000 [#1] SMP PTI
[   51.016054][    C0] CPU: 0 PID: 761 Comm: a.out Not tainted 5.14.0-next-20210913+ #543
[   51.016061][    C0] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[   51.016065][    C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
[   51.016079][    C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e
[   51.016086][    C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046
[   51.016093][    C0] RAX: 0000000080120008 RBX: fffffe000000b050 RCX: 0000000000000000
[   51.016097][    C0] RDX: ffff888106c3c300 RSI: ffffffff81269031 RDI: 000000000000001c
[   51.016102][    C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000
[   51.016106][    C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[   51.016109][    C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000001
[   51.016113][    C0] FS:  00007f0cfd961740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000
[   51.016120][    C0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   51.016124][    C0] CR2: fffffe000000aff8 CR3: 0000000105ecc001 CR4: 00000000003606f0
[   51.016129][    C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   51.016132][    C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   51.016136][    C0] Call Trace:
[   51.016139][    C0]  <TASK>
[   51.016141][    C0]  </TASK>
[   51.016144][    C0] Modules linked in:
[   51.042436][    C0] ---[ end trace 5c102ce76b073dcf ]---
[   51.042440][    C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
[   51.042450][    C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e
[   51.042457][    C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046
[   51.042462][    C0] RAX: 0000000080120008 RBX: fffffe000000b050 RCX: 0000000000000000
[   51.042466][    C0] RDX: ffff888106c3c300 RSI: ffffffff81269031 RDI: 000000000000001c
[   51.042470][    C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000
[   51.042479][    C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[   51.042483][    C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000001
[   51.042487][    C0] FS:  00007f0cfd961740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000
[   51.042493][    C0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   51.042497][    C0] CR2: fffffe000000aff8 CR3: 0000000105ecc001 CR4: 00000000003606f0
[   51.042501][    C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   51.042505][    C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   51.042510][    C0] Kernel panic - not syncing: Fatal exception in interrupt
[   51.042917][    C0] Kernel Offset: disabled

Regards,
Michael Wang


> 
> ---
> diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
> index 3d52b094850a..c4e92462c2b4 100644
> --- a/arch/x86/include/asm/cpu_entry_area.h
> +++ b/arch/x86/include/asm/cpu_entry_area.h
> @@ -61,6 +61,9 @@ enum exception_stack_ordering {
>  #define CEA_ESTACK_OFFS(st)					\
>  	offsetof(struct cea_exception_stacks, st## _stack)
>  
> +#define CEA_EGUARD_OFFS(st)					\
> +	offsetof(struct cea_exception_stacks, st## _stack_guard)
> +
>  #define CEA_ESTACK_PAGES					\
>  	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
>  
> diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
> index f248eb2ac2d4..8ff346579330 100644
> --- a/arch/x86/include/asm/stacktrace.h
> +++ b/arch/x86/include/asm/stacktrace.h
> @@ -14,13 +14,14 @@
>  #include <asm/switch_to.h>
>  
>  enum stack_type {
> -	STACK_TYPE_UNKNOWN,
> +	STACK_TYPE_UNKNOWN = 0,
>  	STACK_TYPE_TASK,
>  	STACK_TYPE_IRQ,
>  	STACK_TYPE_SOFTIRQ,
>  	STACK_TYPE_ENTRY,
>  	STACK_TYPE_EXCEPTION,
>  	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
> +	STACK_TYPE_GUARD = 0x80,
>  };
>  
>  struct stack_info {
> @@ -31,6 +32,15 @@ struct stack_info {
>  bool in_task_stack(unsigned long *stack, struct task_struct *task,
>  		   struct stack_info *info);
>  
> +static __always_inline bool in_stack_guard(void *addr, void *begin, void *end)
> +{
> +#ifdef CONFIG_VMAP_STACK
> +	if (addr > (begin - PAGE_SIZE))
> +		return true;
> +#endif
> +	return false;
> +}
> +
>  bool in_entry_stack(unsigned long *stack, struct stack_info *info);
>  
>  int get_stack_info(unsigned long *stack, struct task_struct *task,
> diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
> index ea4fe192189d..91b406fe2a39 100644
> --- a/arch/x86/kernel/dumpstack.c
> +++ b/arch/x86/kernel/dumpstack.c
> @@ -32,12 +32,19 @@ static struct pt_regs exec_summary_regs;
>  bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
>  			   struct stack_info *info)
>  {
> -	unsigned long *begin = task_stack_page(task);
> -	unsigned long *end   = task_stack_page(task) + THREAD_SIZE;
> -
> -	if (stack < begin || stack >= end)
> +	void *begin = task_stack_page(task);
> +	void *end   = begin + THREAD_SIZE;
> +	int type    = STACK_TYPE_TASK;
> +
> +	if ((void *)stack < begin || (void *)stack >= end) {
> +		if (in_stack_guard(stack, begin, end)) {
> +			type |= STACK_TYPE_GUARD;
> +			goto fill_info;
> +		}
>  		return false;
> +	}
>  
> +fill_info:
>  	info->type	= STACK_TYPE_TASK;
>  	info->begin	= begin;
>  	info->end	= end;
> @@ -50,14 +57,20 @@ bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
>  bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)
>  {
>  	struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
> -
> +	int type = STACK_TYPE_ENTRY;
>  	void *begin = ss;
>  	void *end = ss + 1;
>  
> -	if ((void *)stack < begin || (void *)stack >= end)
> +	if ((void *)stack < begin || (void *)stack >= end) {
> +		if (in_stack_guard(stack, begin, end)) {
> +			type |= STACK_TYPE_GUARD;
> +			goto fill_info;
> +		}
>  		return false;
> +	}
>  
> -	info->type	= STACK_TYPE_ENTRY;
> +fill_info:
> +	info->type	= type;
>  	info->begin	= begin;
>  	info->end	= end;
>  	info->next_sp	= NULL;
> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
> index 5601b95944fa..3634bdf9ab36 100644
> --- a/arch/x86/kernel/dumpstack_64.c
> +++ b/arch/x86/kernel/dumpstack_64.c
> @@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type)
>  {
>  	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
>  
> +	if (type == STACK_TYPE_TASK)
> +		return "TASK";
> +
>  	if (type == STACK_TYPE_IRQ)
>  		return "IRQ";
>  
> +	if (type == STACK_TYPE_SOFTIRQ)
> +		return "SOFTIRQ";
> +
>  	if (type == STACK_TYPE_ENTRY) {
>  		/*
>  		 * On 64-bit, we have a generic entry stack that we
> @@ -63,6 +69,11 @@ struct estack_pages {
>  };
>  
>  #define EPAGERANGE(st)							\
> +	[PFN_DOWN(CEA_EGUARD_OFFS(st))] = {				\
> +		.offs	= CEA_EGUARD_OFFS(st),				\
> +		.size	= PAGE_SIZE,					\
> +		.type	= STACK_TYPE_GUARD +				\
> +			  STACK_TYPE_EXCEPTION + ESTACK_ ##st, },	\
>  	[PFN_DOWN(CEA_ESTACK_OFFS(st)) ...				\
>  	 PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = {	\
>  		.offs	= CEA_ESTACK_OFFS(st),				\
> @@ -111,7 +122,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
>  	k = (stk - begin) >> PAGE_SHIFT;
>  	/* Lookup the page descriptor */
>  	ep = &estack_pages[k];
> -	/* Guard page? */
> +	/* unknown entry */
>  	if (!ep->size)
>  		return false;
>  
> @@ -122,7 +133,12 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
>  	info->type	= ep->type;
>  	info->begin	= (unsigned long *)begin;
>  	info->end	= (unsigned long *)end;
> -	info->next_sp	= (unsigned long *)regs->sp;
> +	info->next_sp	= NULL;
> +
> +	/* Can't read regs from a guard page. */
> +	if (!(ep->type & STACK_TYPE_GUARD))
> +		info->next_sp = (unsigned long *)regs->sp;
> +
>  	return true;
>  }
>  
> @@ -130,6 +146,7 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info
>  {
>  	unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
>  	unsigned long *begin;
> +	int type = STACK_TYPE_IRQ;
>  
>  	/*
>  	 * @end points directly to the top most stack entry to avoid a -8
> @@ -144,19 +161,27 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info
>  	 * final operation is 'popq %rsp' which means after that RSP points
>  	 * to the original stack and not to @end.
>  	 */
> -	if (stack < begin || stack >= end)
> +	if (stack < begin || stack >= end) {
> +		if (in_stack_guard(stack, begin, end)) {
> +			type |= STACK_TYPE_GUARD;
> +			goto fill_info;
> +		}
>  		return false;
> +	}
>  
> -	info->type	= STACK_TYPE_IRQ;
> +fill_info:
> +	info->type	= type;
>  	info->begin	= begin;
>  	info->end	= end;
> +	info->next_sp	= NULL;
>  
>  	/*
>  	 * The next stack pointer is stored at the top of the irq stack
>  	 * before switching to the irq stack. Actual stack entries are all
>  	 * below that.
>  	 */
> -	info->next_sp = (unsigned long *)*(end - 1);
> +	if (!(type & STACK_TYPE_GUARD))
> +		info->next_sp = (unsigned long *)*(end - 1);
>  
>  	return true;
>  }
> @@ -193,6 +218,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
>  	if (!get_stack_info_noinstr(stack, task, info))
>  		goto unknown;
>  
> +	if (info->type & STACK_TYPE_GUARD)
> +		goto unknown;
> +
>  	/*
>  	 * Make sure we don't iterate through any given stack more than once.
>  	 * If it comes up a second time then there's something wrong going on:
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index a58800973aed..80f6d8d735eb 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -353,6 +353,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  
>  #ifdef CONFIG_VMAP_STACK
>  	unsigned long address = read_cr2();
> +	struct stack_info info;
>  #endif
>  
>  #ifdef CONFIG_X86_ESPFIX64
> @@ -455,9 +456,11 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  	 * stack even if the actual trigger for the double fault was
>  	 * something else.
>  	 */
> -	if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
> -		handle_stack_overflow("kernel stack overflow (double-fault)",
> -				      regs, address);
> +	if (get_stack_info_noinstr((void *)address, current, &info) &&
> +	    info.type & STACK_TYPE_GUARD) {
> +		const char *name = stack_type_name(info.type & ~STACK_TYPE_GUARD);
> +		pr_emerg("BUG: %s stack guard hit at %p (stack is %p..%p)\n",
> +			 name, (void *)address, info.begin, info.end);
>  	}
>  #endif
>  
> @@ -708,7 +711,9 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
>  	sp    = regs->sp;
>  	stack = (unsigned long *)sp;
>  
> -	if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
> +	if (!get_stack_info_noinstr(stack, current, &info) ||
> +	    info.type & STACK_TYPE_GUARD ||
> +	    info.type == STACK_TYPE_ENTRY ||
>  	    info.type >= STACK_TYPE_EXCEPTION_LAST)
>  		sp = __this_cpu_ist_top_va(VC2);
>  
>
王贇 Sept. 17, 2021, 3:02 a.m. UTC | #7
On 2021/9/16 下午6:02, Peter Zijlstra wrote:
[snip]
>  
> +static __always_inline bool in_stack_guard(void *addr, void *begin, void *end)
> +{
> +#ifdef CONFIG_VMAP_STACK
> +	if (addr > (begin - PAGE_SIZE))
> +		return true;

After fix this logical as:

  addr >= (begin - PAGE_SIZE) && addr < begin

it's working.

Regards,
Michael Wang

> +#endif
> +	return false;
> +}
[snip]
>  
>  #ifdef CONFIG_X86_ESPFIX64
> @@ -455,9 +456,11 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  	 * stack even if the actual trigger for the double fault was
>  	 * something else.
>  	 */
> -	if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
> -		handle_stack_overflow("kernel stack overflow (double-fault)",
> -				      regs, address);
> +	if (get_stack_info_noinstr((void *)address, current, &info) &&
> +	    info.type & STACK_TYPE_GUARD) {
> +		const char *name = stack_type_name(info.type & ~STACK_TYPE_GUARD);
> +		pr_emerg("BUG: %s stack guard hit at %p (stack is %p..%p)\n",
> +			 name, (void *)address, info.begin, info.end);
>  	}
>  #endif
>  
> @@ -708,7 +711,9 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
>  	sp    = regs->sp;
>  	stack = (unsigned long *)sp;
>  
> -	if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
> +	if (!get_stack_info_noinstr(stack, current, &info) ||
> +	    info.type & STACK_TYPE_GUARD ||
> +	    info.type == STACK_TYPE_ENTRY ||
>  	    info.type >= STACK_TYPE_EXCEPTION_LAST)
>  		sp = __this_cpu_ist_top_va(VC2);
>  
>
Peter Zijlstra Sept. 17, 2021, 10:21 a.m. UTC | #8
On Fri, Sep 17, 2021 at 11:02:07AM +0800, 王贇 wrote:
> 
> 
> On 2021/9/16 下午6:02, Peter Zijlstra wrote:
> [snip]
> >  
> > +static __always_inline bool in_stack_guard(void *addr, void *begin, void *end)
> > +{
> > +#ifdef CONFIG_VMAP_STACK
> > +	if (addr > (begin - PAGE_SIZE))
> > +		return true;
> 
> After fix this logical as:
> 
>   addr >= (begin - PAGE_SIZE) && addr < begin
> 
> it's working.

Shees, I seem to have a knack for getting it wrong, don't I. Thanks!

Anyway, I'll ammend the commit locally, but I'd really like some
feedback from Andy, who wrote all that VIRT_STACK stuff in the first
place.
Peter Zijlstra Sept. 17, 2021, 4:40 p.m. UTC | #9
On Fri, Sep 17, 2021 at 12:21:24PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 17, 2021 at 11:02:07AM +0800, 王贇 wrote:
> > 
> > 
> > On 2021/9/16 下午6:02, Peter Zijlstra wrote:
> > [snip]
> > >  
> > > +static __always_inline bool in_stack_guard(void *addr, void *begin, void *end)
> > > +{
> > > +#ifdef CONFIG_VMAP_STACK
> > > +	if (addr > (begin - PAGE_SIZE))
> > > +		return true;
> > 
> > After fix this logical as:
> > 
> >   addr >= (begin - PAGE_SIZE) && addr < begin
> > 
> > it's working.
> 
> Shees, I seem to have a knack for getting it wrong, don't I. Thanks!
> 
> Anyway, I'll ammend the commit locally, but I'd really like some
> feedback from Andy, who wrote all that VIRT_STACK stuff in the first
> place.

Andy suggested something like this.

---
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 562854c60808..9a2e37a7304d 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -77,11 +77,11 @@
  *     Function calls can clobber anything except the callee-saved
  *     registers. Tell the compiler.
  */
-#define call_on_irqstack(func, asm_call, argconstr...)			\
+#define call_on_stack(stack, func, asm_call, argconstr...)		\
 {									\
 	register void *tos asm("r11");					\
 									\
-	tos = ((void *)__this_cpu_read(hardirq_stack_ptr));		\
+	tos = ((void *)(stack));					\
 									\
 	asm_inline volatile(						\
 	"movq	%%rsp, (%[tos])				\n"		\
@@ -98,6 +98,25 @@
 	);								\
 }
 
+#define ASM_CALL_ARG0							\
+	"call %P[__func]				\n"
+
+#define ASM_CALL_ARG1							\
+	"movq	%[arg1], %%rdi				\n"		\
+	ASM_CALL_ARG0
+
+#define ASM_CALL_ARG2							\
+	"movq	%[arg2], %%rsi				\n"		\
+	ASM_CALL_ARG1
+
+#define ASM_CALL_ARG3							\
+	"movq	%[arg3], %%rdx				\n"		\
+	ASM_CALL_ARG2
+
+#define call_on_irqstack(func, asm_call, argconstr...)			\
+	call_on_stack(__this_cpu_read(hardirq_stack_ptr),		\
+		      func, asm_call, argconstr)
+
 /* Macros to assert type correctness for run_*_on_irqstack macros */
 #define assert_function_type(func, proto)				\
 	static_assert(__builtin_types_compatible_p(typeof(&func), proto))
@@ -147,8 +166,7 @@
  */
 #define ASM_CALL_SYSVEC							\
 	"call irq_enter_rcu				\n"		\
-	"movq	%[arg1], %%rdi				\n"		\
-	"call %P[__func]				\n"		\
+	ASM_CALL_ARG1							\
 	"call irq_exit_rcu				\n"
 
 #define SYSVEC_CONSTRAINTS	, [arg1] "r" (regs)
@@ -168,12 +186,10 @@
  */
 #define ASM_CALL_IRQ							\
 	"call irq_enter_rcu				\n"		\
-	"movq	%[arg1], %%rdi				\n"		\
-	"movl	%[arg2], %%esi				\n"		\
-	"call %P[__func]				\n"		\
+	ASM_CALL_ARG2							\
 	"call irq_exit_rcu				\n"
 
-#define IRQ_CONSTRAINTS	, [arg1] "r" (regs), [arg2] "r" (vector)
+#define IRQ_CONSTRAINTS	, [arg1] "r" (regs), [arg2] "r" ((long)vector)
 
 #define run_irq_on_irqstack_cond(func, regs, vector)			\
 {									\
@@ -185,9 +201,6 @@
 			      IRQ_CONSTRAINTS, regs, vector);		\
 }
 
-#define ASM_CALL_SOFTIRQ						\
-	"call %P[__func]				\n"
-
 /*
  * Macro to invoke __do_softirq on the irq stack. This is only called from
  * task context when bottom halves are about to be reenabled and soft
@@ -197,7 +210,7 @@
 #define do_softirq_own_stack()						\
 {									\
 	__this_cpu_write(hardirq_stack_inuse, true);			\
-	call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ);		\
+	call_on_irqstack(__do_softirq, ASM_CALL_ARG0);			\
 	__this_cpu_write(hardirq_stack_inuse, false);			\
 }
 
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f248eb2ac2d4..17a52793f6c3 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -38,6 +38,16 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
 			    struct stack_info *info);
 
+static __always_inline
+bool get_stack_guard_info(unsigned long *stack, struct stack_info *info)
+{
+	/* make sure it's not in the stack proper */
+	if (get_stack_info_noinstr(stack, current, info))
+		return false;
+	/* but if it is in the page below it, we hit a guard */
+	return get_stack_info_noinstr((void *)stack + PAGE_SIZE-1, current, info);
+}
+
 const char *stack_type_name(enum stack_type type);
 
 static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 7f7200021bd1..6221be7cafc3 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -40,9 +40,9 @@ void math_emulate(struct math_emu_info *);
 bool fault_in_kernel_space(unsigned long address);
 
 #ifdef CONFIG_VMAP_STACK
-void __noreturn handle_stack_overflow(const char *message,
-				      struct pt_regs *regs,
-				      unsigned long fault_address);
+void __noreturn handle_stack_overflow(struct pt_regs *regs,
+				      unsigned long fault_address,
+				      struct stack_info *info);
 #endif
 
 #endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5601b95944fa..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type)
 {
 	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
 
+	if (type == STACK_TYPE_TASK)
+		return "TASK";
+
 	if (type == STACK_TYPE_IRQ)
 		return "IRQ";
 
+	if (type == STACK_TYPE_SOFTIRQ)
+		return "SOFTIRQ";
+
 	if (type == STACK_TYPE_ENTRY) {
 		/*
 		 * On 64-bit, we have a generic entry stack that we
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a58800973aed..77857d41289d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -313,17 +313,19 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
 }
 
 #ifdef CONFIG_VMAP_STACK
-__visible void __noreturn handle_stack_overflow(const char *message,
-						struct pt_regs *regs,
-						unsigned long fault_address)
+__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
+						unsigned long fault_address,
+						struct stack_info *info)
 {
-	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
-		 (void *)fault_address, current->stack,
-		 (char *)current->stack + THREAD_SIZE - 1);
-	die(message, regs, 0);
+	const char *name = stack_type_name(info->type);
+
+	printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
+	       name, (void *)fault_address, info->begin, info->end);
+
+	die("stack guard page", regs, 0);
 
 	/* Be absolutely certain we don't return. */
-	panic("%s", message);
+	panic("%s stack guard hit", name);
 }
 #endif
 
@@ -353,6 +355,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
 
 #ifdef CONFIG_VMAP_STACK
 	unsigned long address = read_cr2();
+	struct stack_info info;
 #endif
 
 #ifdef CONFIG_X86_ESPFIX64
@@ -455,10 +458,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
 	 * stack even if the actual trigger for the double fault was
 	 * something else.
 	 */
-	if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
-		handle_stack_overflow("kernel stack overflow (double-fault)",
-				      regs, address);
-	}
+	if (get_stack_guard_info((void *)address, &info))
+		handle_stack_overflow(regs, address, &info);
 #endif
 
 	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b2eefdefc108..edb5152f0866 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -32,6 +32,7 @@
 #include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
 #include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
 #include <asm/vdso.h>			/* fixup_vdso_exception()	*/
+#include <asm/irq_stack.h>
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -631,6 +632,9 @@ static noinline void
 page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		unsigned long address)
 {
+#ifdef CONFIG_VMAP_STACK
+	struct stack_info info;
+#endif
 	unsigned long flags;
 	int sig;
 
@@ -649,9 +653,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	 * that we're in vmalloc space to avoid this.
 	 */
 	if (is_vmalloc_addr((void *)address) &&
-	    (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
-	     address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
-		unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
+	    get_stack_guard_info((void *)address, &info)) {
 		/*
 		 * We're likely to be running with very little stack space
 		 * left.  It's plausible that we'd hit this condition but
@@ -662,13 +664,11 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		 * and then double-fault, though, because we're likely to
 		 * break the console driver and lose most of the stack dump.
 		 */
-		asm volatile ("movq %[stack], %%rsp\n\t"
-			      "call handle_stack_overflow\n\t"
-			      "1: jmp 1b"
-			      : ASM_CALL_CONSTRAINT
-			      : "D" ("kernel stack overflow (page fault)"),
-				"S" (regs), "d" (address),
-				[stack] "rm" (stack));
+		call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
+			      handle_stack_overflow,
+			      ASM_CALL_ARG3,
+			      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
+
 		unreachable();
 	}
 #endif
王贇 Sept. 18, 2021, 2:30 a.m. UTC | #10
On 2021/9/18 上午12:40, Peter Zijlstra wrote:
> On Fri, Sep 17, 2021 at 12:21:24PM +0200, Peter Zijlstra wrote:
>> On Fri, Sep 17, 2021 at 11:02:07AM +0800, 王贇 wrote:
>>>
>>>
>>> On 2021/9/16 下午6:02, Peter Zijlstra wrote:
>>> [snip]
>>>>  
>>>> +static __always_inline bool in_stack_guard(void *addr, void *begin, void *end)
>>>> +{
>>>> +#ifdef CONFIG_VMAP_STACK
>>>> +	if (addr > (begin - PAGE_SIZE))
>>>> +		return true;
>>>
>>> After fix this logical as:
>>>
>>>   addr >= (begin - PAGE_SIZE) && addr < begin
>>>
>>> it's working.
>>
>> Shees, I seem to have a knack for getting it wrong, don't I. Thanks!
>>
>> Anyway, I'll ammend the commit locally, but I'd really like some
>> feedback from Andy, who wrote all that VIRT_STACK stuff in the first
>> place.
> 
> Andy suggested something like this.

Now it seem like working well :-)

[  193.100475][    C0] BUG: NMI stack guard page was hit at 0000000085fd977b (stack is 000000003a55b09e..00000000d8cce1a5)
[  193.100493][    C0] stack guard page: 0000 [#1] SMP PTI
[  193.100499][    C0] CPU: 0 PID: 968 Comm: a.out Not tainted 5.14.0-next-20210913+ #548
[  193.100506][    C0] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  193.100510][    C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
[  193.100523][    C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e
[  193.100529][    C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046
[  193.100535][    C0] RAX: 0000000080120006 RBX: fffffe000000b050 RCX: 0000000000000000
[  193.100540][    C0] RDX: ffff88810de82180 RSI: ffffffff81269031 RDI: 000000000000001c
[  193.100544][    C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000
[  193.100548][    C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  193.100551][    C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000009
[  193.100556][    C0] FS:  00007fa18c42d740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000
[  193.100562][    C0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  193.100566][    C0] CR2: fffffe000000aff8 CR3: 00000001160ac005 CR4: 00000000003606f0
[  193.100570][    C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  193.100574][    C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  193.100578][    C0] Call Trace:
[  193.100581][    C0]  <NMI>
[  193.100584][    C0]  perf_trace_buf_alloc+0x26/0xd0
[  193.100597][    C0]  ? is_prefetch.isra.25+0x260/0x260
[  193.100605][    C0]  ? __bad_area_nosemaphore+0x1b8/0x280
[  193.100611][    C0]  perf_ftrace_function_call+0x18f/0x2e0


Tested-by: Michael Wang <yun.wang@linux.alibaba.com>

BTW, would you like to apply the other patch which increasing exception
stack size after this one?

Regards,
Michael Wang


> 
> ---
> diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
> index 562854c60808..9a2e37a7304d 100644
> --- a/arch/x86/include/asm/irq_stack.h
> +++ b/arch/x86/include/asm/irq_stack.h
> @@ -77,11 +77,11 @@
>   *     Function calls can clobber anything except the callee-saved
>   *     registers. Tell the compiler.
>   */
> -#define call_on_irqstack(func, asm_call, argconstr...)			\
> +#define call_on_stack(stack, func, asm_call, argconstr...)		\
>  {									\
>  	register void *tos asm("r11");					\
>  									\
> -	tos = ((void *)__this_cpu_read(hardirq_stack_ptr));		\
> +	tos = ((void *)(stack));					\
>  									\
>  	asm_inline volatile(						\
>  	"movq	%%rsp, (%[tos])				\n"		\
> @@ -98,6 +98,25 @@
>  	);								\
>  }
>  
> +#define ASM_CALL_ARG0							\
> +	"call %P[__func]				\n"
> +
> +#define ASM_CALL_ARG1							\
> +	"movq	%[arg1], %%rdi				\n"		\
> +	ASM_CALL_ARG0
> +
> +#define ASM_CALL_ARG2							\
> +	"movq	%[arg2], %%rsi				\n"		\
> +	ASM_CALL_ARG1
> +
> +#define ASM_CALL_ARG3							\
> +	"movq	%[arg3], %%rdx				\n"		\
> +	ASM_CALL_ARG2
> +
> +#define call_on_irqstack(func, asm_call, argconstr...)			\
> +	call_on_stack(__this_cpu_read(hardirq_stack_ptr),		\
> +		      func, asm_call, argconstr)
> +
>  /* Macros to assert type correctness for run_*_on_irqstack macros */
>  #define assert_function_type(func, proto)				\
>  	static_assert(__builtin_types_compatible_p(typeof(&func), proto))
> @@ -147,8 +166,7 @@
>   */
>  #define ASM_CALL_SYSVEC							\
>  	"call irq_enter_rcu				\n"		\
> -	"movq	%[arg1], %%rdi				\n"		\
> -	"call %P[__func]				\n"		\
> +	ASM_CALL_ARG1							\
>  	"call irq_exit_rcu				\n"
>  
>  #define SYSVEC_CONSTRAINTS	, [arg1] "r" (regs)
> @@ -168,12 +186,10 @@
>   */
>  #define ASM_CALL_IRQ							\
>  	"call irq_enter_rcu				\n"		\
> -	"movq	%[arg1], %%rdi				\n"		\
> -	"movl	%[arg2], %%esi				\n"		\
> -	"call %P[__func]				\n"		\
> +	ASM_CALL_ARG2							\
>  	"call irq_exit_rcu				\n"
>  
> -#define IRQ_CONSTRAINTS	, [arg1] "r" (regs), [arg2] "r" (vector)
> +#define IRQ_CONSTRAINTS	, [arg1] "r" (regs), [arg2] "r" ((long)vector)
>  
>  #define run_irq_on_irqstack_cond(func, regs, vector)			\
>  {									\
> @@ -185,9 +201,6 @@
>  			      IRQ_CONSTRAINTS, regs, vector);		\
>  }
>  
> -#define ASM_CALL_SOFTIRQ						\
> -	"call %P[__func]				\n"
> -
>  /*
>   * Macro to invoke __do_softirq on the irq stack. This is only called from
>   * task context when bottom halves are about to be reenabled and soft
> @@ -197,7 +210,7 @@
>  #define do_softirq_own_stack()						\
>  {									\
>  	__this_cpu_write(hardirq_stack_inuse, true);			\
> -	call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ);		\
> +	call_on_irqstack(__do_softirq, ASM_CALL_ARG0);			\
>  	__this_cpu_write(hardirq_stack_inuse, false);			\
>  }
>  
> diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
> index f248eb2ac2d4..17a52793f6c3 100644
> --- a/arch/x86/include/asm/stacktrace.h
> +++ b/arch/x86/include/asm/stacktrace.h
> @@ -38,6 +38,16 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
>  bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
>  			    struct stack_info *info);
>  
> +static __always_inline
> +bool get_stack_guard_info(unsigned long *stack, struct stack_info *info)
> +{
> +	/* make sure it's not in the stack proper */
> +	if (get_stack_info_noinstr(stack, current, info))
> +		return false;
> +	/* but if it is in the page below it, we hit a guard */
> +	return get_stack_info_noinstr((void *)stack + PAGE_SIZE-1, current, info);
> +}
> +
>  const char *stack_type_name(enum stack_type type);
>  
>  static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 7f7200021bd1..6221be7cafc3 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -40,9 +40,9 @@ void math_emulate(struct math_emu_info *);
>  bool fault_in_kernel_space(unsigned long address);
>  
>  #ifdef CONFIG_VMAP_STACK
> -void __noreturn handle_stack_overflow(const char *message,
> -				      struct pt_regs *regs,
> -				      unsigned long fault_address);
> +void __noreturn handle_stack_overflow(struct pt_regs *regs,
> +				      unsigned long fault_address,
> +				      struct stack_info *info);
>  #endif
>  
>  #endif /* _ASM_X86_TRAPS_H */
> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
> index 5601b95944fa..6c5defd6569a 100644
> --- a/arch/x86/kernel/dumpstack_64.c
> +++ b/arch/x86/kernel/dumpstack_64.c
> @@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type)
>  {
>  	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
>  
> +	if (type == STACK_TYPE_TASK)
> +		return "TASK";
> +
>  	if (type == STACK_TYPE_IRQ)
>  		return "IRQ";
>  
> +	if (type == STACK_TYPE_SOFTIRQ)
> +		return "SOFTIRQ";
> +
>  	if (type == STACK_TYPE_ENTRY) {
>  		/*
>  		 * On 64-bit, we have a generic entry stack that we
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index a58800973aed..77857d41289d 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -313,17 +313,19 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
>  }
>  
>  #ifdef CONFIG_VMAP_STACK
> -__visible void __noreturn handle_stack_overflow(const char *message,
> -						struct pt_regs *regs,
> -						unsigned long fault_address)
> +__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
> +						unsigned long fault_address,
> +						struct stack_info *info)
>  {
> -	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
> -		 (void *)fault_address, current->stack,
> -		 (char *)current->stack + THREAD_SIZE - 1);
> -	die(message, regs, 0);
> +	const char *name = stack_type_name(info->type);
> +
> +	printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
> +	       name, (void *)fault_address, info->begin, info->end);
> +
> +	die("stack guard page", regs, 0);
>  
>  	/* Be absolutely certain we don't return. */
> -	panic("%s", message);
> +	panic("%s stack guard hit", name);
>  }
>  #endif
>  
> @@ -353,6 +355,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  
>  #ifdef CONFIG_VMAP_STACK
>  	unsigned long address = read_cr2();
> +	struct stack_info info;
>  #endif
>  
>  #ifdef CONFIG_X86_ESPFIX64
> @@ -455,10 +458,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  	 * stack even if the actual trigger for the double fault was
>  	 * something else.
>  	 */
> -	if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
> -		handle_stack_overflow("kernel stack overflow (double-fault)",
> -				      regs, address);
> -	}
> +	if (get_stack_guard_info((void *)address, &info))
> +		handle_stack_overflow(regs, address, &info);
>  #endif
>  
>  	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index b2eefdefc108..edb5152f0866 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -32,6 +32,7 @@
>  #include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
>  #include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
>  #include <asm/vdso.h>			/* fixup_vdso_exception()	*/
> +#include <asm/irq_stack.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <asm/trace/exceptions.h>
> @@ -631,6 +632,9 @@ static noinline void
>  page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  		unsigned long address)
>  {
> +#ifdef CONFIG_VMAP_STACK
> +	struct stack_info info;
> +#endif
>  	unsigned long flags;
>  	int sig;
>  
> @@ -649,9 +653,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  	 * that we're in vmalloc space to avoid this.
>  	 */
>  	if (is_vmalloc_addr((void *)address) &&
> -	    (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
> -	     address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
> -		unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
> +	    get_stack_guard_info((void *)address, &info)) {
>  		/*
>  		 * We're likely to be running with very little stack space
>  		 * left.  It's plausible that we'd hit this condition but
> @@ -662,13 +664,11 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  		 * and then double-fault, though, because we're likely to
>  		 * break the console driver and lose most of the stack dump.
>  		 */
> -		asm volatile ("movq %[stack], %%rsp\n\t"
> -			      "call handle_stack_overflow\n\t"
> -			      "1: jmp 1b"
> -			      : ASM_CALL_CONSTRAINT
> -			      : "D" ("kernel stack overflow (page fault)"),
> -				"S" (regs), "d" (address),
> -				[stack] "rm" (stack));
> +		call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
> +			      handle_stack_overflow,
> +			      ASM_CALL_ARG3,
> +			      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
> +
>  		unreachable();
>  	}
>  #endif
>
王贇 Sept. 18, 2021, 2:38 a.m. UTC | #11
On 2021/9/18 上午12:40, Peter Zijlstra wrote:
[snip]
> -	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
> -		 (void *)fault_address, current->stack,
> -		 (char *)current->stack + THREAD_SIZE - 1);
> -	die(message, regs, 0);
> +	const char *name = stack_type_name(info->type);
> +
> +	printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
> +	       name, (void *)fault_address, info->begin, info->end);

Just found that the printed pointer address is not correct:
  BUG: NMI stack guard page was hit at 0000000085fd977b (stack is 000000003a55b09e..00000000d8cce1a5)

Maybe we could use %px instead?

Regards,
Michael Wang

> +
> +	die("stack guard page", regs, 0);
>  
>  	/* Be absolutely certain we don't return. */
> -	panic("%s", message);
> +	panic("%s stack guard hit", name);
>  }
>  #endif
>  
> @@ -353,6 +355,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  
>  #ifdef CONFIG_VMAP_STACK
>  	unsigned long address = read_cr2();
> +	struct stack_info info;
>  #endif
>  
>  #ifdef CONFIG_X86_ESPFIX64
> @@ -455,10 +458,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
>  	 * stack even if the actual trigger for the double fault was
>  	 * something else.
>  	 */
> -	if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
> -		handle_stack_overflow("kernel stack overflow (double-fault)",
> -				      regs, address);
> -	}
> +	if (get_stack_guard_info((void *)address, &info))
> +		handle_stack_overflow(regs, address, &info);
>  #endif
>  
>  	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index b2eefdefc108..edb5152f0866 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -32,6 +32,7 @@
>  #include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
>  #include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
>  #include <asm/vdso.h>			/* fixup_vdso_exception()	*/
> +#include <asm/irq_stack.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <asm/trace/exceptions.h>
> @@ -631,6 +632,9 @@ static noinline void
>  page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  		unsigned long address)
>  {
> +#ifdef CONFIG_VMAP_STACK
> +	struct stack_info info;
> +#endif
>  	unsigned long flags;
>  	int sig;
>  
> @@ -649,9 +653,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  	 * that we're in vmalloc space to avoid this.
>  	 */
>  	if (is_vmalloc_addr((void *)address) &&
> -	    (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
> -	     address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
> -		unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
> +	    get_stack_guard_info((void *)address, &info)) {
>  		/*
>  		 * We're likely to be running with very little stack space
>  		 * left.  It's plausible that we'd hit this condition but
> @@ -662,13 +664,11 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  		 * and then double-fault, though, because we're likely to
>  		 * break the console driver and lose most of the stack dump.
>  		 */
> -		asm volatile ("movq %[stack], %%rsp\n\t"
> -			      "call handle_stack_overflow\n\t"
> -			      "1: jmp 1b"
> -			      : ASM_CALL_CONSTRAINT
> -			      : "D" ("kernel stack overflow (page fault)"),
> -				"S" (regs), "d" (address),
> -				[stack] "rm" (stack));
> +		call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
> +			      handle_stack_overflow,
> +			      ASM_CALL_ARG3,
> +			      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
> +
>  		unreachable();
>  	}
>  #endif
>
Peter Zijlstra Sept. 18, 2021, 6:56 a.m. UTC | #12
On Sat, Sep 18, 2021 at 10:30:42AM +0800, 王贇 wrote:
> > Andy suggested something like this.
> 
> Now it seem like working well :-)

Thanks for sticking with it and testing all that over and over!

> [  193.100475][    C0] BUG: NMI stack guard page was hit at 0000000085fd977b (stack is 000000003a55b09e..00000000d8cce1a5)
> [  193.100493][    C0] stack guard page: 0000 [#1] SMP PTI
> [  193.100499][    C0] CPU: 0 PID: 968 Comm: a.out Not tainted 5.14.0-next-20210913+ #548
> [  193.100506][    C0] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> [  193.100510][    C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70
> [  193.100523][    C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e
> [  193.100529][    C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046
> [  193.100535][    C0] RAX: 0000000080120006 RBX: fffffe000000b050 RCX: 0000000000000000
> [  193.100540][    C0] RDX: ffff88810de82180 RSI: ffffffff81269031 RDI: 000000000000001c
> [  193.100544][    C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000
> [  193.100548][    C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> [  193.100551][    C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000009
> [  193.100556][    C0] FS:  00007fa18c42d740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000
> [  193.100562][    C0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  193.100566][    C0] CR2: fffffe000000aff8 CR3: 00000001160ac005 CR4: 00000000003606f0
> [  193.100570][    C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  193.100574][    C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  193.100578][    C0] Call Trace:
> [  193.100581][    C0]  <NMI>
> [  193.100584][    C0]  perf_trace_buf_alloc+0x26/0xd0
> [  193.100597][    C0]  ? is_prefetch.isra.25+0x260/0x260
> [  193.100605][    C0]  ? __bad_area_nosemaphore+0x1b8/0x280
> [  193.100611][    C0]  perf_ftrace_function_call+0x18f/0x2e0
> 
> 
> Tested-by: Michael Wang <yun.wang@linux.alibaba.com>
> 
> BTW, would you like to apply the other patch which increasing exception
> stack size after this one?

Yes, I have that queued behind it :-)
diff mbox series

Patch

--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -61,6 +61,9 @@  enum exception_stack_ordering {
 #define CEA_ESTACK_OFFS(st)					\
 	offsetof(struct cea_exception_stacks, st## _stack)
 
+#define CEA_EGUARD_OFFS(st)					\
+	offsetof(struct cea_exception_stacks, st## _stack_guard)
+
 #define CEA_ESTACK_PAGES					\
 	(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
 
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -14,13 +14,14 @@ 
 #include <asm/switch_to.h>
 
 enum stack_type {
-	STACK_TYPE_UNKNOWN,
+	STACK_TYPE_UNKNOWN = 0,
 	STACK_TYPE_TASK,
 	STACK_TYPE_IRQ,
 	STACK_TYPE_SOFTIRQ,
 	STACK_TYPE_ENTRY,
 	STACK_TYPE_EXCEPTION,
 	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
+	STACK_TYPE_GUARD = 0x80,
 };
 
 struct stack_info {
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,9 +32,15 @@  const char *stack_type_name(enum stack_t
 {
 	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
 
+	if (type == STACK_TYPE_TASK)
+		return "TASK";
+
 	if (type == STACK_TYPE_IRQ)
 		return "IRQ";
 
+	if (type == STACK_TYPE_SOFTIRQ)
+		return "SOFTIRQ";
+
 	if (type == STACK_TYPE_ENTRY) {
 		/*
 		 * On 64-bit, we have a generic entry stack that we
@@ -63,6 +69,11 @@  struct estack_pages {
 };
 
 #define EPAGERANGE(st)							\
+	[PFN_DOWN(CEA_EGUARD_OFFS(st))] = {				\
+		.offs	= CEA_EGUARD_OFFS(st),				\
+		.size	= PAGE_SIZE,					\
+		.type	= STACK_TYPE_GUARD +				\
+			  STACK_TYPE_EXCEPTION + ESTACK_ ##st, },	\
 	[PFN_DOWN(CEA_ESTACK_OFFS(st)) ...				\
 	 PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = {	\
 		.offs	= CEA_ESTACK_OFFS(st),				\
@@ -111,10 +122,11 @@  static __always_inline bool in_exception
 	k = (stk - begin) >> PAGE_SHIFT;
 	/* Lookup the page descriptor */
 	ep = &estack_pages[k];
-	/* Guard page? */
+	/* unknown entry */
 	if (!ep->size)
 		return false;
 
+
 	begin += (unsigned long)ep->offs;
 	end = begin + (unsigned long)ep->size;
 	regs = (struct pt_regs *)end - 1;
@@ -193,6 +205,9 @@  int get_stack_info(unsigned long *stack,
 	if (!get_stack_info_noinstr(stack, task, info))
 		goto unknown;
 
+	if (info->type & STACK_TYPE_GUARD)
+		goto unknown;
+
 	/*
 	 * Make sure we don't iterate through any given stack more than once.
 	 * If it comes up a second time then there's something wrong going on:
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -461,6 +461,19 @@  DEFINE_IDTENTRY_DF(exc_double_fault)
 	}
 #endif
 
+#ifdef CONFIG_X86_64
+	{
+		struct stack_info info;
+
+		if (get_stack_info_noinstr((void *)address, current, &info) &&
+		    info.type & STACK_TYPE_GUARD) {
+			const char *name = stack_type_name(info.type & ~STACK_TYPE_GUARD);
+			pr_emerg("BUG: %s stack guard hit at %p (stack is %p..%p)\n",
+				 name, (void *)address, info.begin, info.end);
+		}
+	}
+#endif
+
 	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
 	die("double fault", regs, error_code);
 	panic("Machine halted.");
@@ -708,7 +721,9 @@  asmlinkage __visible noinstr struct pt_r
 	sp    = regs->sp;
 	stack = (unsigned long *)sp;
 
-	if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
+	if (!get_stack_info_noinstr(stack, current, &info) ||
+	    info.type & STACK_TYPE_GUARD ||
+	    info.type == STACK_TYPE_ENTRY ||
 	    info.type >= STACK_TYPE_EXCEPTION_LAST)
 		sp = __this_cpu_ist_top_va(VC2);