[v4,15/29] x86/mm/64: Enable vmapped stacks

Message ID f0a81fa3f7f81422cca23c9d6c88a555d89787ab.1466974736.git.luto@kernel.org (mailing list archive)
State New, archived

Commit Message

Andy Lutomirski June 26, 2016, 9:55 p.m. UTC
This allows x86_64 kernels to enable vmapped stacks.  There are a
couple of interesting bits.

First, x86 lazily faults in top-level paging entries for the vmalloc
area.  This won't work if we get a page fault while trying to access
the stack: the CPU will promote it to a double-fault and we'll die.
To avoid this problem, probe the new stack when switching stacks and
forcibly populate the pgd entry for the stack when switching mms.

Second, once we have guard pages around the stack, we'll want to
detect and handle stack overflow.

I didn't enable it on x86_32.  We'd need to rework the double-fault
code a bit and I'm concerned about running out of vmalloc virtual
addresses under some workloads.

This patch, by itself, will behave somewhat erratically when the
stack overflows while RSP is still more than a few tens of bytes
above the bottom of the stack.  Specifically, we'll get #PF and make
it to no_context and an oops without triggering a double-fault, and
no_context doesn't know about stack overflows.  The next patch will
improve that case.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/Kconfig                 |  1 +
 arch/x86/include/asm/switch_to.h | 28 +++++++++++++++++++++++++++-
 arch/x86/kernel/traps.c          | 32 ++++++++++++++++++++++++++++++++
 arch/x86/mm/tlb.c                | 15 +++++++++++++++
 4 files changed, 75 insertions(+), 1 deletion(-)
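
For context, the stacks themselves are allocated out of the vmalloc
area by an earlier patch in this series.  Below is a minimal sketch of
that kind of allocation (the helper name is illustrative, not code
from this patch); the property the series relies on is that vmap
allocations are separated by unmapped guard pages, so running off the
end of the stack faults instead of silently corrupting whatever is
adjacent:

    #include <linux/vmalloc.h>
    #include <linux/thread_info.h>

    /*
     * Sketch: allocate a THREAD_SIZE-sized, THREAD_SIZE-aligned stack
     * in vmap space.  vmalloc-area allocations are separated by
     * unmapped guard pages, so an overflow off the bottom of the
     * stack hits unmapped memory and faults.
     */
    static void *alloc_vmapped_stack(int node)
    {
            return __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        THREADINFO_GFP, PAGE_KERNEL,
                                        0, node,
                                        __builtin_return_address(0));
    }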

Comments

Brian Gerst June 27, 2016, 3:01 p.m. UTC | #1
On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
> This allows x86_64 kernels to enable vmapped stacks.  There are a
> couple of interesting bits.
>
> First, x86 lazily faults in top-level paging entries for the vmalloc
> area.  This won't work if we get a page fault while trying to access
> the stack: the CPU will promote it to a double-fault and we'll die.
> To avoid this problem, probe the new stack when switching stacks and
> forcibly populate the pgd entry for the stack when switching mms.
>
> Second, once we have guard pages around the stack, we'll want to
> detect and handle stack overflow.
>
> I didn't enable it on x86_32.  We'd need to rework the double-fault
> code a bit and I'm concerned about running out of vmalloc virtual
> addresses under some workloads.
>
> This patch, by itself, will behave somewhat erratically when the
> stack overflows while RSP is still more than a few tens of bytes
> above the bottom of the stack.  Specifically, we'll get #PF and make
> it to no_context and an oops without triggering a double-fault, and
> no_context doesn't know about stack overflows.  The next patch will
> improve that case.
>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  arch/x86/Kconfig                 |  1 +
>  arch/x86/include/asm/switch_to.h | 28 +++++++++++++++++++++++++++-
>  arch/x86/kernel/traps.c          | 32 ++++++++++++++++++++++++++++++++
>  arch/x86/mm/tlb.c                | 15 +++++++++++++++
>  4 files changed, 75 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index d9a94da0c29f..afdcf96ef109 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -92,6 +92,7 @@ config X86
>         select HAVE_ARCH_TRACEHOOK
>         select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>         select HAVE_EBPF_JIT                    if X86_64
> +       select HAVE_ARCH_VMAP_STACK             if X86_64
>         select HAVE_CC_STACKPROTECTOR
>         select HAVE_CMPXCHG_DOUBLE
>         select HAVE_CMPXCHG_LOCAL
> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index 8f321a1b03a1..14e4b20f0aaf 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -8,6 +8,28 @@ struct tss_struct;
>  void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
>                       struct tss_struct *tss);
>
> +/* This runs on the previous thread's stack. */
> +static inline void prepare_switch_to(struct task_struct *prev,
> +                                    struct task_struct *next)
> +{
> +#ifdef CONFIG_VMAP_STACK
> +       /*
> +        * If we switch to a stack that has a top-level paging entry
> +        * that is not present in the current mm, the resulting #PF
> +        * will be promoted to a double-fault and we'll panic.  Probe
> +        * the new stack now so that vmalloc_fault can fix up the page
> +        * tables if needed.  This can only happen if we use a stack
> +        * in vmap space.
> +        *
> +        * We assume that the stack is aligned so that it never spans
> +        * more than one top-level paging entry.
> +        *
> +        * To minimize cache pollution, just follow the stack pointer.
> +        */
> +       READ_ONCE(*(unsigned char *)next->thread.sp);
> +#endif
> +}
> +
>  #ifdef CONFIG_X86_32
>
>  #ifdef CONFIG_CC_STACKPROTECTOR
> @@ -39,6 +61,8 @@ do {                                                                  \
>          */                                                             \
>         unsigned long ebx, ecx, edx, esi, edi;                          \
>                                                                         \
> +       prepare_switch_to(prev, next);                                  \
> +                                                                       \
>         asm volatile("pushl %%ebp\n\t"          /* save    EBP   */     \
>                      "movl %%esp,%[prev_sp]\n\t"        /* save    ESP   */ \
>                      "movl %[next_sp],%%esp\n\t"        /* restore ESP   */ \
> @@ -103,7 +127,9 @@ do {                                                                        \
>   * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
>   * has no effect.
>   */
> -#define switch_to(prev, next, last) \
> +#define switch_to(prev, next, last)                                      \
> +       prepare_switch_to(prev, next);                                    \
> +                                                                         \
>         asm volatile(SAVE_CONTEXT                                         \
>              "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
>              "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 00f03d82e69a..9cb7ea781176 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present", segment_not_present)
>  DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",            stack_segment)
>  DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",          alignment_check)
>
> +#ifdef CONFIG_VMAP_STACK
> +static void __noreturn handle_stack_overflow(const char *message,
> +                                            struct pt_regs *regs,
> +                                            unsigned long fault_address)
> +{
> +       printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
> +                (void *)fault_address, current->stack,
> +                (char *)current->stack + THREAD_SIZE - 1);
> +       die(message, regs, 0);
> +
> +       /* Be absolutely certain we don't return. */
> +       panic(message);
> +}
> +#endif
> +
>  #ifdef CONFIG_X86_64
>  /* Runs on IST stack */
>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>  {
>         static const char str[] = "double fault";
>         struct task_struct *tsk = current;
> +#ifdef CONFIG_VMAP_STACK
> +       unsigned long cr2;
> +#endif
>
>  #ifdef CONFIG_X86_ESPFIX64
>         extern unsigned char native_irq_return_iret[];
> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>         tsk->thread.error_code = error_code;
>         tsk->thread.trap_nr = X86_TRAP_DF;
>
> +#ifdef CONFIG_VMAP_STACK
> +       /*
> +        * If we overflow the stack into a guard page, the CPU will fail
> +        * to deliver #PF and will send #DF instead.  CR2 will contain
> +        * the linear address of the second fault, which will be in the
> +        * guard page below the bottom of the stack.
> +        */
> +       cr2 = read_cr2();
> +       if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
> +               handle_stack_overflow(
> +                       "kernel stack overflow (double-fault)",
> +                       regs, cr2);
> +#endif

Is there any other way to tell if this was from a page fault?  If it
wasn't a page fault then CR2 is undefined.

--
Brian Gerst
Brian Gerst June 27, 2016, 3:12 p.m. UTC | #2
On Mon, Jun 27, 2016 at 11:01 AM, Brian Gerst <brgerst@gmail.com> wrote:
> On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
>> This allows x86_64 kernels to enable vmapped stacks.  There are a
>> couple of interesting bits.
>>
>> First, x86 lazily faults in top-level paging entries for the vmalloc
>> area.  This won't work if we get a page fault while trying to access
>> the stack: the CPU will promote it to a double-fault and we'll die.
>> To avoid this problem, probe the new stack when switching stacks and
>> forcibly populate the pgd entry for the stack when switching mms.
>>
>> Second, once we have guard pages around the stack, we'll want to
>> detect and handle stack overflow.
>>
>> I didn't enable it on x86_32.  We'd need to rework the double-fault
>> code a bit and I'm concerned about running out of vmalloc virtual
>> addresses under some workloads.
>>
>> This patch, by itself, will behave somewhat erratically when the
>> stack overflows while RSP is still more than a few tens of bytes
>> above the bottom of the stack.  Specifically, we'll get #PF and make
>> it to no_context and an oops without triggering a double-fault, and
>> no_context doesn't know about stack overflows.  The next patch will
>> improve that case.
>>
>> Signed-off-by: Andy Lutomirski <luto@kernel.org>
>> ---
>>  arch/x86/Kconfig                 |  1 +
>>  arch/x86/include/asm/switch_to.h | 28 +++++++++++++++++++++++++++-
>>  arch/x86/kernel/traps.c          | 32 ++++++++++++++++++++++++++++++++
>>  arch/x86/mm/tlb.c                | 15 +++++++++++++++
>>  4 files changed, 75 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index d9a94da0c29f..afdcf96ef109 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -92,6 +92,7 @@ config X86
>>         select HAVE_ARCH_TRACEHOOK
>>         select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>>         select HAVE_EBPF_JIT                    if X86_64
>> +       select HAVE_ARCH_VMAP_STACK             if X86_64
>>         select HAVE_CC_STACKPROTECTOR
>>         select HAVE_CMPXCHG_DOUBLE
>>         select HAVE_CMPXCHG_LOCAL
>> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
>> index 8f321a1b03a1..14e4b20f0aaf 100644
>> --- a/arch/x86/include/asm/switch_to.h
>> +++ b/arch/x86/include/asm/switch_to.h
>> @@ -8,6 +8,28 @@ struct tss_struct;
>>  void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
>>                       struct tss_struct *tss);
>>
>> +/* This runs on the previous thread's stack. */
>> +static inline void prepare_switch_to(struct task_struct *prev,
>> +                                    struct task_struct *next)
>> +{
>> +#ifdef CONFIG_VMAP_STACK
>> +       /*
>> +        * If we switch to a stack that has a top-level paging entry
>> +        * that is not present in the current mm, the resulting #PF
>> +        * will be promoted to a double-fault and we'll panic.  Probe
>> +        * the new stack now so that vmalloc_fault can fix up the page
>> +        * tables if needed.  This can only happen if we use a stack
>> +        * in vmap space.
>> +        *
>> +        * We assume that the stack is aligned so that it never spans
>> +        * more than one top-level paging entry.
>> +        *
>> +        * To minimize cache pollution, just follow the stack pointer.
>> +        */
>> +       READ_ONCE(*(unsigned char *)next->thread.sp);
>> +#endif
>> +}
>> +
>>  #ifdef CONFIG_X86_32
>>
>>  #ifdef CONFIG_CC_STACKPROTECTOR
>> @@ -39,6 +61,8 @@ do {                                                                  \
>>          */                                                             \
>>         unsigned long ebx, ecx, edx, esi, edi;                          \
>>                                                                         \
>> +       prepare_switch_to(prev, next);                                  \
>> +                                                                       \
>>         asm volatile("pushl %%ebp\n\t"          /* save    EBP   */     \
>>                      "movl %%esp,%[prev_sp]\n\t"        /* save    ESP   */ \
>>                      "movl %[next_sp],%%esp\n\t"        /* restore ESP   */ \
>> @@ -103,7 +127,9 @@ do {                                                                        \
>>   * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
>>   * has no effect.
>>   */
>> -#define switch_to(prev, next, last) \
>> +#define switch_to(prev, next, last)                                      \
>> +       prepare_switch_to(prev, next);                                    \
>> +                                                                         \
>>         asm volatile(SAVE_CONTEXT                                         \
>>              "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
>>              "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
>> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
>> index 00f03d82e69a..9cb7ea781176 100644
>> --- a/arch/x86/kernel/traps.c
>> +++ b/arch/x86/kernel/traps.c
>> @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present", segment_not_present)
>>  DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",            stack_segment)
>>  DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",          alignment_check)
>>
>> +#ifdef CONFIG_VMAP_STACK
>> +static void __noreturn handle_stack_overflow(const char *message,
>> +                                            struct pt_regs *regs,
>> +                                            unsigned long fault_address)
>> +{
>> +       printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
>> +                (void *)fault_address, current->stack,
>> +                (char *)current->stack + THREAD_SIZE - 1);
>> +       die(message, regs, 0);
>> +
>> +       /* Be absolutely certain we don't return. */
>> +       panic(message);
>> +}
>> +#endif
>> +
>>  #ifdef CONFIG_X86_64
>>  /* Runs on IST stack */
>>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>  {
>>         static const char str[] = "double fault";
>>         struct task_struct *tsk = current;
>> +#ifdef CONFIG_VMAP_STACK
>> +       unsigned long cr2;
>> +#endif
>>
>>  #ifdef CONFIG_X86_ESPFIX64
>>         extern unsigned char native_irq_return_iret[];
>> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>         tsk->thread.error_code = error_code;
>>         tsk->thread.trap_nr = X86_TRAP_DF;
>>
>> +#ifdef CONFIG_VMAP_STACK
>> +       /*
>> +        * If we overflow the stack into a guard page, the CPU will fail
>> +        * to deliver #PF and will send #DF instead.  CR2 will contain
>> +        * the linear address of the second fault, which will be in the
>> +        * guard page below the bottom of the stack.
>> +        */
>> +       cr2 = read_cr2();
>> +       if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
>> +               handle_stack_overflow(
>> +                       "kernel stack overflow (double-fault)",
>> +                       regs, cr2);
>> +#endif
>
> Is there any other way to tell if this was from a page fault?  If it
> wasn't a page fault then CR2 is undefined.

I guess it doesn't really matter, since the fault is fatal either way.
The error message might be incorrect though.

--
Brian Gerst
Andy Lutomirski June 27, 2016, 3:22 p.m. UTC | #3
On Mon, Jun 27, 2016 at 8:12 AM, Brian Gerst <brgerst@gmail.com> wrote:
> On Mon, Jun 27, 2016 at 11:01 AM, Brian Gerst <brgerst@gmail.com> wrote:
>> On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> This allows x86_64 kernels to enable vmapped stacks.  There are a
>>> couple of interesting bits.
>>>
>>> First, x86 lazily faults in top-level paging entries for the vmalloc
>>> area.  This won't work if we get a page fault while trying to access
>>> the stack: the CPU will promote it to a double-fault and we'll die.
>>> To avoid this problem, probe the new stack when switching stacks and
>>> forcibly populate the pgd entry for the stack when switching mms.
>>>
>>> Second, once we have guard pages around the stack, we'll want to
>>> detect and handle stack overflow.
>>>
>>> I didn't enable it on x86_32.  We'd need to rework the double-fault
>>> code a bit and I'm concerned about running out of vmalloc virtual
>>> addresses under some workloads.
>>>
>>> This patch, by itself, will behave somewhat erratically when the
>>> stack overflows while RSP is still more than a few tens of bytes
>>> above the bottom of the stack.  Specifically, we'll get #PF and make
>>> it to no_context and an oops without triggering a double-fault, and
>>> no_context doesn't know about stack overflows.  The next patch will
>>> improve that case.
>>>
>>> Signed-off-by: Andy Lutomirski <luto@kernel.org>
>>> ---
>>>  arch/x86/Kconfig                 |  1 +
>>>  arch/x86/include/asm/switch_to.h | 28 +++++++++++++++++++++++++++-
>>>  arch/x86/kernel/traps.c          | 32 ++++++++++++++++++++++++++++++++
>>>  arch/x86/mm/tlb.c                | 15 +++++++++++++++
>>>  4 files changed, 75 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>>> index d9a94da0c29f..afdcf96ef109 100644
>>> --- a/arch/x86/Kconfig
>>> +++ b/arch/x86/Kconfig
>>> @@ -92,6 +92,7 @@ config X86
>>>         select HAVE_ARCH_TRACEHOOK
>>>         select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>>>         select HAVE_EBPF_JIT                    if X86_64
>>> +       select HAVE_ARCH_VMAP_STACK             if X86_64
>>>         select HAVE_CC_STACKPROTECTOR
>>>         select HAVE_CMPXCHG_DOUBLE
>>>         select HAVE_CMPXCHG_LOCAL
>>> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
>>> index 8f321a1b03a1..14e4b20f0aaf 100644
>>> --- a/arch/x86/include/asm/switch_to.h
>>> +++ b/arch/x86/include/asm/switch_to.h
>>> @@ -8,6 +8,28 @@ struct tss_struct;
>>>  void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
>>>                       struct tss_struct *tss);
>>>
>>> +/* This runs on the previous thread's stack. */
>>> +static inline void prepare_switch_to(struct task_struct *prev,
>>> +                                    struct task_struct *next)
>>> +{
>>> +#ifdef CONFIG_VMAP_STACK
>>> +       /*
>>> +        * If we switch to a stack that has a top-level paging entry
>>> +        * that is not present in the current mm, the resulting #PF
>>> +        * will be promoted to a double-fault and we'll panic.  Probe
>>> +        * the new stack now so that vmalloc_fault can fix up the page
>>> +        * tables if needed.  This can only happen if we use a stack
>>> +        * in vmap space.
>>> +        *
>>> +        * We assume that the stack is aligned so that it never spans
>>> +        * more than one top-level paging entry.
>>> +        *
>>> +        * To minimize cache pollution, just follow the stack pointer.
>>> +        */
>>> +       READ_ONCE(*(unsigned char *)next->thread.sp);
>>> +#endif
>>> +}
>>> +
>>>  #ifdef CONFIG_X86_32
>>>
>>>  #ifdef CONFIG_CC_STACKPROTECTOR
>>> @@ -39,6 +61,8 @@ do {                                                                  \
>>>          */                                                             \
>>>         unsigned long ebx, ecx, edx, esi, edi;                          \
>>>                                                                         \
>>> +       prepare_switch_to(prev, next);                                  \
>>> +                                                                       \
>>>         asm volatile("pushl %%ebp\n\t"          /* save    EBP   */     \
>>>                      "movl %%esp,%[prev_sp]\n\t"        /* save    ESP   */ \
>>>                      "movl %[next_sp],%%esp\n\t"        /* restore ESP   */ \
>>> @@ -103,7 +127,9 @@ do {                                                                        \
>>>   * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
>>>   * has no effect.
>>>   */
>>> -#define switch_to(prev, next, last) \
>>> +#define switch_to(prev, next, last)                                      \
>>> +       prepare_switch_to(prev, next);                                    \
>>> +                                                                         \
>>>         asm volatile(SAVE_CONTEXT                                         \
>>>              "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
>>>              "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
>>> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
>>> index 00f03d82e69a..9cb7ea781176 100644
>>> --- a/arch/x86/kernel/traps.c
>>> +++ b/arch/x86/kernel/traps.c
>>> @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present", segment_not_present)
>>>  DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",            stack_segment)
>>>  DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",          alignment_check)
>>>
>>> +#ifdef CONFIG_VMAP_STACK
>>> +static void __noreturn handle_stack_overflow(const char *message,
>>> +                                            struct pt_regs *regs,
>>> +                                            unsigned long fault_address)
>>> +{
>>> +       printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
>>> +                (void *)fault_address, current->stack,
>>> +                (char *)current->stack + THREAD_SIZE - 1);
>>> +       die(message, regs, 0);
>>> +
>>> +       /* Be absolutely certain we don't return. */
>>> +       panic(message);
>>> +}
>>> +#endif
>>> +
>>>  #ifdef CONFIG_X86_64
>>>  /* Runs on IST stack */
>>>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>  {
>>>         static const char str[] = "double fault";
>>>         struct task_struct *tsk = current;
>>> +#ifdef CONFIG_VMAP_STACK
>>> +       unsigned long cr2;
>>> +#endif
>>>
>>>  #ifdef CONFIG_X86_ESPFIX64
>>>         extern unsigned char native_irq_return_iret[];
>>> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>         tsk->thread.error_code = error_code;
>>>         tsk->thread.trap_nr = X86_TRAP_DF;
>>>
>>> +#ifdef CONFIG_VMAP_STACK
>>> +       /*
>>> +        * If we overflow the stack into a guard page, the CPU will fail
>>> +        * to deliver #PF and will send #DF instead.  CR2 will contain
>>> +        * the linear address of the second fault, which will be in the
>>> +        * guard page below the bottom of the stack.
>>> +        */
>>> +       cr2 = read_cr2();
>>> +       if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
>>> +               handle_stack_overflow(
>>> +                       "kernel stack overflow (double-fault)",
>>> +                       regs, cr2);
>>> +#endif
>>
>> Is there any other way to tell if this was from a page fault?  If it
>> wasn't a page fault then CR2 is undefined.
>
> I guess it doesn't really matter, since the fault is fatal either way.
> The error message might be incorrect though.
>

It's at least worth a comment, though.  Maybe I should check if
regs->rsp is within 40 bytes of the bottom of the stack, too, such
that delivery of an inner fault would have double-faulted assuming the
inner fault didn't use an IST vector.

--Andy
Andy Lutomirski June 27, 2016, 3:54 p.m. UTC | #4
On Mon, Jun 27, 2016 at 8:22 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Mon, Jun 27, 2016 at 8:12 AM, Brian Gerst <brgerst@gmail.com> wrote:
>> On Mon, Jun 27, 2016 at 11:01 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>> On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>  #ifdef CONFIG_X86_64
>>>>  /* Runs on IST stack */
>>>>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>  {
>>>>         static const char str[] = "double fault";
>>>>         struct task_struct *tsk = current;
>>>> +#ifdef CONFIG_VMAP_STACK
>>>> +       unsigned long cr2;
>>>> +#endif
>>>>
>>>>  #ifdef CONFIG_X86_ESPFIX64
>>>>         extern unsigned char native_irq_return_iret[];
>>>> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>         tsk->thread.error_code = error_code;
>>>>         tsk->thread.trap_nr = X86_TRAP_DF;
>>>>
>>>> +#ifdef CONFIG_VMAP_STACK
>>>> +       /*
>>>> +        * If we overflow the stack into a guard page, the CPU will fail
>>>> +        * to deliver #PF and will send #DF instead.  CR2 will contain
>>>> +        * the linear address of the second fault, which will be in the
>>>> +        * guard page below the bottom of the stack.
>>>> +        */
>>>> +       cr2 = read_cr2();
>>>> +       if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
>>>> +               handle_stack_overflow(
>>>> +                       "kernel stack overflow (double-fault)",
>>>> +                       regs, cr2);
>>>> +#endif
>>>
>>> Is there any other way to tell if this was from a page fault?  If it
>>> wasn't a page fault then CR2 is undefined.
>>
>> I guess it doesn't really matter, since the fault is fatal either way.
>> The error message might be incorrect though.
>>
>
> It's at least worth a comment, though.  Maybe I should check if
> regs->rsp is within 40 bytes of the bottom of the stack, too, such
> that delivery of an inner fault would have double-faulted assuming the
> inner fault didn't use an IST vector.
>

How about:

    /*
     * If we overflow the stack into a guard page, the CPU will fail
     * to deliver #PF and will send #DF instead.  CR2 will contain
     * the linear address of the second fault, which will be in the
     * guard page below the bottom of the stack.
     *
     * We're limited to using heuristics here, since the CPU does
     * not tell us what type of fault failed and, if the first fault
     * wasn't a page fault, CR2 may contain stale garbage.  To mostly
     * rule out garbage, we check if the saved RSP is close enough to
     * the bottom of the stack to cause exception delivery to fail.
     * The worst case is 7 stack slots: one for alignment, five for
     * SS..RIP, and one for the error code.
     */
    tsk_stack = (unsigned long)task_stack_page(tsk);
    if (regs->rsp <= tsk_stack + 7*8 && regs->rsp > tsk_stack - PAGE_SIZE) {
        /* A double-fault due to #PF delivery failure is plausible. */
        cr2 = read_cr2();
        if (tsk_stack - 1 - cr2 < PAGE_SIZE)
            handle_stack_overflow(
                "kernel stack overflow (double-fault)",
                regs, cr2);
    }

--Andy
Brian Gerst June 27, 2016, 4:17 p.m. UTC | #5
On Mon, Jun 27, 2016 at 11:54 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Mon, Jun 27, 2016 at 8:22 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Mon, Jun 27, 2016 at 8:12 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>> On Mon, Jun 27, 2016 at 11:01 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>>> On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>  #ifdef CONFIG_X86_64
>>>>>  /* Runs on IST stack */
>>>>>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>>  {
>>>>>         static const char str[] = "double fault";
>>>>>         struct task_struct *tsk = current;
>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>> +       unsigned long cr2;
>>>>> +#endif
>>>>>
>>>>>  #ifdef CONFIG_X86_ESPFIX64
>>>>>         extern unsigned char native_irq_return_iret[];
>>>>> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>>         tsk->thread.error_code = error_code;
>>>>>         tsk->thread.trap_nr = X86_TRAP_DF;
>>>>>
>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>> +       /*
>>>>> +        * If we overflow the stack into a guard page, the CPU will fail
>>>>> +        * to deliver #PF and will send #DF instead.  CR2 will contain
>>>>> +        * the linear address of the second fault, which will be in the
>>>>> +        * guard page below the bottom of the stack.
>>>>> +        */
>>>>> +       cr2 = read_cr2();
>>>>> +       if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
>>>>> +               handle_stack_overflow(
>>>>> +                       "kernel stack overflow (double-fault)",
>>>>> +                       regs, cr2);
>>>>> +#endif
>>>>
>>>> Is there any other way to tell if this was from a page fault?  If it
>>>> wasn't a page fault then CR2 is undefined.
>>>
>>> I guess it doesn't really matter, since the fault is fatal either way.
>>> The error message might be incorrect though.
>>>
>>
>> It's at least worth a comment, though.  Maybe I should check if
>> regs->rsp is within 40 bytes of the bottom of the stack, too, such
>> that delivery of an inner fault would have double-faulted assuming the
>> inner fault didn't use an IST vector.
>>
>
> How about:
>
>     /*
>      * If we overflow the stack into a guard page, the CPU will fail
>      * to deliver #PF and will send #DF instead.  CR2 will contain
>      * the linear address of the second fault, which will be in the
>      * guard page below the bottom of the stack.
>      *
>      * We're limited to using heuristics here, since the CPU does
>      * not tell us what type of fault failed and, if the first fault
>      * wasn't a page fault, CR2 may contain stale garbage.  To mostly
>      * rule out garbage, we check if the saved RSP is close enough to
>      * the bottom of the stack to cause exception delivery to fail.
>      * The worst case is 7 stack slots: one for alignment, five for
>      * SS..RIP, and one for the error code.
>      */
>     tsk_stack = (unsigned long)task_stack_page(tsk);
>     if (regs->rsp <= tsk_stack + 7*8 && regs->rsp > tsk_stack - PAGE_SIZE) {
>         /* A double-fault due to #PF delivery failure is plausible. */
>         cr2 = read_cr2();
>         if (tsk_stack - 1 - cr2 < PAGE_SIZE)
>             handle_stack_overflow(
>                 "kernel stack overflow (double-fault)",
>                 regs, cr2);
>     }

I think RSP anywhere in the guard page would be best, since it could
have been decremented by a function prologue into the guard page
before an access that triggers the page fault.

--
Brian Gerst
Andy Lutomirski June 27, 2016, 4:35 p.m. UTC | #6
On Mon, Jun 27, 2016 at 9:17 AM, Brian Gerst <brgerst@gmail.com> wrote:
> On Mon, Jun 27, 2016 at 11:54 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Mon, Jun 27, 2016 at 8:22 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Mon, Jun 27, 2016 at 8:12 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>>> On Mon, Jun 27, 2016 at 11:01 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>>>> On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>  #ifdef CONFIG_X86_64
>>>>>>  /* Runs on IST stack */
>>>>>>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>>>  {
>>>>>>         static const char str[] = "double fault";
>>>>>>         struct task_struct *tsk = current;
>>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>>> +       unsigned long cr2;
>>>>>> +#endif
>>>>>>
>>>>>>  #ifdef CONFIG_X86_ESPFIX64
>>>>>>         extern unsigned char native_irq_return_iret[];
>>>>>> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>>>         tsk->thread.error_code = error_code;
>>>>>>         tsk->thread.trap_nr = X86_TRAP_DF;
>>>>>>
>>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>>> +       /*
>>>>>> +        * If we overflow the stack into a guard page, the CPU will fail
>>>>>> +        * to deliver #PF and will send #DF instead.  CR2 will contain
>>>>>> +        * the linear address of the second fault, which will be in the
>>>>>> +        * guard page below the bottom of the stack.
>>>>>> +        */
>>>>>> +       cr2 = read_cr2();
>>>>>> +       if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
>>>>>> +               handle_stack_overflow(
>>>>>> +                       "kernel stack overflow (double-fault)",
>>>>>> +                       regs, cr2);
>>>>>> +#endif
>>>>>
>>>>> Is there any other way to tell if this was from a page fault?  If it
>>>>> wasn't a page fault then CR2 is undefined.
>>>>
>>>> I guess it doesn't really matter, since the fault is fatal either way.
>>>> The error message might be incorrect though.
>>>>
>>>
>>> It's at least worth a comment, though.  Maybe I should check if
>>> regs->rsp is within 40 bytes of the bottom of the stack, too, such
>>> that delivery of an inner fault would have double-faulted assuming the
>>> inner fault didn't use an IST vector.
>>>
>>
>> How about:
>>
>>     /*
>>      * If we overflow the stack into a guard page, the CPU will fail
>>      * to deliver #PF and will send #DF instead.  CR2 will contain
>>      * the linear address of the second fault, which will be in the
>>      * guard page below the bottom of the stack.
>>      *
>>      * We're limited to using heuristics here, since the CPU does
>>      * not tell us what type of fault failed and, if the first fault
>>      * wasn't a page fault, CR2 may contain stale garbage.  To mostly
>>      * rule out garbage, we check if the saved RSP is close enough to
>>      * the bottom of the stack to cause exception delivery to fail.
>>      * The worst case is 7 stack slots: one for alignment, five for
>>      * SS..RIP, and one for the error code.
>>      */
>>     tsk_stack = (unsigned long)task_stack_page(tsk);
>>     if (regs->rsp <= tsk_stack + 7*8 && regs->rsp > tsk_stack - PAGE_SIZE) {
>>         /* A double-fault due to #PF delivery failure is plausible. */
>>         cr2 = read_cr2();
>>         if (tsk_stack - 1 - cr2 < PAGE_SIZE)
>>             handle_stack_overflow(
>>                 "kernel stack overflow (double-fault)",
>>                 regs, cr2);
>>     }
>
> I think RSP anywhere in the guard page would be best, since it could
> have been decremented by a function prologue into the guard page
> before an access that triggers the page fault.
>

I think that can miss some stack overflows.  Suppose that RSP points
very close to the bottom of the stack and we take an unrelated fault.
The CPU can fail to deliver that fault and we get a double fault
instead.  But I counted wrong, too.  Do you like this version and its
explanation?

    /*
     * If we overflow the stack into a guard page, the CPU will fail
     * to deliver #PF and will send #DF instead.  Similarly, if we
     * take any non-IST exception while too close to the bottom of
     * the stack, the processor will get a page fault while
     * delivering the exception and will generate a double fault.
     *
     * According to the SDM (footnote in 6.15 under "Interrupt 14 -
     * Page-Fault Exception (#PF)"):
     *
     *   Processors update CR2 whenever a page fault is detected. If a
     *   second page fault occurs while an earlier page fault is being
     *   delivered, the faulting linear address of the second fault will
     *   overwrite the contents of CR2 (replacing the previous
     *   address). These updates to CR2 occur even if the page fault
     *   results in a double fault or occurs during the delivery of a
     *   double fault.
     *
     * However, if we got here due to a non-page-fault exception while
     * delivering a non-page-fault exception, CR2 may contain a
     * stale value.
     *
     * As a heuristic: we consider this double fault to be a stack
     * overflow if CR2 points to the guard page and RSP is either
     * in the guard page or close enough to the bottom of the stack
     * that exception delivery could have failed.
     *
     * We're limited to using heuristics here, since the CPU does
     * not tell us what type of fault failed and, if the first fault
     * wasn't a page fault, CR2 may contain stale garbage.  To
     * mostly rule out garbage, we check if the saved RSP is close
     * enough to the bottom of the stack to cause exception delivery
     * to fail.  If RSP == tsk_stack + 48 and we take an exception,
     * the stack is already aligned and there will be enough room
     * for SS, RSP, RFLAGS, CS, RIP, and a possible error code.  With
     * any less space left, exception delivery could fail.
     */
    tsk_stack = (unsigned long)task_stack_page(tsk);
    if (regs->rsp < tsk_stack + 48 && regs->rsp > tsk_stack - PAGE_SIZE) {
        /* A double-fault due to #PF delivery failure is plausible. */
        cr2 = read_cr2();
        if (tsk_stack - 1 - cr2 < PAGE_SIZE)
            handle_stack_overflow(
                "kernel stack overflow (double-fault)",
                regs, cr2);
    }
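
As a sanity check on the 48-byte figure above (assuming the stack
bottom is 16-byte aligned), the worst-case frame the CPU pushes works
out as follows:

    /*
     * Hardware exception frame on x86_64, pushed downward from the
     * pre-exception RSP, 8 bytes per slot:
     *
     *     SS
     *     RSP
     *     RFLAGS
     *     CS
     *     RIP
     *     error code (if the vector has one)
     *
     * Six slots = 48 bytes.  With RSP == tsk_stack + 48 the frame
     * exactly fits (no alignment padding is needed when RSP is
     * already 16-byte aligned); with anything less, the CPU's pushes
     * reach into the guard page and delivery fails.
     */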
Brian Gerst June 27, 2016, 5:09 p.m. UTC | #7
On Mon, Jun 27, 2016 at 12:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Mon, Jun 27, 2016 at 9:17 AM, Brian Gerst <brgerst@gmail.com> wrote:
>> On Mon, Jun 27, 2016 at 11:54 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Mon, Jun 27, 2016 at 8:22 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>> On Mon, Jun 27, 2016 at 8:12 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>>>> On Mon, Jun 27, 2016 at 11:01 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>>>>> On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>>  #ifdef CONFIG_X86_64
>>>>>>>  /* Runs on IST stack */
>>>>>>>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>>>>  {
>>>>>>>         static const char str[] = "double fault";
>>>>>>>         struct task_struct *tsk = current;
>>>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>>>> +       unsigned long cr2;
>>>>>>> +#endif
>>>>>>>
>>>>>>>  #ifdef CONFIG_X86_ESPFIX64
>>>>>>>         extern unsigned char native_irq_return_iret[];
>>>>>>> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>>>>         tsk->thread.error_code = error_code;
>>>>>>>         tsk->thread.trap_nr = X86_TRAP_DF;
>>>>>>>
>>>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>>>> +       /*
>>>>>>> +        * If we overflow the stack into a guard page, the CPU will fail
>>>>>>> +        * to deliver #PF and will send #DF instead.  CR2 will contain
>>>>>>> +        * the linear address of the second fault, which will be in the
>>>>>>> +        * guard page below the bottom of the stack.
>>>>>>> +        */
>>>>>>> +       cr2 = read_cr2();
>>>>>>> +       if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
>>>>>>> +               handle_stack_overflow(
>>>>>>> +                       "kernel stack overflow (double-fault)",
>>>>>>> +                       regs, cr2);
>>>>>>> +#endif
>>>>>>
>>>>>> Is there any other way to tell if this was from a page fault?  If it
>>>>>> wasn't a page fault then CR2 is undefined.
>>>>>
>>>>> I guess it doesn't really matter, since the fault is fatal either way.
>>>>> The error message might be incorrect though.
>>>>>
>>>>
>>>> It's at least worth a comment, though.  Maybe I should check if
>>>> regs->rsp is within 40 bytes of the bottom of the stack, too, such
>>>> that delivery of an inner fault would have double-faulted assuming the
>>>> inner fault didn't use an IST vector.
>>>>
>>>
>>> How about:
>>>
>>>     /*
>>>      * If we overflow the stack into a guard page, the CPU will fail
>>>      * to deliver #PF and will send #DF instead.  CR2 will contain
>>>      * the linear address of the second fault, which will be in the
>>>      * guard page below the bottom of the stack.
>>>      *
>>>      * We're limited to using heuristics here, since the CPU does
>>>      * not tell us what type of fault failed and, if the first fault
>>>      * wasn't a page fault, CR2 may contain stale garbage.  To mostly
>>>      * rule out garbage, we check if the saved RSP is close enough to
>>>      * the bottom of the stack to cause exception delivery to fail.
>>>      * The worst case is 7 stack slots: one for alignment, five for
>>>      * SS..RIP, and one for the error code.
>>>      */
>>>     tsk_stack = (unsigned long)task_stack_page(tsk);
>>>     if (regs->rsp <= tsk_stack + 7*8 && regs->rsp > tsk_stack - PAGE_SIZE) {
>>>         /* A double-fault due to #PF delivery failure is plausible. */
>>>         cr2 = read_cr2();
>>>         if (tsk_stack - 1 - cr2 < PAGE_SIZE)
>>>             handle_stack_overflow(
>>>                 "kernel stack overflow (double-fault)",
>>>                 regs, cr2);
>>>     }
>>
>> I think RSP anywhere in the guard page would be best, since it could
>> have been decremented by a function prologue into the guard page
>> before an access that triggers the page fault.
>>
>
> I think that can miss some stack overflows.  Suppose that RSP points
> very close to the bottom of the stack and we take an unrelated fault.
> The CPU can fail to deliver that fault and we get a double fault
> instead.  But I counted wrong, too.  Do you like this version and its
> explanation?
>
>     /*
>      * If we overflow the stack into a guard page, the CPU will fail
>      * to deliver #PF and will send #DF instead.  Similarly, if we
>      * take any non-IST exception while too close to the bottom of
>      * the stack, the processor will get a page fault while
>      * delivering the exception and will generate a double fault.
>      *
>      * According to the SDM (footnote in 6.15 under "Interrupt 14 -
>      * Page-Fault Exception (#PF)"):
>      *
>      *   Processors update CR2 whenever a page fault is detected. If a
>      *   second page fault occurs while an earlier page fault is being
>      *   delivered, the faulting linear address of the second fault will
>      *   overwrite the contents of CR2 (replacing the previous
>      *   address). These updates to CR2 occur even if the page fault
>      *   results in a double fault or occurs during the delivery of a
>      *   double fault.
>      *
>      * However, if we got here due to a non-page-fault exception while
>      * delivering a non-page-fault exception, CR2 may contain a
>      * stale value.
>      *
>      * As a heuristic: we consider this double fault to be a stack
>      * overflow if CR2 points to the guard page and RSP is either
>      * in the guard page or close enough to the bottom of the stack
>      * that exception delivery could have failed.
>      *
>      * We're limited to using heuristics here, since the CPU does
>      * not tell us what type of fault failed and, if the first fault
>      * wasn't a page fault, CR2 may contain stale garbage.  To
>      * mostly rule out garbage, we check if the saved RSP is close
>      * enough to the bottom of the stack to cause exception delivery
>      * to fail.  If RSP == tsk_stack + 48 and we take an exception,
>      * the stack is already aligned and there will be enough room
>      * for SS, RSP, RFLAGS, CS, RIP, and a possible error code.  With
>      * any less space left, exception delivery could fail.
>      */
>     tsk_stack = (unsigned long)task_stack_page(tsk);
>     if (regs->rsp < tsk_stack + 48 && regs->rsp > tsk_stack - PAGE_SIZE) {
>         /* A double-fault due to #PF delivery failure is plausible. */
>         cr2 = read_cr2();
>         if (tsk_stack - 1 - cr2 < PAGE_SIZE)
>             handle_stack_overflow(
>                 "kernel stack overflow (double-fault)",
>                 regs, cr2);
>     }

Actually I think your quote from the SDM contradicts this.  The second
#PF (when trying to invoke the page fault handler) would update CR2
with an address in the guard page.

--
Brian Gerst
Brian Gerst June 27, 2016, 5:23 p.m. UTC | #8
On Mon, Jun 27, 2016 at 1:09 PM, Brian Gerst <brgerst@gmail.com> wrote:
> On Mon, Jun 27, 2016 at 12:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Mon, Jun 27, 2016 at 9:17 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>> On Mon, Jun 27, 2016 at 11:54 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>> On Mon, Jun 27, 2016 at 8:22 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>> On Mon, Jun 27, 2016 at 8:12 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>>>>> On Mon, Jun 27, 2016 at 11:01 AM, Brian Gerst <brgerst@gmail.com> wrote:
>>>>>>> On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>>>  #ifdef CONFIG_X86_64
>>>>>>>>  /* Runs on IST stack */
>>>>>>>>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>>>>>  {
>>>>>>>>         static const char str[] = "double fault";
>>>>>>>>         struct task_struct *tsk = current;
>>>>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>>>>> +       unsigned long cr2;
>>>>>>>> +#endif
>>>>>>>>
>>>>>>>>  #ifdef CONFIG_X86_ESPFIX64
>>>>>>>>         extern unsigned char native_irq_return_iret[];
>>>>>>>> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
>>>>>>>>         tsk->thread.error_code = error_code;
>>>>>>>>         tsk->thread.trap_nr = X86_TRAP_DF;
>>>>>>>>
>>>>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>>>>> +       /*
>>>>>>>> +        * If we overflow the stack into a guard page, the CPU will fail
>>>>>>>> +        * to deliver #PF and will send #DF instead.  CR2 will contain
>>>>>>>> +        * the linear address of the second fault, which will be in the
>>>>>>>> +        * guard page below the bottom of the stack.
>>>>>>>> +        */
>>>>>>>> +       cr2 = read_cr2();
>>>>>>>> +       if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
>>>>>>>> +               handle_stack_overflow(
>>>>>>>> +                       "kernel stack overflow (double-fault)",
>>>>>>>> +                       regs, cr2);
>>>>>>>> +#endif
>>>>>>>
>>>>>>> Is there any other way to tell if this was from a page fault?  If it
>>>>>>> wasn't a page fault then CR2 is undefined.
>>>>>>
>>>>>> I guess it doesn't really matter, since the fault is fatal either way.
>>>>>> The error message might be incorrect though.
>>>>>>
>>>>>
>>>>> It's at least worth a comment, though.  Maybe I should check if
>>>>> regs->rsp is within 40 bytes of the bottom of the stack, too, such
>>>>> that delivery of an inner fault would have double-faulted assuming the
>>>>> inner fault didn't use an IST vector.
>>>>>
>>>>
>>>> How about:
>>>>
>>>>     /*
>>>>      * If we overflow the stack into a guard page, the CPU will fail
>>>>      * to deliver #PF and will send #DF instead.  CR2 will contain
>>>>      * the linear address of the second fault, which will be in the
>>>>      * guard page below the bottom of the stack.
>>>>      *
>>>>      * We're limited to using heuristics here, since the CPU does
>>>>      * not tell us what type of fault failed and, if the first fault
>>>>      * wasn't a page fault, CR2 may contain stale garbage.  To mostly
>>>>      * rule out garbage, we check if the saved RSP is close enough to
>>>>      * the bottom of the stack to cause exception delivery to fail.
>>>>      * The worst case is 7 stack slots: one for alignment, five for
>>>>      * SS..RIP, and one for the error code.
>>>>      */
>>>>     tsk_stack = (unsigned long)task_stack_page(tsk);
>>>>     if (regs->rsp <= tsk_stack + 7*8 && regs->rsp > tsk_stack - PAGE_SIZE) {
>>>>         /* A double-fault due to #PF delivery failure is plausible. */
>>>>         cr2 = read_cr2();
>>>>         if (tsk_stack - 1 - cr2 < PAGE_SIZE)
>>>>             handle_stack_overflow(
>>>>                 "kernel stack overflow (double-fault)",
>>>>                 regs, cr2);
>>>>     }
>>>
>>> I think RSP anywhere in the guard page would be best, since it could
>>> have been decremented by a function prologue into the guard page
>>> before an access that triggers the page fault.
>>>
>>
>> I think that can miss some stack overflows.  Suppose that RSP points
>> very close to the bottom of the stack and we take an unrelated fault.
>> The CPU can fail to deliver that fault and we get a double fault
>> instead.  But I counted wrong, too.  Do you like this version and its
>> explanation?
>>
>>     /*
>>      * If we overflow the stack into a guard page, the CPU will fail
>>      * to deliver #PF and will send #DF instead.  Similarly, if we
>>      * take any non-IST exception while too close to the bottom of
>>      * the stack, the processor will get a page fault while
>>      * delivering the exception and will generate a double fault.
>>      *
>>      * According to the SDM (footnote in 6.15 under "Interrupt 14 -
>>      * Page-Fault Exception (#PF)"):
>>      *
>>      *   Processors update CR2 whenever a page fault is detected. If a
>>      *   second page fault occurs while an earlier page fault is being
>>      *   delivered, the faulting linear address of the second fault will
>>      *   overwrite the contents of CR2 (replacing the previous
>>      *   address). These updates to CR2 occur even if the page fault
>>      *   results in a double fault or occurs during the delivery of a
>>      *   double fault.
>>      *
>>      * However, if we got here due to a non-page-fault exception while
>>      * delivering a non-page-fault exception, CR2 may contain a
>>      * stale value.
>>      *
>>      * As a heuristic: we consider this double fault to be a stack
>>      * overflow if CR2 points to the guard page and RSP is either
>>      * in the guard page or close enough to the bottom of the stack
>>      * that exception delivery could have failed.
>>      *
>>      * We're limited to using heuristics here, since the CPU does
>>      * not tell us what type of fault failed and, if the first fault
>>      * wasn't a page fault, CR2 may contain stale garbage.  To
>>      * mostly rule out garbage, we check if the saved RSP is close
>>      * enough to the bottom of the stack to cause exception delivery
>>      * to fail.  If RSP == tsk_stack + 48 and we take an exception,
>>      * the stack is already aligned and there will be enough room
>>      * for SS, RSP, RFLAGS, CS, RIP, and a possible error code.  With
>>      * any less space left, exception delivery could fail.
>>      */
>>     tsk_stack = (unsigned long)task_stack_page(tsk);
>>     if (regs->rsp < tsk_stack + 48 && regs->rsp > tsk_stack - PAGE_SIZE) {
>>         /* A double-fault due to #PF delivery failure is plausible. */
>>         cr2 = read_cr2();
>>         if (tsk_stack - 1 - cr2 < PAGE_SIZE)
>>             handle_stack_overflow(
>>                 "kernel stack overflow (double-fault)",
>>                 regs, cr2);
>>     }
>
> Actually I think your quote from the SDM contradicts this.  The second
> #PF (when trying to invoke the page fault handler) would update CR2
> with an address in the guard page.

Nevermind, I'm totally misunderstanding this.  I guess the real
question is when RSP is updated: during each push, or once before or
after the whole frame is pushed.  Checking for the bottom of the valid
stack covers the case where RSP isn't updated until after the whole
frame has been pushed successfully.

--
Brian Gerst
Linus Torvalds June 27, 2016, 5:28 p.m. UTC | #9
On Mon, Jun 27, 2016 at 8:54 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>
> How about:
>
>     tsk_stack = (unsigned long)task_stack_page(tsk);
>     if (regs->rsp <= tsk_stack + 7*8 && regs->rsp > tsk_stack - PAGE_SIZE) {

I'm not at all convinced that regs->rsp will be all that reliable
under a double-fault scenario either. I'd be more inclined to trust
cr2 than the register state.

It's true that double faults can happen for *other* reasons entirely,
and as such it's not clear that %cr2 is reliable either, but since
this is all just about a printout, I'd rather go that way anyway.

                 Linus
Andy Lutomirski June 27, 2016, 5:30 p.m. UTC | #10
On Mon, Jun 27, 2016 at 10:28 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Mon, Jun 27, 2016 at 8:54 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>
>> How about:
>>
>>     tsk_stack = (unsigned long)task_stack_page(tsk);
>>     if (regs->rsp <= tsk_stack + 7*8 && regs->rsp > tsk_stack - PAGE_SIZE) {
>
> I'm not at all convinced that regs->rsp will be all that reliable
> under a double-fault scenario either. I'd be more inclined to trust
> cr2 than the register state.
>
> It's true that double faults can happen for *other* reasons entirely,
> and as such it's not clear that %cr2 is reliable either, but since
> this is all just about a printout, I'd rather go that way anyway.

Fair enough.  The chance that we get #GP-in-#GP or similar while CR2
coincidentally points to the guard page is quite low.  I'll add all
the details to the comment but I'll leave the code alone.

FWIW, the manual only says that CS and RIP are untrustworthy, not that
RSP is untrustworthy, but it doesn't specify *what* RSP would contain
anywhere I can find.  I don't think this is important enough to start
harassing the Intel and AMD folks over.

--Andy
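
The thread settles on trusting CR2 and expanding the comment rather
than adding the RSP range check.  A sketch of how the traps.c hunk
might read with that resolution folded in (an assumed shape for the
follow-up revision, not the v4 hunk archived below):

    #ifdef CONFIG_VMAP_STACK
            /*
             * If we overflow the stack into a guard page, the CPU
             * fails to deliver #PF and sends #DF instead.  Per the
             * SDM, CR2 is updated even when the page fault escalates
             * to a double fault, so CR2 points into the guard page.
             * If the first fault wasn't a page fault, CR2 may hold
             * stale garbage, but a stale value that happens to land
             * in the guard page is unlikely -- and the fault is fatal
             * either way, so at worst the message is wrong.
             */
            cr2 = read_cr2();
            if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
                    handle_stack_overflow(
                            "kernel stack overflow (double-fault)",
                            regs, cr2);
    #endif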

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d9a94da0c29f..afdcf96ef109 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -92,6 +92,7 @@  config X86
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_EBPF_JIT			if X86_64
+	select HAVE_ARCH_VMAP_STACK		if X86_64
 	select HAVE_CC_STACKPROTECTOR
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8f321a1b03a1..14e4b20f0aaf 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -8,6 +8,28 @@  struct tss_struct;
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss);
 
+/* This runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *prev,
+				     struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+	/*
+	 * If we switch to a stack that has a top-level paging entry
+	 * that is not present in the current mm, the resulting #PF
+	 * will be promoted to a double-fault and we'll panic.  Probe
+	 * the new stack now so that vmalloc_fault can fix up the page
+	 * tables if needed.  This can only happen if we use a stack
+	 * in vmap space.
+	 *
+	 * We assume that the stack is aligned so that it never spans
+	 * more than one top-level paging entry.
+	 *
+	 * To minimize cache pollution, just follow the stack pointer.
+	 */
+	READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
 #ifdef CONFIG_X86_32
 
 #ifdef CONFIG_CC_STACKPROTECTOR
@@ -39,6 +61,8 @@  do {									\
 	 */								\
 	unsigned long ebx, ecx, edx, esi, edi;				\
 									\
+	prepare_switch_to(prev, next);					\
+									\
 	asm volatile("pushl %%ebp\n\t"		/* save    EBP   */	\
 		     "movl %%esp,%[prev_sp]\n\t"	/* save    ESP   */ \
 		     "movl %[next_sp],%%esp\n\t"	/* restore ESP   */ \
@@ -103,7 +127,9 @@  do {									\
  * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
  * has no effect.
  */
-#define switch_to(prev, next, last) \
+#define switch_to(prev, next, last)					  \
+	prepare_switch_to(prev, next);					  \
+									  \
 	asm volatile(SAVE_CONTEXT					  \
 	     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
 	     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */	  \
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 00f03d82e69a..9cb7ea781176 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,12 +292,30 @@  DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present",	segment_not_present)
 DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",		stack_segment)
 DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",		alignment_check)
 
+#ifdef CONFIG_VMAP_STACK
+static void __noreturn handle_stack_overflow(const char *message,
+					     struct pt_regs *regs,
+					     unsigned long fault_address)
+{
+	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+		 (void *)fault_address, current->stack,
+		 (char *)current->stack + THREAD_SIZE - 1);
+	die(message, regs, 0);
+
+	/* Be absolutely certain we don't return. */
+	panic(message);
+}
+#endif
+
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 {
 	static const char str[] = "double fault";
 	struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+	unsigned long cr2;
+#endif
 
 #ifdef CONFIG_X86_ESPFIX64
 	extern unsigned char native_irq_return_iret[];
@@ -332,6 +350,20 @@  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_nr = X86_TRAP_DF;
 
+#ifdef CONFIG_VMAP_STACK
+	/*
+	 * If we overflow the stack into a guard page, the CPU will fail
+	 * to deliver #PF and will send #DF instead.  CR2 will contain
+	 * the linear address of the second fault, which will be in the
+	 * guard page below the bottom of the stack.
+	 */
+	cr2 = read_cr2();
+	if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE)
+		handle_stack_overflow(
+			"kernel stack overflow (double-fault)",
+			regs, cr2);
+#endif
+
 #ifdef CONFIG_DOUBLEFAULT
 	df_debug(regs, error_code);
 #endif
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5643fd0b1a7d..fbf036ae72ac 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -77,10 +77,25 @@  void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	unsigned cpu = smp_processor_id();
 
 	if (likely(prev != next)) {
+		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+			/*
+			 * If our current stack is in vmalloc space and isn't
+			 * mapped in the new pgd, we'll double-fault.  Forcibly
+			 * map it.
+			 */
+			unsigned int stack_pgd_index =
+				pgd_index(current_stack_pointer());
+			pgd_t *pgd = next->pgd + stack_pgd_index;
+
+			if (unlikely(pgd_none(*pgd)))
+				set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+		}
+
 #ifdef CONFIG_SMP
 		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 		this_cpu_write(cpu_tlbstate.active_mm, next);
 #endif
+
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
 		/*