[2/2] arm64: Clear the stack

Message ID 20180502203326.9491-3-labbott@redhat.com (mailing list archive)
State New, archived

Commit Message

Laura Abbott May 2, 2018, 8:33 p.m. UTC
Implementation of stackleak based heavily on the x86 version

Signed-off-by: Laura Abbott <labbott@redhat.com>
---
Now written in C instead of a bunch of assembly.
---
 arch/arm64/Kconfig                    |  1 +
 arch/arm64/include/asm/processor.h    |  6 ++++
 arch/arm64/kernel/Makefile            |  3 ++
 arch/arm64/kernel/entry.S             |  6 ++++
 arch/arm64/kernel/erase.c             | 55 +++++++++++++++++++++++++++++++++++
 arch/arm64/kernel/process.c           | 16 ++++++++++
 drivers/firmware/efi/libstub/Makefile |  3 +-
 scripts/Makefile.gcc-plugins          |  5 +++-
 8 files changed, 93 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm64/kernel/erase.c

Comments

Kees Cook May 2, 2018, 9:31 p.m. UTC | #1
On Wed, May 2, 2018 at 1:33 PM, Laura Abbott <labbott@redhat.com> wrote:
>
> Implementation of stackleak based heavily on the x86 version

Awesome! Notes below for both you and Alexander, since I think we can
create a common code base instead of having near-duplicates in the
arch/ trees...

>
> Signed-off-by: Laura Abbott <labbott@redhat.com>
> ---
> Now written in C instead of a bunch of assembly.
> ---
>  arch/arm64/Kconfig                    |  1 +
>  arch/arm64/include/asm/processor.h    |  6 ++++
>  arch/arm64/kernel/Makefile            |  3 ++
>  arch/arm64/kernel/entry.S             |  6 ++++
>  arch/arm64/kernel/erase.c             | 55 +++++++++++++++++++++++++++++++++++
>  arch/arm64/kernel/process.c           | 16 ++++++++++
>  drivers/firmware/efi/libstub/Makefile |  3 +-
>  scripts/Makefile.gcc-plugins          |  5 +++-
>  8 files changed, 93 insertions(+), 2 deletions(-)
>  create mode 100644 arch/arm64/kernel/erase.c
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index eb2cf4938f6d..b0221db95dc9 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -92,6 +92,7 @@ config ARM64
>         select HAVE_ARCH_MMAP_RND_BITS
>         select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
>         select HAVE_ARCH_SECCOMP_FILTER
> +       select HAVE_ARCH_STACKLEAK
>         select HAVE_ARCH_THREAD_STRUCT_WHITELIST
>         select HAVE_ARCH_TRACEHOOK
>         select HAVE_ARCH_TRANSPARENT_HUGEPAGE
> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> index 767598932549..d31ab80ff647 100644
> --- a/arch/arm64/include/asm/processor.h
> +++ b/arch/arm64/include/asm/processor.h
> @@ -124,6 +124,12 @@ struct thread_struct {
>         unsigned long           fault_address;  /* fault info */
>         unsigned long           fault_code;     /* ESR_EL1 value */
>         struct debug_info       debug;          /* debugging */
> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> +       unsigned long           lowest_stack;
> +#ifdef CONFIG_STACKLEAK_METRICS
> +       unsigned long           prev_lowest_stack;
> +#endif
> +#endif

I wonder if x86 and arm64 could include a common struct here that was
empty when the plugin is disabled... it would keep the ifdefs in one
place. Maybe include/linux/stackleak.h could be:

---start---
/* Poison value points to the unused hole in the virtual memory map */
#define STACKLEAK_POISON -0xBEEF
#define STACKLEAK_POISON_CHECK_DEPTH 128

struct stackleak {
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
       unsigned long           lowest;
#ifdef CONFIG_STACKLEAK_METRICS
       unsigned long           prev_lowest;
#endif
#endif
};

asmlinkage void erase_kstack(void);
---eof---

and arch/*/include/asm/processor.h could do:

@@ -124,6 +124,12 @@ struct thread_struct {
        unsigned long           fault_address;  /* fault info */
        unsigned long           fault_code;     /* ESR_EL1 value */
        struct debug_info       debug;          /* debugging */
+       struct stackleak         stackleak;

and arch/x86/entry/erase.c could move to maybe kernel/stackleak.c?
(Oh, I notice this needs an SPDX line too.)
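For the new erase.c that would just be the usual first line (assuming GPL-2.0
is the license that applies here):

// SPDX-License-Identifier: GPL-2.0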

>  static inline void arch_thread_struct_whitelist(unsigned long *offset,
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index bf825f38d206..0ceea613c65b 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>  arm64-obj-$(CONFIG_CRASH_DUMP)         += crash_dump.o
>  arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)  += sdei.o
>
> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
> +KASAN_SANITIZE_erase.o := n
> +
>  obj-y                                  += $(arm64-obj-y) vdso/ probes/
>  obj-m                                  += $(arm64-obj-m)
>  head-y                                 := head.o
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index ec2ee720e33e..3144f1ebdc18 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -401,6 +401,11 @@ tsk        .req    x28             // current thread_info
>
>         .text
>
> +       .macro  ERASE_KSTACK
> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> +       bl      erase_kstack
> +#endif
> +       .endm
>  /*
>   * Exception vectors.
>   */
> @@ -906,6 +911,7 @@ ret_to_user:
>         cbnz    x2, work_pending
>  finish_ret_to_user:
>         enable_step_tsk x1, x2
> +       ERASE_KSTACK
>         kernel_exit 0
>  ENDPROC(ret_to_user)

Nice. All of the return paths end up here (I went looking for
ret_from_fork's path). :)

>
> diff --git a/arch/arm64/kernel/erase.c b/arch/arm64/kernel/erase.c
> new file mode 100644
> index 000000000000..b8b5648d893b
> --- /dev/null
> +++ b/arch/arm64/kernel/erase.c
> @@ -0,0 +1,55 @@
> +#include <linux/bug.h>
> +#include <linux/sched.h>
> +#include <asm/current.h>
> +#include <asm/linkage.h>
> +#include <asm/processor.h>
> +
> +asmlinkage void erase_kstack(void)
> +{
> +       unsigned long p = current->thread.lowest_stack;
> +       unsigned long boundary = p & ~(THREAD_SIZE - 1);
> +       unsigned long poison = 0;
> +       const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
> +                                                       sizeof(unsigned long);
> +
> +       /*
> +        * Let's search for the poison value in the stack.
> +        * Start from the lowest_stack and go to the bottom.
> +        */
> +       while (p > boundary && poison <= check_depth) {
> +               if (*(unsigned long *)p == STACKLEAK_POISON)
> +                       poison++;
> +               else
> +                       poison = 0;
> +
> +               p -= sizeof(unsigned long);
> +       }
> +
> +       /*
> +        * One long int at the bottom of the thread stack is reserved and
> +        * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
> +        */
> +       if (p == boundary)
> +               p += sizeof(unsigned long);
> +
> +#ifdef CONFIG_STACKLEAK_METRICS
> +       current->thread.prev_lowest_stack = p;
> +#endif
> +
> +       /*
> +        * So let's write the poison value to the kernel stack.
> +        * Start from the address in p and move up till the new boundary.
> +        */
> +       boundary = current_stack_pointer;

This is the only difference between x86 and arm64 in this code. What
do you think about implementing on_thread_stack() to match x86:

        if (on_thread_stack())
                boundary = current_stack_pointer;
        else
                boundary = current_top_of_stack();

then we could make this common code too instead of having two copies in arch/?
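For illustration, an arm64 on_thread_stack() could be a minimal sketch along
these lines -- the helper doesn't exist on arm64 today, so the name and the
bounds check below are just assumptions to make the idea concrete:

static inline bool on_thread_stack(void)
{
	unsigned long low = (unsigned long)task_stack_page(current);
	unsigned long high = low + THREAD_SIZE;

	/* true only while SP is within the current task's thread stack */
	return current_stack_pointer >= low && current_stack_pointer < high;
}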

> +       BUG_ON(boundary - p >= THREAD_SIZE);
> +
> +       while (p < boundary) {
> +               *(unsigned long *)p = STACKLEAK_POISON;
> +               p += sizeof(unsigned long);
> +       }
> +
> +       /* Reset the lowest_stack value for the next syscall */
> +       current->thread.lowest_stack = current_stack_pointer;
> +}
> +
> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> index f08a2ed9db0d..156fa0a0da19 100644
> --- a/arch/arm64/kernel/process.c
> +++ b/arch/arm64/kernel/process.c
> @@ -364,6 +364,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>         p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>         p->thread.cpu_context.sp = (unsigned long)childregs;
>
> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> +       p->thread.lowest_stack = (unsigned long)task_stack_page(p);
> +#endif
>         ptrace_hw_copy_thread(p);
>
>         return 0;
> @@ -493,3 +496,16 @@ void arch_setup_new_exec(void)
>  {
>         current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
>  }
> +
> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> +void __used check_alloca(unsigned long size)
> +{
> +       unsigned long sp, stack_left;
> +
> +       sp = current_stack_pointer;
> +
> +       stack_left = sp & (THREAD_SIZE - 1);
> +       BUG_ON(stack_left < 256 || size >= stack_left - 256);
> +}
> +EXPORT_SYMBOL(check_alloca);

This is pretty different from x86. Is this just an artifact of ORC, or
something else?

> +#endif
> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
> index a34e9290a699..25dd2a14560d 100644
> --- a/drivers/firmware/efi/libstub/Makefile
> +++ b/drivers/firmware/efi/libstub/Makefile
> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)  += -I$(srctree)/scripts/dtc/libfdt
>  KBUILD_CFLAGS                  := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>                                    -D__NO_FORTIFY \
>                                    $(call cc-option,-ffreestanding) \
> -                                  $(call cc-option,-fno-stack-protector)
> +                                  $(call cc-option,-fno-stack-protector) \
> +                                  $(DISABLE_STACKLEAK_PLUGIN)
>
>  GCOV_PROFILE                   := n
>  KASAN_SANITIZE                 := n
> diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
> index 8d6070fc538f..6cc0e35d3324 100644
> --- a/scripts/Makefile.gcc-plugins
> +++ b/scripts/Makefile.gcc-plugins
> @@ -37,11 +37,14 @@ ifdef CONFIG_GCC_PLUGINS
>
>    gcc-plugin-$(CONFIG_GCC_PLUGIN_STACKLEAK)    += stackleak_plugin.so
>    gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)     += -DSTACKLEAK_PLUGIN -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE)
> +  ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> +    DISABLE_STACKLEAK_PLUGIN           += -fplugin-arg-stackleak_plugin-disable
> +  endif
>
>    GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y))
>
>    export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR
> -  export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN
> +  export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN DISABLE_STACKLEAK_PLUGIN
>
>    ifneq ($(PLUGINCC),)
>      # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication.
> --
> 2.14.3
>

-Kees
Laura Abbott May 2, 2018, 11:07 p.m. UTC | #2
On 05/02/2018 02:31 PM, Kees Cook wrote:
> On Wed, May 2, 2018 at 1:33 PM, Laura Abbott <labbott@redhat.com> wrote:
>>
>> Implementation of stackleak based heavily on the x86 version
> 
> Awesome! Notes below for both you and Alexander, since I think we can
> create a common code base instead of having near-duplicates in the
> arch/ trees...
> 
>>
>> Signed-off-by: Laura Abbott <labbott@redhat.com>
>> ---
>> Now written in C instead of a bunch of assembly.
>> ---
>>   arch/arm64/Kconfig                    |  1 +
>>   arch/arm64/include/asm/processor.h    |  6 ++++
>>   arch/arm64/kernel/Makefile            |  3 ++
>>   arch/arm64/kernel/entry.S             |  6 ++++
>>   arch/arm64/kernel/erase.c             | 55 +++++++++++++++++++++++++++++++++++
>>   arch/arm64/kernel/process.c           | 16 ++++++++++
>>   drivers/firmware/efi/libstub/Makefile |  3 +-
>>   scripts/Makefile.gcc-plugins          |  5 +++-
>>   8 files changed, 93 insertions(+), 2 deletions(-)
>>   create mode 100644 arch/arm64/kernel/erase.c
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index eb2cf4938f6d..b0221db95dc9 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -92,6 +92,7 @@ config ARM64
>>          select HAVE_ARCH_MMAP_RND_BITS
>>          select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
>>          select HAVE_ARCH_SECCOMP_FILTER
>> +       select HAVE_ARCH_STACKLEAK
>>          select HAVE_ARCH_THREAD_STRUCT_WHITELIST
>>          select HAVE_ARCH_TRACEHOOK
>>          select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
>> index 767598932549..d31ab80ff647 100644
>> --- a/arch/arm64/include/asm/processor.h
>> +++ b/arch/arm64/include/asm/processor.h
>> @@ -124,6 +124,12 @@ struct thread_struct {
>>          unsigned long           fault_address;  /* fault info */
>>          unsigned long           fault_code;     /* ESR_EL1 value */
>>          struct debug_info       debug;          /* debugging */
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +       unsigned long           lowest_stack;
>> +#ifdef CONFIG_STACKLEAK_METRICS
>> +       unsigned long           prev_lowest_stack;
>> +#endif
>> +#endif
> 
> I wonder if x86 and arm64 could include a common struct here that was
> empty when the plugin is disabled... it would keep the ifdefs in one
> place. Maybe include/linux/stackleak.h could be:
> 
> ---start---
> /* Poison value points to the unused hole in the virtual memory map */
> #define STACKLEAK_POISON -0xBEEF
> #define STACKLEAK_POISON_CHECK_DEPTH 128
> 
> struct stackleak {
> #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>         unsigned long           lowest;
> #ifdef CONFIG_STACKLEAK_METRICS
>         unsigned long           prev_lowest;
> #endif
> #endif
> };
> 

Is this well defined across all compilers if the plugin is off?
This seems to compile with gcc at least but 0 sized structs
make me a little uneasy.

> asmlinkage void erase_kstack(void);
> ---eof---
> 
> and arch/*/include/asm/processor.h could do:
> 
> @@ -124,6 +124,12 @@ struct thread_struct {
>          unsigned long           fault_address;  /* fault info */
>          unsigned long           fault_code;     /* ESR_EL1 value */
>          struct debug_info       debug;          /* debugging */
> +       struct stackleak         stackleak;
> 
> and arch/x86/entry/erase.c could move to maybe kernel/stackleak.c?
> (Oh, I notice this needs an SPDX line too.)
> 
>>   static inline void arch_thread_struct_whitelist(unsigned long *offset,
>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>> index bf825f38d206..0ceea613c65b 100644
>> --- a/arch/arm64/kernel/Makefile
>> +++ b/arch/arm64/kernel/Makefile
>> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>>   arm64-obj-$(CONFIG_CRASH_DUMP)         += crash_dump.o
>>   arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)  += sdei.o
>>
>> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
>> +KASAN_SANITIZE_erase.o := n
>> +
>>   obj-y                                  += $(arm64-obj-y) vdso/ probes/
>>   obj-m                                  += $(arm64-obj-m)
>>   head-y                                 := head.o
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index ec2ee720e33e..3144f1ebdc18 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -401,6 +401,11 @@ tsk        .req    x28             // current thread_info
>>
>>          .text
>>
>> +       .macro  ERASE_KSTACK
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +       bl      erase_kstack
>> +#endif
>> +       .endm
>>   /*
>>    * Exception vectors.
>>    */
>> @@ -906,6 +911,7 @@ ret_to_user:
>>          cbnz    x2, work_pending
>>   finish_ret_to_user:
>>          enable_step_tsk x1, x2
>> +       ERASE_KSTACK
>>          kernel_exit 0
>>   ENDPROC(ret_to_user)
> 
> Nice. All of the return paths end up here (I went looking for
> ret_from_fork's path). :)
> 
>>
>> diff --git a/arch/arm64/kernel/erase.c b/arch/arm64/kernel/erase.c
>> new file mode 100644
>> index 000000000000..b8b5648d893b
>> --- /dev/null
>> +++ b/arch/arm64/kernel/erase.c
>> @@ -0,0 +1,55 @@
>> +#include <linux/bug.h>
>> +#include <linux/sched.h>
>> +#include <asm/current.h>
>> +#include <asm/linkage.h>
>> +#include <asm/processor.h>
>> +
>> +asmlinkage void erase_kstack(void)
>> +{
>> +       unsigned long p = current->thread.lowest_stack;
>> +       unsigned long boundary = p & ~(THREAD_SIZE - 1);
>> +       unsigned long poison = 0;
>> +       const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
>> +                                                       sizeof(unsigned long);
>> +
>> +       /*
>> +        * Let's search for the poison value in the stack.
>> +        * Start from the lowest_stack and go to the bottom.
>> +        */
>> +       while (p > boundary && poison <= check_depth) {
>> +               if (*(unsigned long *)p == STACKLEAK_POISON)
>> +                       poison++;
>> +               else
>> +                       poison = 0;
>> +
>> +               p -= sizeof(unsigned long);
>> +       }
>> +
>> +       /*
>> +        * One long int at the bottom of the thread stack is reserved and
>> +        * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
>> +        */
>> +       if (p == boundary)
>> +               p += sizeof(unsigned long);
>> +
>> +#ifdef CONFIG_STACKLEAK_METRICS
>> +       current->thread.prev_lowest_stack = p;
>> +#endif
>> +
>> +       /*
>> +        * So let's write the poison value to the kernel stack.
>> +        * Start from the address in p and move up till the new boundary.
>> +        */
>> +       boundary = current_stack_pointer;
> 
> This is the only difference between x86 and arm64 in this code. What
> do you think about implementing on_thread_stack() to match x86:
> 
>          if (on_thread_stack())
>                  boundary = current_stack_pointer;
>          else
>                  boundary = current_top_of_stack();
> 
> then we could make this common code too instead of having two copies in arch/?
> 

The issue isn't on_thread_stack, it's current_top_of_stack which isn't
defined on arm64. I agree it would be good if the code would be common
but I'm not sure how much we want to start trying to force APIs.
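To be concrete, the missing piece would be something like the sketch below
(the name is borrowed from x86; this is not an existing arm64 API, just an
illustration of what we'd be forcing):

static inline unsigned long current_top_of_stack(void)
{
	/* one past the highest usable address of the current thread stack */
	return (unsigned long)task_stack_page(current) + THREAD_SIZE;
}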

>> +       BUG_ON(boundary - p >= THREAD_SIZE);
>> +
>> +       while (p < boundary) {
>> +               *(unsigned long *)p = STACKLEAK_POISON;
>> +               p += sizeof(unsigned long);
>> +       }
>> +
>> +       /* Reset the lowest_stack value for the next syscall */
>> +       current->thread.lowest_stack = current_stack_pointer;
>> +}
>> +
>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>> index f08a2ed9db0d..156fa0a0da19 100644
>> --- a/arch/arm64/kernel/process.c
>> +++ b/arch/arm64/kernel/process.c
>> @@ -364,6 +364,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>>          p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>>          p->thread.cpu_context.sp = (unsigned long)childregs;
>>
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +       p->thread.lowest_stack = (unsigned long)task_stack_page(p);
>> +#endif
>>          ptrace_hw_copy_thread(p);
>>
>>          return 0;
>> @@ -493,3 +496,16 @@ void arch_setup_new_exec(void)
>>   {
>>          current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
>>   }
>> +
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +void __used check_alloca(unsigned long size)
>> +{
>> +       unsigned long sp, stack_left;
>> +
>> +       sp = current_stack_pointer;
>> +
>> +       stack_left = sp & (THREAD_SIZE - 1);
>> +       BUG_ON(stack_left < 256 || size >= stack_left - 256);
>> +}
>> +EXPORT_SYMBOL(check_alloca);
> 
> This is pretty different from x86. Is this just an artifact of ORC, or
> something else?
> 

This was based on the earlier version of x86. I'll confess to
not seeing how the current x86 version ended up with get_stack_info
but I suspect it's either related to ORC unwinding or it's best
practice.

>> +#endif
>> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
>> index a34e9290a699..25dd2a14560d 100644
>> --- a/drivers/firmware/efi/libstub/Makefile
>> +++ b/drivers/firmware/efi/libstub/Makefile
>> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)  += -I$(srctree)/scripts/dtc/libfdt
>>   KBUILD_CFLAGS                  := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>>                                     -D__NO_FORTIFY \
>>                                     $(call cc-option,-ffreestanding) \
>> -                                  $(call cc-option,-fno-stack-protector)
>> +                                  $(call cc-option,-fno-stack-protector) \
>> +                                  $(DISABLE_STACKLEAK_PLUGIN)
>>
>>   GCOV_PROFILE                   := n
>>   KASAN_SANITIZE                 := n
>> diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
>> index 8d6070fc538f..6cc0e35d3324 100644
>> --- a/scripts/Makefile.gcc-plugins
>> +++ b/scripts/Makefile.gcc-plugins
>> @@ -37,11 +37,14 @@ ifdef CONFIG_GCC_PLUGINS
>>
>>     gcc-plugin-$(CONFIG_GCC_PLUGIN_STACKLEAK)    += stackleak_plugin.so
>>     gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)     += -DSTACKLEAK_PLUGIN -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE)
>> +  ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +    DISABLE_STACKLEAK_PLUGIN           += -fplugin-arg-stackleak_plugin-disable
>> +  endif
>>
>>     GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y))
>>
>>     export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR
>> -  export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN
>> +  export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN DISABLE_STACKLEAK_PLUGIN
>>
>>     ifneq ($(PLUGINCC),)
>>       # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication.
>> --
>> 2.14.3
>>
> 
> -Kees
> 

Thanks,
Laura
Kees Cook May 2, 2018, 11:37 p.m. UTC | #3
On Wed, May 2, 2018 at 4:07 PM, Laura Abbott <labbott@redhat.com> wrote:
> On 05/02/2018 02:31 PM, Kees Cook wrote:
>> struct stackleak {
>> #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>         unsigned long           lowest;
>> #ifdef CONFIG_STACKLEAK_METRICS
>>         unsigned long           prev_lowest;
>> #endif
>> #endif
>> };
>>
>
> Is this well defined across all compilers if the plugin is off?
> This seems to compile with gcc at least but 0 sized structs
> make me a little uneasy.

Yup! Or at least, there have been no problems with this and the
seccomp struct, which is empty when !CONFIG_SECCOMP.

>> This is the only difference between x86 and arm64 in this code. What
>> do you think about implementing on_thread_stack() to match x86:
>>
>>          if (on_thread_stack())
>>                  boundary = current_stack_pointer;
>>          else
>>                  boundary = current_top_of_stack();
>>
>> then we could make this common code too instead of having two copies in
>> arch/?
>>
>
> The issue isn't on_thread_stack, it's current_top_of_stack which isn't
> defined on arm64. I agree it would be good if the code would be common
> but I'm not sure how much we want to start trying to force APIs.

Ah, gotcha. Well, I'd rather we had an #ifdef here than two copies of
the code. ;)
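To make that concrete, the shared erase_kstack() could carry a sketch like the
one below -- the config symbol is made up purely for illustration:

#ifdef CONFIG_HAVE_CURRENT_TOP_OF_STACK		/* hypothetical symbol */
	if (on_thread_stack())
		boundary = current_stack_pointer;
	else
		boundary = current_top_of_stack();
#else
	boundary = current_stack_pointer;
#endif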

>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +void __used check_alloca(unsigned long size)
>>> +{
>>> +       unsigned long sp, stack_left;
>>> +
>>> +       sp = current_stack_pointer;
>>> +
>>> +       stack_left = sp & (THREAD_SIZE - 1);
>>> +       BUG_ON(stack_left < 256 || size >= stack_left - 256);
>>> +}
>>> +EXPORT_SYMBOL(check_alloca);
>>
>>
>> This is pretty different from x86. Is this just an artifact of ORC, or
>> something else?
>>
>
> This was based on the earlier version of x86. I'll confess to
> not seeing how the current x86 version ended up with get_stack_info
> but I suspect it's either related to ORC unwinding or it's best
> practice.

Alexander, what was the history here?

-Kees
Mark Rutland May 3, 2018, 7:19 a.m. UTC | #4
Hi Laura,

On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:
> 
> Implementation of stackleak based heavily on the x86 version
> 
> Signed-off-by: Laura Abbott <labbott@redhat.com>
> ---
> Now written in C instead of a bunch of assembly.

This looks neat!

I have a few minor comments below.

> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index bf825f38d206..0ceea613c65b 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>  arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
>  arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
>  
> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
> +KASAN_SANITIZE_erase.o	:= n

I suspect we want to avoid the full set of instrumentation suspects here, e.g.
GCOV, KASAN, UBSAN, and KCOV.

> +
>  obj-y					+= $(arm64-obj-y) vdso/ probes/
>  obj-m					+= $(arm64-obj-m)
>  head-y					:= head.o
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index ec2ee720e33e..3144f1ebdc18 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -401,6 +401,11 @@ tsk	.req	x28		// current thread_info
>  
>  	.text
>  
> +	.macro	ERASE_KSTACK
> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> +	bl	erase_kstack
> +#endif
> +	.endm

Nit: The rest of our asm macros are lower-case -- can we stick to that here?

>  /*
>   * Exception vectors.
>   */
> @@ -906,6 +911,7 @@ ret_to_user:
>  	cbnz	x2, work_pending
>  finish_ret_to_user:
>  	enable_step_tsk x1, x2
> +	ERASE_KSTACK
>  	kernel_exit 0
>  ENDPROC(ret_to_user)

I believe we also need this in ret_fast_syscall.

[...]

> +asmlinkage void erase_kstack(void)
> +{
> +	unsigned long p = current->thread.lowest_stack;
> +	unsigned long boundary = p & ~(THREAD_SIZE - 1);
> +	unsigned long poison = 0;
> +	const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
> +							sizeof(unsigned long);
> +
> +	/*
> +	 * Let's search for the poison value in the stack.
> +	 * Start from the lowest_stack and go to the bottom.
> +	 */
> +	while (p > boundary && poison <= check_depth) {
> +		if (*(unsigned long *)p == STACKLEAK_POISON)
> +			poison++;
> +		else
> +			poison = 0;
> +
> +		p -= sizeof(unsigned long);
> +	}
> +
> +	/*
> +	 * One long int at the bottom of the thread stack is reserved and
> +	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
> +	 */
> +	if (p == boundary)
> +		p += sizeof(unsigned long);

I wonder if end_of_stack() should be taught about CONFIG_SCHED_STACK_END_CHECK,
given that's supposed to return the last *usable* long on the stack, and we
don't account for this elsewhere.

If we did, then IIUC we could do:

	unsigned long boundary = (unsigned long)end_of_stack(current);

... at the start of the function, and not have to worry about this explicitly.
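For illustration, teaching end_of_stack() about that reserved slot might look
roughly like the sketch below (ignoring the CONFIG_STACK_GROWSUP variant; a
sketch only, not a proposed patch):

static inline unsigned long *end_of_stack(const struct task_struct *task)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
	/* skip the STACK_END_MAGIC word reserved at the stack bottom */
	return (unsigned long *)task_stack_page(task) + 1;
#else
	return (unsigned long *)task_stack_page(task);
#endif
}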

> +
> +#ifdef CONFIG_STACKLEAK_METRICS
> +	current->thread.prev_lowest_stack = p;
> +#endif
> +
> +	/*
> +	 * So let's write the poison value to the kernel stack.
> +	 * Start from the address in p and move up till the new boundary.
> +	 */
> +	boundary = current_stack_pointer;

I worry a little that the compiler can move the SP during a function's
lifetime, but maybe that's only the case when there are VLAs, or something like
that?

> +
> +	BUG_ON(boundary - p >= THREAD_SIZE);
> +
> +	while (p < boundary) {
> +		*(unsigned long *)p = STACKLEAK_POISON;
> +		p += sizeof(unsigned long);
> +	}
> +
> +	/* Reset the lowest_stack value for the next syscall */
> +	current->thread.lowest_stack = current_stack_pointer;
> +}

Once this function returns, its data is left on the stack. Is that not a problem?

No strong feelings either way, but it might be worth mentioning in the commit
message.

> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> index f08a2ed9db0d..156fa0a0da19 100644
> --- a/arch/arm64/kernel/process.c
> +++ b/arch/arm64/kernel/process.c
> @@ -364,6 +364,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>  	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>  	p->thread.cpu_context.sp = (unsigned long)childregs;
>  
> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> +	p->thread.lowest_stack = (unsigned long)task_stack_page(p);

Nit: end_of_stack(p) would be slightly better semantically, even though
currently equivalent to task_stack_page(p).

[...]

> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> +void __used check_alloca(unsigned long size)
> +{
> +	unsigned long sp, stack_left;
> +
> +	sp = current_stack_pointer;
> +
> +	stack_left = sp & (THREAD_SIZE - 1);
> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
> +}

Is this arbitrary, or is there something special about 256?

Even if this is arbitrary, can we give it some mnemonic?
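Even a local define would help readers, e.g. something like the sketch below
(the name is made up):

#define STACK_ALLOCA_GUARD	256	/* headroom alloca may never eat into */

	BUG_ON(stack_left < STACK_ALLOCA_GUARD ||
	       size >= stack_left - STACK_ALLOCA_GUARD);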

> +EXPORT_SYMBOL(check_alloca);
> +#endif
> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
> index a34e9290a699..25dd2a14560d 100644
> --- a/drivers/firmware/efi/libstub/Makefile
> +++ b/drivers/firmware/efi/libstub/Makefile
> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
>  KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>  				   -D__NO_FORTIFY \
>  				   $(call cc-option,-ffreestanding) \
> -				   $(call cc-option,-fno-stack-protector)
> +				   $(call cc-option,-fno-stack-protector) \
> +				   $(DISABLE_STACKLEAK_PLUGIN)
>  
>  GCOV_PROFILE			:= n
>  KASAN_SANITIZE			:= n

I believe we'll also need to do this for the KVM hyp code in arch/arm64/kvm/hyp/.

Thanks,
Mark.
Ard Biesheuvel May 3, 2018, 11:37 a.m. UTC | #5
On 3 May 2018 at 09:19, Mark Rutland <mark.rutland@arm.com> wrote:
> Hi Laura,
>
> On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:
>>
>> Implementation of stackleak based heavily on the x86 version
>>
>> Signed-off-by: Laura Abbott <labbott@redhat.com>
>> ---
>> Now written in C instead of a bunch of assembly.
>
> This looks neat!
>
> I have a few minor comments below.
>
>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>> index bf825f38d206..0ceea613c65b 100644
>> --- a/arch/arm64/kernel/Makefile
>> +++ b/arch/arm64/kernel/Makefile
>> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>>  arm64-obj-$(CONFIG_CRASH_DUMP)               += crash_dump.o
>>  arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)        += sdei.o
>>
>> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
>> +KASAN_SANITIZE_erase.o       := n
>
> I suspect we want to avoid the full set of instrumentation suspects here, e.g.
> GCOV, KASAN, UBSAN, and KCOV.
>
>> +
>>  obj-y                                        += $(arm64-obj-y) vdso/ probes/
>>  obj-m                                        += $(arm64-obj-m)
>>  head-y                                       := head.o
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index ec2ee720e33e..3144f1ebdc18 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -401,6 +401,11 @@ tsk      .req    x28             // current thread_info
>>
>>       .text
>>
>> +     .macro  ERASE_KSTACK
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +     bl      erase_kstack
>> +#endif
>> +     .endm
>
> Nit: The rest of our asm macros are lower-case -- can we stick to that here?
>
>>  /*
>>   * Exception vectors.
>>   */
>> @@ -906,6 +911,7 @@ ret_to_user:
>>       cbnz    x2, work_pending
>>  finish_ret_to_user:
>>       enable_step_tsk x1, x2
>> +     ERASE_KSTACK
>>       kernel_exit 0
>>  ENDPROC(ret_to_user)
>
> I believe we also need this in ret_fast_syscall.
>
> [...]
>
>> +asmlinkage void erase_kstack(void)
>> +{
>> +     unsigned long p = current->thread.lowest_stack;
>> +     unsigned long boundary = p & ~(THREAD_SIZE - 1);
>> +     unsigned long poison = 0;
>> +     const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
>> +                                                     sizeof(unsigned long);
>> +
>> +     /*
>> +      * Let's search for the poison value in the stack.
>> +      * Start from the lowest_stack and go to the bottom.
>> +      */
>> +     while (p > boundary && poison <= check_depth) {
>> +             if (*(unsigned long *)p == STACKLEAK_POISON)
>> +                     poison++;
>> +             else
>> +                     poison = 0;
>> +
>> +             p -= sizeof(unsigned long);
>> +     }
>> +
>> +     /*
>> +      * One long int at the bottom of the thread stack is reserved and
>> +      * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
>> +      */
>> +     if (p == boundary)
>> +             p += sizeof(unsigned long);
>
> I wonder if end_of_stack() should be taught about CONFIG_SCHED_STACK_END_CHECK,
> given that's supposed to return the last *usable* long on the stack, and we
> don't account for this elsewhere.
>
> If we did, then IIUC we could do:
>
>         unsigned long boundary = (unsigned long)end_of_stack(current);
>
> ... at the start of the function, and not have to worry about this explicitly.
>
>> +
>> +#ifdef CONFIG_STACKLEAK_METRICS
>> +     current->thread.prev_lowest_stack = p;
>> +#endif
>> +
>> +     /*
>> +      * So let's write the poison value to the kernel stack.
>> +      * Start from the address in p and move up till the new boundary.
>> +      */
>> +     boundary = current_stack_pointer;
>
> I worry a little that the compiler can move the SP during a function's
> lifetime, but maybe that's only the case when there are VLAs, or something like
> that?
>

I think the AAPCS permits the compiler to allocate the stack space for
outgoing variables (i.e., args 9 and beyond or other argument types
that require passing via the stack) at a smaller scope than the entire
function, although GCC does appear to allocate it at the beginning
(based on some quick experiments)


>> +
>> +     BUG_ON(boundary - p >= THREAD_SIZE);
>> +
>> +     while (p < boundary) {
>> +             *(unsigned long *)p = STACKLEAK_POISON;
>> +             p += sizeof(unsigned long);
>> +     }
>> +
>> +     /* Reset the lowest_stack value for the next syscall */
>> +     current->thread.lowest_stack = current_stack_pointer;
>> +}
>
> Once this function returns, its data is left on the stack. Is that not a problem?
>
> No strong feelings either way, but it might be worth mentioning in the commit
> message.
>
>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>> index f08a2ed9db0d..156fa0a0da19 100644
>> --- a/arch/arm64/kernel/process.c
>> +++ b/arch/arm64/kernel/process.c
>> @@ -364,6 +364,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>>       p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>>       p->thread.cpu_context.sp = (unsigned long)childregs;
>>
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +     p->thread.lowest_stack = (unsigned long)task_stack_page(p);
>
> Nit: end_of_stack(p) would be slightly better semantically, even though
> currently equivalent to task_stack_page(p).
>
> [...]
>
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +void __used check_alloca(unsigned long size)
>> +{
>> +     unsigned long sp, stack_left;
>> +
>> +     sp = current_stack_pointer;
>> +
>> +     stack_left = sp & (THREAD_SIZE - 1);
>> +     BUG_ON(stack_left < 256 || size >= stack_left - 256);
>> +}
>
> Is this arbitrary, or is there something special about 256?
>
> Even if this is arbitrary, can we give it some mnemonic?
>
>> +EXPORT_SYMBOL(check_alloca);
>> +#endif
>> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
>> index a34e9290a699..25dd2a14560d 100644
>> --- a/drivers/firmware/efi/libstub/Makefile
>> +++ b/drivers/firmware/efi/libstub/Makefile
>> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)        += -I$(srctree)/scripts/dtc/libfdt
>>  KBUILD_CFLAGS                        := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>>                                  -D__NO_FORTIFY \
>>                                  $(call cc-option,-ffreestanding) \
>> -                                $(call cc-option,-fno-stack-protector)
>> +                                $(call cc-option,-fno-stack-protector) \
>> +                                $(DISABLE_STACKLEAK_PLUGIN)
>>
>>  GCOV_PROFILE                 := n
>>  KASAN_SANITIZE                       := n
>
> I believe we'll also need to do this for the KVM hyp code in arch/arm64/kvm/hyp/.
>
> Thanks,
> Mark.
Alexander Popov May 3, 2018, 4:05 p.m. UTC | #6
Hello Laura and Kees,

On 03.05.2018 02:07, Laura Abbott wrote:
> On 05/02/2018 02:31 PM, Kees Cook wrote:
>> On Wed, May 2, 2018 at 1:33 PM, Laura Abbott <labbott@redhat.com> wrote:
>>>
>>> Implementation of stackleak based heavily on the x86 version
>>
>> Awesome! Notes below for both you and Alexander, since I think we can
>> create a common code base instead of having near-duplicates in the
>> arch/ trees...

Yes, sure.

I will extract the common part and send v12 for x86. Then Laura will be able to
add arm64 support in a separate patch series. Is it fine?

>>> Signed-off-by: Laura Abbott <labbott@redhat.com>
>>> ---
>>> Now written in C instead of a bunch of assembly.
>>> ---
>>>   arch/arm64/Kconfig                    |  1 +
>>>   arch/arm64/include/asm/processor.h    |  6 ++++
>>>   arch/arm64/kernel/Makefile            |  3 ++
>>>   arch/arm64/kernel/entry.S             |  6 ++++
>>>   arch/arm64/kernel/erase.c             | 55 +++++++++++++++++++++++++++++++++++
>>>   arch/arm64/kernel/process.c           | 16 ++++++++++
>>>   drivers/firmware/efi/libstub/Makefile |  3 +-
>>>   scripts/Makefile.gcc-plugins          |  5 +++-
>>>   8 files changed, 93 insertions(+), 2 deletions(-)
>>>   create mode 100644 arch/arm64/kernel/erase.c
>>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index eb2cf4938f6d..b0221db95dc9 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -92,6 +92,7 @@ config ARM64
>>>          select HAVE_ARCH_MMAP_RND_BITS
>>>          select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
>>>          select HAVE_ARCH_SECCOMP_FILTER
>>> +       select HAVE_ARCH_STACKLEAK
>>>          select HAVE_ARCH_THREAD_STRUCT_WHITELIST
>>>          select HAVE_ARCH_TRACEHOOK
>>>          select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>>> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
>>> index 767598932549..d31ab80ff647 100644
>>> --- a/arch/arm64/include/asm/processor.h
>>> +++ b/arch/arm64/include/asm/processor.h
>>> @@ -124,6 +124,12 @@ struct thread_struct {
>>>          unsigned long           fault_address;  /* fault info */
>>>          unsigned long           fault_code;     /* ESR_EL1 value */
>>>          struct debug_info       debug;          /* debugging */
>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +       unsigned long           lowest_stack;
>>> +#ifdef CONFIG_STACKLEAK_METRICS
>>> +       unsigned long           prev_lowest_stack;
>>> +#endif
>>> +#endif
>>
>> I wonder if x86 and arm64 could include a common struct here that was
>> empty when the plugin is disabled... it would keep the ifdefs in one
>> place. Maybe include/linux/stackleak.h could be:
>>
>> ---start---
>> /* Poison value points to the unused hole in the virtual memory map */
>> #define STACKLEAK_POISON -0xBEEF
>> #define STACKLEAK_POISON_CHECK_DEPTH 128
>>
>> struct stackleak {
>> #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>         unsigned long           lowest;
>> #ifdef CONFIG_STACKLEAK_METRICS
>>         unsigned long           prev_lowest;
>> #endif
>> #endif
>> };
>>
> 
> Is this well defined across all compilers if the plugin is off?
> This seems to compile with gcc at least but 0 sized structs
> make me a little uneasy.

An empty struct is not defined by the C standard, but it is permitted by gcc:
https://gcc.gnu.org/onlinedocs/gcc/Empty-Structures.html#Empty-Structures

Quick example:

#include <stdio.h>

int main(void)
{
	struct a {};

	printf("size %zu\n", sizeof(struct a));

	return 0;
}

# gcc -pedantic t.c -o t
t.c: In function ‘main’:
t.c:5:9: warning: struct has no members [-Wpedantic]
  struct a {};
         ^

# clang -Weverything t.c -o tc
t.c:5:2: warning: empty struct has size 0 in C, size 1 in C++ [-Wc++-compat]
        struct a {};
        ^
t.c:5:2: warning: empty struct is a GNU extension [-Wgnu-empty-struct]
2 warnings generated.


But both programs print "size 0". There are a lot of empty structs around the
kernel, so I'll create another one.

>> asmlinkage void erase_kstack(void);
>> ---eof---
>>
>> and arch/*/include/asm/processor.h could do:
>>
>> @@ -124,6 +124,12 @@ struct thread_struct {
>>          unsigned long           fault_address;  /* fault info */
>>          unsigned long           fault_code;     /* ESR_EL1 value */
>>          struct debug_info       debug;          /* debugging */
>> +       struct stackleak         stackleak;
>>
>> and arch/x86/entry/erase.c could move to maybe kernel/stackleak.c?
>> (Oh, I notice this needs an SPDX line too.)

Thanks, I'll add it.

>>>   static inline void arch_thread_struct_whitelist(unsigned long *offset,
>>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>>> index bf825f38d206..0ceea613c65b 100644
>>> --- a/arch/arm64/kernel/Makefile
>>> +++ b/arch/arm64/kernel/Makefile
>>> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>>>   arm64-obj-$(CONFIG_CRASH_DUMP)         += crash_dump.o
>>>   arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)  += sdei.o
>>>
>>> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
>>> +KASAN_SANITIZE_erase.o := n
>>> +
>>>   obj-y                                  += $(arm64-obj-y) vdso/ probes/
>>>   obj-m                                  += $(arm64-obj-m)
>>>   head-y                                 := head.o
>>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>>> index ec2ee720e33e..3144f1ebdc18 100644
>>> --- a/arch/arm64/kernel/entry.S
>>> +++ b/arch/arm64/kernel/entry.S
>>> @@ -401,6 +401,11 @@ tsk        .req    x28             // current thread_info
>>>
>>>          .text
>>>
>>> +       .macro  ERASE_KSTACK
>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +       bl      erase_kstack
>>> +#endif
>>> +       .endm
>>>   /*
>>>    * Exception vectors.
>>>    */
>>> @@ -906,6 +911,7 @@ ret_to_user:
>>>          cbnz    x2, work_pending
>>>   finish_ret_to_user:
>>>          enable_step_tsk x1, x2
>>> +       ERASE_KSTACK
>>>          kernel_exit 0
>>>   ENDPROC(ret_to_user)
>>
>> Nice. All of the return paths end up here (I went looking for
>> ret_from_fork's path). :)
>>
>>>
>>> diff --git a/arch/arm64/kernel/erase.c b/arch/arm64/kernel/erase.c
>>> new file mode 100644
>>> index 000000000000..b8b5648d893b
>>> --- /dev/null
>>> +++ b/arch/arm64/kernel/erase.c
>>> @@ -0,0 +1,55 @@
>>> +#include <linux/bug.h>
>>> +#include <linux/sched.h>
>>> +#include <asm/current.h>
>>> +#include <asm/linkage.h>
>>> +#include <asm/processor.h>
>>> +
>>> +asmlinkage void erase_kstack(void)
>>> +{
>>> +       unsigned long p = current->thread.lowest_stack;
>>> +       unsigned long boundary = p & ~(THREAD_SIZE - 1);
>>> +       unsigned long poison = 0;
>>> +       const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
>>> +                                                       sizeof(unsigned long);
>>> +
>>> +       /*
>>> +        * Let's search for the poison value in the stack.
>>> +        * Start from the lowest_stack and go to the bottom.
>>> +        */
>>> +       while (p > boundary && poison <= check_depth) {
>>> +               if (*(unsigned long *)p == STACKLEAK_POISON)
>>> +                       poison++;
>>> +               else
>>> +                       poison = 0;
>>> +
>>> +               p -= sizeof(unsigned long);
>>> +       }
>>> +
>>> +       /*
>>> +        * One long int at the bottom of the thread stack is reserved and
>>> +        * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
>>> +        */
>>> +       if (p == boundary)
>>> +               p += sizeof(unsigned long);
>>> +
>>> +#ifdef CONFIG_STACKLEAK_METRICS
>>> +       current->thread.prev_lowest_stack = p;
>>> +#endif
>>> +
>>> +       /*
>>> +        * So let's write the poison value to the kernel stack.
>>> +        * Start from the address in p and move up till the new boundary.
>>> +        */
>>> +       boundary = current_stack_pointer;
>>
>> This is the only difference between x86 and arm64 in this code. What
>> do you think about implementing on_thread_stack() to match x86:
>>
>>          if (on_thread_stack())
>>                  boundary = current_stack_pointer;
>>          else
>>                  boundary = current_top_of_stack();
>>
>> then we could make this common code too instead of having two copies in arch/?
>>
> 
> The issue isn't on_thread_stack, it's current_top_of_stack which isn't
> defined on arm64. I agree it would be good if the code would be common
> but I'm not sure how much we want to start trying to force APIs.
> 
>>> +       BUG_ON(boundary - p >= THREAD_SIZE);
>>> +
>>> +       while (p < boundary) {
>>> +               *(unsigned long *)p = STACKLEAK_POISON;
>>> +               p += sizeof(unsigned long);
>>> +       }
>>> +
>>> +       /* Reset the lowest_stack value for the next syscall */
>>> +       current->thread.lowest_stack = current_stack_pointer;
>>> +}
>>> +
>>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>>> index f08a2ed9db0d..156fa0a0da19 100644
>>> --- a/arch/arm64/kernel/process.c
>>> +++ b/arch/arm64/kernel/process.c
>>> @@ -364,6 +364,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>>>          p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>>>          p->thread.cpu_context.sp = (unsigned long)childregs;
>>>
>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +       p->thread.lowest_stack = (unsigned long)task_stack_page(p);
>>> +#endif

I think it should be (unsigned long)task_stack_page(p) + sizeof(unsigned long).

>>>          ptrace_hw_copy_thread(p);
>>>
>>>          return 0;
>>> @@ -493,3 +496,16 @@ void arch_setup_new_exec(void)
>>>   {
>>>          current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
>>>   }
>>> +
>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +void __used check_alloca(unsigned long size)
>>> +{
>>> +       unsigned long sp, stack_left;
>>> +
>>> +       sp = current_stack_pointer;
>>> +
>>> +       stack_left = sp & (THREAD_SIZE - 1);
>>> +       BUG_ON(stack_left < 256 || size >= stack_left - 256);
>>> +}
>>> +EXPORT_SYMBOL(check_alloca);
>>
>> This is pretty different from x86. Is this just an artifact of ORC, or
>> something else?
>>
> 
> This was based on the earlier version of x86. I'll confess to
> not seeing how the current x86 version ended up with get_stack_info
> but I suspect it's either related to ORC unwinding or it's best
> practice.

I've changed that in v4. Quote from the changelog:
  - Fixed the surplus and erroneous code for calculating stack_left in
     check_alloca() on x86_64. That code repeats the work which is already
     done in get_stack_info() and it misses the fact that different
     exception stacks on x86_64 have different size.

http://www.openwall.com/lists/kernel-hardening/2017/10/04/68

We can see that in arch/x86/kernel/dumpstack_64.c.

Is it fine if check_alloca() would be arch-specific?

>>> +#endif
>>> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
>>> index a34e9290a699..25dd2a14560d 100644
>>> --- a/drivers/firmware/efi/libstub/Makefile
>>> +++ b/drivers/firmware/efi/libstub/Makefile
>>> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)  += -I$(srctree)/scripts/dtc/libfdt
>>>   KBUILD_CFLAGS                  := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>>>                                     -D__NO_FORTIFY \
>>>                                     $(call cc-option,-ffreestanding) \
>>> -                                  $(call cc-option,-fno-stack-protector)
>>> +                                  $(call cc-option,-fno-stack-protector) \
>>> +                                  $(DISABLE_STACKLEAK_PLUGIN)
>>>
>>>   GCOV_PROFILE                   := n
>>>   KASAN_SANITIZE                 := n
>>> diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
>>> index 8d6070fc538f..6cc0e35d3324 100644
>>> --- a/scripts/Makefile.gcc-plugins
>>> +++ b/scripts/Makefile.gcc-plugins
>>> @@ -37,11 +37,14 @@ ifdef CONFIG_GCC_PLUGINS
>>>
>>>     gcc-plugin-$(CONFIG_GCC_PLUGIN_STACKLEAK)    += stackleak_plugin.so
>>>     gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)     += -DSTACKLEAK_PLUGIN -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE)
>>> +  ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +    DISABLE_STACKLEAK_PLUGIN           += -fplugin-arg-stackleak_plugin-disable
>>> +  endif
>>>
>>>     GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y))
>>>
>>>     export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR
>>> -  export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN
>>> +  export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN DISABLE_STACKLEAK_PLUGIN
>>>
>>>     ifneq ($(PLUGINCC),)
>>>       # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication.
>>> --
>>> 2.14.3
>>>

Best regards,
Alexander
Kees Cook May 3, 2018, 4:45 p.m. UTC | #7
On Thu, May 3, 2018 at 9:05 AM, Alexander Popov <alex.popov@linux.com> wrote:
> Hello Laura and Kees,
>
> On 03.05.2018 02:07, Laura Abbott wrote:
>> On 05/02/2018 02:31 PM, Kees Cook wrote:
>>> On Wed, May 2, 2018 at 1:33 PM, Laura Abbott <labbott@redhat.com> wrote:
>>>>
>>>> Implementation of stackleak based heavily on the x86 version
>>>
>>> Awesome! Notes below for both you and Alexander, since I think we can
>>> create a common code base instead of having near-duplicates in the
>>> arch/ trees...
>
> Yes, sure.
>
> I will extract the common part and send v12 for x86. Then Laura will be able to
> add arm64 support in a separate patch series. Is it fine?

Sure, though if you could fold the plugin fix from her, that would be
ideal. I'll likely carry both patch sets together once the arm64 one
stabilizes.

>> This was based on the earlier version of x86. I'll confess to
>> not seeing how the current x86 version ended up with get_stack_info
>> but I suspect it's either related to ORC unwinding or it's best
>> practice.
>
> I've changed that in v4. Quote from the changelog:
>   - Fixed the surplus and erroneous code for calculating stack_left in
>      check_alloca() on x86_64. That code repeats the work which is already
>      done in get_stack_info() and it misses the fact that different
>      exception stacks on x86_64 have different size.
>
> http://www.openwall.com/lists/kernel-hardening/2017/10/04/68
>
> We can see that in arch/x86/kernel/dumpstack_64.c.
>
> Is it fine if check_alloca() would be arch-specific?

I'm fine if check_alloca() remains arch-specific.

Thanks!

-Kees
Alexander Popov May 3, 2018, 5:33 p.m. UTC | #8
Hello Mark and Laura,

Let me join the discussion. Mark, thanks for your feedback!

On 03.05.2018 10:19, Mark Rutland wrote:
> Hi Laura,
> 
> On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:
>>
>> Implementation of stackleak based heavily on the x86 version
>>
>> Signed-off-by: Laura Abbott <labbott@redhat.com>
>> ---
>> Now written in C instead of a bunch of assembly.
> 
> This looks neat!
> 
> I have a few minor comments below.
> 
>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>> index bf825f38d206..0ceea613c65b 100644
>> --- a/arch/arm64/kernel/Makefile
>> +++ b/arch/arm64/kernel/Makefile
>> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>>  arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
>>  arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
>>  
>> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
>> +KASAN_SANITIZE_erase.o	:= n
> 
> I suspect we want to avoid the full set of instrumentation suspects here, e.g.
> GCOV, KASAN, UBSAN, and KCOV.

I've disabled KASAN instrumentation for that file on x86 because erase_kstack()
intentionally writes to the stack and causes KASAN false positive reports.

But I didn't see any conflicts with other types of instrumentation that you
mentioned.

>> +
>>  obj-y					+= $(arm64-obj-y) vdso/ probes/
>>  obj-m					+= $(arm64-obj-m)
>>  head-y					:= head.o
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index ec2ee720e33e..3144f1ebdc18 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -401,6 +401,11 @@ tsk	.req	x28		// current thread_info
>>  
>>  	.text
>>  
>> +	.macro	ERASE_KSTACK
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +	bl	erase_kstack
>> +#endif
>> +	.endm
> 
> Nit: The rest of our asm macros are lower-case -- can we stick to that here?
> 
>>  /*
>>   * Exception vectors.
>>   */
>> @@ -906,6 +911,7 @@ ret_to_user:
>>  	cbnz	x2, work_pending
>>  finish_ret_to_user:
>>  	enable_step_tsk x1, x2
>> +	ERASE_KSTACK
>>  	kernel_exit 0
>>  ENDPROC(ret_to_user)
> 
> I believe we also need this in ret_fast_syscall.
> 
> [...]
> 
>> +asmlinkage void erase_kstack(void)
>> +{
>> +	unsigned long p = current->thread.lowest_stack;
>> +	unsigned long boundary = p & ~(THREAD_SIZE - 1);
>> +	unsigned long poison = 0;
>> +	const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
>> +							sizeof(unsigned long);
>> +
>> +	/*
>> +	 * Let's search for the poison value in the stack.
>> +	 * Start from the lowest_stack and go to the bottom.
>> +	 */
>> +	while (p > boundary && poison <= check_depth) {
>> +		if (*(unsigned long *)p == STACKLEAK_POISON)
>> +			poison++;
>> +		else
>> +			poison = 0;
>> +
>> +		p -= sizeof(unsigned long);
>> +	}
>> +
>> +	/*
>> +	 * One long int at the bottom of the thread stack is reserved and
>> +	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
>> +	 */
>> +	if (p == boundary)
>> +		p += sizeof(unsigned long);
> 
> I wonder if end_of_stack() should be taught about CONFIG_SCHED_STACK_END_CHECK,
> given that's supposed to return the last *usable* long on the stack, and we
> don't account for this elsewhere.

I would be afraid to change the meaning of end_of_stack()... Currently it
treats that magic long as usable (include/linux/sched/task_stack.h):

#define task_stack_end_corrupted(task) \
		(*(end_of_stack(task)) != STACK_END_MAGIC)


> If we did, then IIUC we could do:
> 
> 	unsigned long boundary = (unsigned long)end_of_stack(current);
> 
> ... at the start of the function, and not have to worry about this explicitly.

I should mention that erase_kstack() can be called from the x86 trampoline stack.
That's why the boundary is calculated from the lowest_stack.

>> +
>> +#ifdef CONFIG_STACKLEAK_METRICS
>> +	current->thread.prev_lowest_stack = p;
>> +#endif
>> +
>> +	/*
>> +	 * So let's write the poison value to the kernel stack.
>> +	 * Start from the address in p and move up till the new boundary.
>> +	 */
>> +	boundary = current_stack_pointer;
> 
> I worry a little that the compiler can move the SP during a function's
> lifetime, but maybe that's only the case when there are VLAs, or something like
> that?

Oh, I don't know.

However, erase_kstack() doesn't call anything except simple inline functions.
And as I see from its disasm on x86, the local variables reside in registers.

>> +
>> +	BUG_ON(boundary - p >= THREAD_SIZE);
>> +
>> +	while (p < boundary) {
>> +		*(unsigned long *)p = STACKLEAK_POISON;
>> +		p += sizeof(unsigned long);
>> +	}
>> +
>> +	/* Reset the lowest_stack value for the next syscall */
>> +	current->thread.lowest_stack = current_stack_pointer;

Laura, that might be wrong and introduce a huge performance impact.

I think lowest_stack should be reset similarly to the original version.

>> +}
> 
> Once this function returns, its data is left on the stack. Is that not a problem?
> 
> No strong feelings either way, but it might be worth mentioning in the commit
> message.

I managed to bypass that with the "register" specifier, although it doesn't
give an absolute guarantee.
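I.e. roughly the idea below (a sketch of the idea, not the actual x86 code):
keep the cursor in a register so the erasing loop itself ideally leaves no
locals of its own on the stack region it has just wiped:

	register unsigned long p = current->thread.lowest_stack;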

>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>> index f08a2ed9db0d..156fa0a0da19 100644
>> --- a/arch/arm64/kernel/process.c
>> +++ b/arch/arm64/kernel/process.c
>> @@ -364,6 +364,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>>  	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>>  	p->thread.cpu_context.sp = (unsigned long)childregs;
>>  
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +	p->thread.lowest_stack = (unsigned long)task_stack_page(p);
> 
> Nit: end_of_stack(p) would be slightly better semantically, even though
> currently equivalent to task_stack_page(p).

Thanks, I agree, I'll fix it in v12.

> [...]
> 
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +void __used check_alloca(unsigned long size)
>> +{
>> +	unsigned long sp, stack_left;
>> +
>> +	sp = current_stack_pointer;
>> +
>> +	stack_left = sp & (THREAD_SIZE - 1);
>> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
>> +}
> 
> Is this arbitrary, or is there something special about 256?
> 
> Even if this is arbitrary, can we give it some mnemonic?

It's just a reasonable number. We can introduce a macro for it.
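
For example, something along these lines (the macro name below is just a
suggestion and the snippet is untested):

#define MIN_STACK_LEFT	256

void __used check_alloca(unsigned long size)
{
	unsigned long sp, stack_left;

	sp = current_stack_pointer;
	stack_left = sp & (THREAD_SIZE - 1);

	/* Keep MIN_STACK_LEFT bytes in reserve below the alloca */
	BUG_ON(stack_left < MIN_STACK_LEFT ||
				size >= stack_left - MIN_STACK_LEFT);
}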

>> +EXPORT_SYMBOL(check_alloca);
>> +#endif
>> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
>> index a34e9290a699..25dd2a14560d 100644
>> --- a/drivers/firmware/efi/libstub/Makefile
>> +++ b/drivers/firmware/efi/libstub/Makefile
>> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
>>  KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>>  				   -D__NO_FORTIFY \
>>  				   $(call cc-option,-ffreestanding) \
>> -				   $(call cc-option,-fno-stack-protector)
>> +				   $(call cc-option,-fno-stack-protector) \
>> +				   $(DISABLE_STACKLEAK_PLUGIN)
>>  
>>  GCOV_PROFILE			:= n
>>  KASAN_SANITIZE			:= n
> 
> I believe we'll also need to do this for the KVM hyp code in arch/arm64/kvm/hyp/.

Could you please give more details on that? Why STACKLEAK breaks it?

Thanks a lot!

Best regards,
Alexander
Laura Abbott May 3, 2018, 7 p.m. UTC | #9
On 05/03/2018 12:19 AM, Mark Rutland wrote:
> Hi Laura,
> 
> On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:
>>
>> Implementation of stackleak based heavily on the x86 version
>>
>> Signed-off-by: Laura Abbott <labbott@redhat.com>
>> ---
>> Now written in C instead of a bunch of assembly.
> 
> This looks neat!
> 
> I have a few minor comments below.
> 
>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>> index bf825f38d206..0ceea613c65b 100644
>> --- a/arch/arm64/kernel/Makefile
>> +++ b/arch/arm64/kernel/Makefile
>> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>>   arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
>>   arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
>>   
>> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
>> +KASAN_SANITIZE_erase.o	:= n
> 
> I suspect we want to avoid the full set of instrumentation suspects here, e.g.
> GCOV, KASAN, UBSAN, and KCOV.
> 
>> +
>>   obj-y					+= $(arm64-obj-y) vdso/ probes/
>>   obj-m					+= $(arm64-obj-m)
>>   head-y					:= head.o
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index ec2ee720e33e..3144f1ebdc18 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -401,6 +401,11 @@ tsk	.req	x28		// current thread_info
>>   
>>   	.text
>>   
>> +	.macro	ERASE_KSTACK
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +	bl	erase_kstack
>> +#endif
>> +	.endm
> 
> Nit: The rest of our asm macros are lower-case -- can we stick to that here?
> 
>>   /*
>>    * Exception vectors.
>>    */
>> @@ -906,6 +911,7 @@ ret_to_user:
>>   	cbnz	x2, work_pending
>>   finish_ret_to_user:
>>   	enable_step_tsk x1, x2
>> +	ERASE_KSTACK
>>   	kernel_exit 0
>>   ENDPROC(ret_to_user)
> 
> I believe we also need this in ret_fast_syscall.
> 
> [...]
> 

Yeah, I had this in previous versions but I managed to outthink
myself. I'll add it back in with a comment to avoid confusion.

>> +asmlinkage void erase_kstack(void)
>> +{
>> +	unsigned long p = current->thread.lowest_stack;
>> +	unsigned long boundary = p & ~(THREAD_SIZE - 1);
>> +	unsigned long poison = 0;
>> +	const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
>> +							sizeof(unsigned long);
>> +
>> +	/*
>> +	 * Let's search for the poison value in the stack.
>> +	 * Start from the lowest_stack and go to the bottom.
>> +	 */
>> +	while (p > boundary && poison <= check_depth) {
>> +		if (*(unsigned long *)p == STACKLEAK_POISON)
>> +			poison++;
>> +		else
>> +			poison = 0;
>> +
>> +		p -= sizeof(unsigned long);
>> +	}
>> +
>> +	/*
>> +	 * One long int at the bottom of the thread stack is reserved and
>> +	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
>> +	 */
>> +	if (p == boundary)
>> +		p += sizeof(unsigned long);
> 
> I wonder if end_of_stack() should be taught about CONFIG_SCHED_STACK_END_CHECK,
> given that's supposed to return the last *usable* long on the stack, and we
> don't account for this elsewhere.
> 
> If we did, then IIUC we could do:
> 
> 	unsigned long boundary = (unsigned long)end_of_stack(current);
> 
> ... at the start of the function, and not have to worry about this explicitly.
> 
>> +
>> +#ifdef CONFIG_STACKLEAK_METRICS
>> +	current->thread.prev_lowest_stack = p;
>> +#endif
>> +
>> +	/*
>> +	 * So let's write the poison value to the kernel stack.
>> +	 * Start from the address in p and move up till the new boundary.
>> +	 */
>> +	boundary = current_stack_pointer;
> 
> I worry a little that the compiler can move the SP during a function's
> lifetime, but maybe that's only the case when there are VLAs, or something like
> that?
> 

I think that's true and a risk we take writing this in C. Here's
the disassembly on gcc-7.3.1:

ffff00000809d4d8 <erase_kstack>:
ffff00000809d4d8:       a9bf7bfd        stp     x29, x30, [sp, #-16]!
ffff00000809d4dc:       d5384100        mrs     x0, sp_el0
ffff00000809d4e0:       910003fd        mov     x29, sp
ffff00000809d4e4:       f946e400        ldr     x0, [x0, #3528]
ffff00000809d4e8:       9272c404        and     x4, x0, #0xffffffffffffc000
ffff00000809d4ec:       eb04001f        cmp     x0, x4
ffff00000809d4f0:       540002c9        b.ls    ffff00000809d548 <erase_kstack+0x70>  // b.plast
ffff00000809d4f4:       d2800003        mov     x3, #0x0                        // #0
ffff00000809d4f8:       9297ddc5        mov     x5, #0xffffffffffff4111         // #-48879
ffff00000809d4fc:       14000008        b       ffff00000809d51c <erase_kstack+0x44>
ffff00000809d500:       d1002000        sub     x0, x0, #0x8
ffff00000809d504:       52800022        mov     w2, #0x1                        // #1
ffff00000809d508:       eb00009f        cmp     x4, x0
ffff00000809d50c:       d2800003        mov     x3, #0x0                        // #0
ffff00000809d510:       1a9f27e1        cset    w1, cc  // cc = lo, ul, last
ffff00000809d514:       6a01005f        tst     w2, w1
ffff00000809d518:       54000180        b.eq    ffff00000809d548 <erase_kstack+0x70>  // b.none
ffff00000809d51c:       f9400001        ldr     x1, [x0]
ffff00000809d520:       eb05003f        cmp     x1, x5
ffff00000809d524:       54fffee1        b.ne    ffff00000809d500 <erase_kstack+0x28>  // b.any
ffff00000809d528:       91000463        add     x3, x3, #0x1
ffff00000809d52c:       d1002000        sub     x0, x0, #0x8
ffff00000809d530:       f100407f        cmp     x3, #0x10
ffff00000809d534:       1a9f87e2        cset    w2, ls  // ls = plast
ffff00000809d538:       eb00009f        cmp     x4, x0
ffff00000809d53c:       1a9f27e1        cset    w1, cc  // cc = lo, ul, last
ffff00000809d540:       6a01005f        tst     w2, w1
ffff00000809d544:       54fffec1        b.ne    ffff00000809d51c <erase_kstack+0x44>  // b.any
ffff00000809d548:       eb00009f        cmp     x4, x0
ffff00000809d54c:       91002001        add     x1, x0, #0x8
ffff00000809d550:       9a800020        csel    x0, x1, x0, eq  // eq = none
ffff00000809d554:       910003e1        mov     x1, sp
ffff00000809d558:       d5384102        mrs     x2, sp_el0
ffff00000809d55c:       f906e840        str     x0, [x2, #3536]
ffff00000809d560:       cb000023        sub     x3, x1, x0
ffff00000809d564:       d287ffe2        mov     x2, #0x3fff                     // #16383
ffff00000809d568:       eb02007f        cmp     x3, x2
ffff00000809d56c:       540001a8        b.hi    ffff00000809d5a0 <erase_kstack+0xc8>  // b.pmore
ffff00000809d570:       9297ddc2        mov     x2, #0xffffffffffff4111         // #-48879
ffff00000809d574:       eb01001f        cmp     x0, x1
ffff00000809d578:       54000082        b.cs    ffff00000809d588 <erase_kstack+0xb0>  // b.hs, b.nlast
ffff00000809d57c:       f8008402        str     x2, [x0], #8
ffff00000809d580:       eb00003f        cmp     x1, x0
ffff00000809d584:       54ffffc8        b.hi    ffff00000809d57c <erase_kstack+0xa4>  // b.pmore
ffff00000809d588:       910003e1        mov     x1, sp
ffff00000809d58c:       d5384100        mrs     x0, sp_el0
ffff00000809d590:       f906e401        str     x1, [x0, #3528]
ffff00000809d594:       a8c17bfd        ldp     x29, x30, [sp], #16
ffff00000809d598:       d65f03c0        ret
ffff00000809d59c:       d503201f        nop
ffff00000809d5a0:       d4210000        brk     #0x800
ffff00000809d5a4:       00000000        .inst   0x00000000 ; undefined

It looks to be okay although admittedly that's subject to compiler
whims. It might be safer to save the stack pointer almost as soon as
we get into the function and use that?
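
Something like this, maybe (rough sketch, untested; everything except the
early snapshot is taken from the patch above, and I've left the final
lowest_stack reset as it is in the patch):

asmlinkage void erase_kstack(void)
{
	/* Snapshot the SP first thing, before the frame has a chance to move */
	const unsigned long initial_sp = current_stack_pointer;
	unsigned long p = current->thread.lowest_stack;
	unsigned long boundary = p & ~(THREAD_SIZE - 1);
	unsigned long poison = 0;
	const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
							sizeof(unsigned long);

	/* Search for the poison value from lowest_stack down, as before */
	while (p > boundary && poison <= check_depth) {
		if (*(unsigned long *)p == STACKLEAK_POISON)
			poison++;
		else
			poison = 0;

		p -= sizeof(unsigned long);
	}

	/* Keep the reserved long at the bottom (SCHED_STACK_END_CHECK) */
	if (p == boundary)
		p += sizeof(unsigned long);

#ifdef CONFIG_STACKLEAK_METRICS
	current->thread.prev_lowest_stack = p;
#endif

	/* Use the snapshot taken at function entry as the upper boundary */
	boundary = initial_sp;

	BUG_ON(boundary - p >= THREAD_SIZE);

	while (p < boundary) {
		*(unsigned long *)p = STACKLEAK_POISON;
		p += sizeof(unsigned long);
	}

	/* Reset the lowest_stack value for the next syscall */
	current->thread.lowest_stack = initial_sp;
}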

>> +
>> +	BUG_ON(boundary - p >= THREAD_SIZE);
>> +
>> +	while (p < boundary) {
>> +		*(unsigned long *)p = STACKLEAK_POISON;
>> +		p += sizeof(unsigned long);
>> +	}
>> +
>> +	/* Reset the lowest_stack value for the next syscall */
>> +	current->thread.lowest_stack = current_stack_pointer;
>> +}
> 
> Once this function returns, its data is left on the stack. Is that not a problem?
> 
> No strong feelings either way, but it might be worth mentioning in the commit
> message.
> 
>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>> index f08a2ed9db0d..156fa0a0da19 100644
>> --- a/arch/arm64/kernel/process.c
>> +++ b/arch/arm64/kernel/process.c
>> @@ -364,6 +364,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>>   	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>>   	p->thread.cpu_context.sp = (unsigned long)childregs;
>>   
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +	p->thread.lowest_stack = (unsigned long)task_stack_page(p);
> 
> Nit: end_of_stack(p) would be slightly better semantically, even though
> currently equivalent to task_stack_page(p).
> 
> [...]
> 
>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> +void __used check_alloca(unsigned long size)
>> +{
>> +	unsigned long sp, stack_left;
>> +
>> +	sp = current_stack_pointer;
>> +
>> +	stack_left = sp & (THREAD_SIZE - 1);
>> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
>> +}
> 
> Is this arbitrary, or is there something special about 256?
> 
> Even if this is arbitrary, can we give it some mnemonic?
> 
>> +EXPORT_SYMBOL(check_alloca);
>> +#endif
>> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
>> index a34e9290a699..25dd2a14560d 100644
>> --- a/drivers/firmware/efi/libstub/Makefile
>> +++ b/drivers/firmware/efi/libstub/Makefile
>> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
>>   KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>>   				   -D__NO_FORTIFY \
>>   				   $(call cc-option,-ffreestanding) \
>> -				   $(call cc-option,-fno-stack-protector)
>> +				   $(call cc-option,-fno-stack-protector) \
>> +				   $(DISABLE_STACKLEAK_PLUGIN)
>>   
>>   GCOV_PROFILE			:= n
>>   KASAN_SANITIZE			:= n
> 
> I believe we'll also need to do this for the KVM hyp code in arch/arm64/kvm/hyp/.
> 
> Thanks,
> Mark.
> 

Thanks,
Laura
Laura Abbott May 3, 2018, 7:09 p.m. UTC | #10
On 05/03/2018 10:33 AM, Alexander Popov wrote:
> Hello Mark and Laura,
> 
> Let me join the discussion. Mark, thanks for your feedback!
> 
> On 03.05.2018 10:19, Mark Rutland wrote:
>> Hi Laura,
>>
>> On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:
>>>
>>> Implementation of stackleak based heavily on the x86 version
>>>
>>> Signed-off-by: Laura Abbott <labbott@redhat.com>
>>> ---
>>> Now written in C instead of a bunch of assembly.
>>
>> This looks neat!
>>
>> I have a few minor comments below.
>>
>>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>>> index bf825f38d206..0ceea613c65b 100644
>>> --- a/arch/arm64/kernel/Makefile
>>> +++ b/arch/arm64/kernel/Makefile
>>> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>>>   arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
>>>   arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
>>>   
>>> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
>>> +KASAN_SANITIZE_erase.o	:= n
>>
>> I suspect we want to avoid the full set of instrumentation suspects here, e.g.
>> GCOV, KASAN, UBSAN, and KCOV.
> 
> I've disabled KASAN instrumentation for that file on x86 because erase_kstack()
> intentionally writes to the stack and causes KASAN false positive reports.
> 
> But I didn't see any conflicts with other types of instrumentation that you
> mentioned.
> 
>>> +
>>>   obj-y					+= $(arm64-obj-y) vdso/ probes/
>>>   obj-m					+= $(arm64-obj-m)
>>>   head-y					:= head.o
>>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>>> index ec2ee720e33e..3144f1ebdc18 100644
>>> --- a/arch/arm64/kernel/entry.S
>>> +++ b/arch/arm64/kernel/entry.S
>>> @@ -401,6 +401,11 @@ tsk	.req	x28		// current thread_info
>>>   
>>>   	.text
>>>   
>>> +	.macro	ERASE_KSTACK
>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +	bl	erase_kstack
>>> +#endif
>>> +	.endm
>>
>> Nit: The rest of our asm macros are lower-case -- can we stick to that here?
>>
>>>   /*
>>>    * Exception vectors.
>>>    */
>>> @@ -906,6 +911,7 @@ ret_to_user:
>>>   	cbnz	x2, work_pending
>>>   finish_ret_to_user:
>>>   	enable_step_tsk x1, x2
>>> +	ERASE_KSTACK
>>>   	kernel_exit 0
>>>   ENDPROC(ret_to_user)
>>
>> I believe we also need this in ret_fast_syscall.
>>
>> [...]
>>
>>> +asmlinkage void erase_kstack(void)
>>> +{
>>> +	unsigned long p = current->thread.lowest_stack;
>>> +	unsigned long boundary = p & ~(THREAD_SIZE - 1);
>>> +	unsigned long poison = 0;
>>> +	const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
>>> +							sizeof(unsigned long);
>>> +
>>> +	/*
>>> +	 * Let's search for the poison value in the stack.
>>> +	 * Start from the lowest_stack and go to the bottom.
>>> +	 */
>>> +	while (p > boundary && poison <= check_depth) {
>>> +		if (*(unsigned long *)p == STACKLEAK_POISON)
>>> +			poison++;
>>> +		else
>>> +			poison = 0;
>>> +
>>> +		p -= sizeof(unsigned long);
>>> +	}
>>> +
>>> +	/*
>>> +	 * One long int at the bottom of the thread stack is reserved and
>>> +	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
>>> +	 */
>>> +	if (p == boundary)
>>> +		p += sizeof(unsigned long);
>>
>> I wonder if end_of_stack() should be taught about CONFIG_SCHED_STACK_END_CHECK,
>> given that's supposed to return the last *usable* long on the stack, and we
>> don't account for this elsewhere.
> 
> I would be afraid to change the meaning of end_of_stack()... Currently it
> considers that magic long as usable (include/linux/sched/task_stack.h):
> 
> #define task_stack_end_corrupted(task) \
> 		(*(end_of_stack(task)) != STACK_END_MAGIC)
> 
> 
>> If we did, then IIUC we could do:
>>
>> 	unsigned long boundary = (unsigned long)end_of_stack(current);
>>
>> ... at the start of the function, and not have to worry about this explicitly.
> 
> I should mention that erase_kstack() can be called from x86 trampoline stack.
> That's why the boundary is calculated from the lowest_stack.
> 
>>> +
>>> +#ifdef CONFIG_STACKLEAK_METRICS
>>> +	current->thread.prev_lowest_stack = p;
>>> +#endif
>>> +
>>> +	/*
>>> +	 * So let's write the poison value to the kernel stack.
>>> +	 * Start from the address in p and move up till the new boundary.
>>> +	 */
>>> +	boundary = current_stack_pointer;
>>
>> I worry a little that the compiler can move the SP during a function's
>> lifetime, but maybe that's only the case when there are VLAs, or something like
>> that?
> 
> Oh, I don't know.
> 
> However, erase_kstack() doesn't call anything except simple inline functions.
> And as I see from its disasm on x86, the local variables reside in registers.
> 
>>> +
>>> +	BUG_ON(boundary - p >= THREAD_SIZE);
>>> +
>>> +	while (p < boundary) {
>>> +		*(unsigned long *)p = STACKLEAK_POISON;
>>> +		p += sizeof(unsigned long);
>>> +	}
>>> +
>>> +	/* Reset the lowest_stack value for the next syscall */
>>> +	current->thread.lowest_stack = current_stack_pointer;
> 
> Laura, that might be wrong and introduce a huge performance impact.
>
> I think lowest_stack should be reset similarly to the original version.
> 

Sorry, I'm not understanding here. What's the performance impact and
what do you mean by original version?

>>> +}
>>
>> Once this function returns, its data is left on the stack. Is that not a problem?
>>
>> No strong feelings either way, but it might be worth mentioning in the commit
>> message.
> 
> I managed to bypass that with "register" specifier. Although it doesn't give an
> absolute guarantee.
> 

I guess I was assuming gcc would be smart enough not to spill stuff
on the stack. I also intentionally removed the register keyword
since it wasn't clear gcc does much with it on a modern system? I
could be completely off base here though so please correct me if
I'm wrong. It probably is worth documenting what we are assuming about
the compiler here.
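
Maybe a comment block at the top of erase_kstack() along these lines, as a
starting point (just a first stab at the wording):

/*
 * This function relies on a couple of best-effort assumptions about the
 * compiler, none of which are guaranteed by the C standard:
 *
 *  - the locals stay in registers, so nothing sensitive is spilled onto
 *    the region of the stack we are about to poison;
 *  - the stack frame is not grown (e.g. by VLAs or late spills) after
 *    the stack pointer has been read for the upper boundary.
 *
 * If either assumption turns out to be violated in practice, this
 * probably has to be rewritten in assembly.
 */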


>>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>>> index f08a2ed9db0d..156fa0a0da19 100644
>>> --- a/arch/arm64/kernel/process.c
>>> +++ b/arch/arm64/kernel/process.c
>>> @@ -364,6 +364,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>>>   	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>>>   	p->thread.cpu_context.sp = (unsigned long)childregs;
>>>   
>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +	p->thread.lowest_stack = (unsigned long)task_stack_page(p);
>>
>> Nit: end_of_stack(p) would be slightly better semantically, even though
>> currently equivalent to task_stack_page(p).
> 
> Thanks, I agree, I'll fix it in v12.
> 
>> [...]
>>
>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>> +void __used check_alloca(unsigned long size)
>>> +{
>>> +	unsigned long sp, stack_left;
>>> +
>>> +	sp = current_stack_pointer;
>>> +
>>> +	stack_left = sp & (THREAD_SIZE - 1);
>>> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
>>> +}
>>
>> Is this arbitrary, or is there something special about 256?
>>
>> Even if this is arbitrary, can we give it some mnemonic?
> 
> It's just a reasonable number. We can introduce a macro for it.
> 
>>> +EXPORT_SYMBOL(check_alloca);
>>> +#endif
>>> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
>>> index a34e9290a699..25dd2a14560d 100644
>>> --- a/drivers/firmware/efi/libstub/Makefile
>>> +++ b/drivers/firmware/efi/libstub/Makefile
>>> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
>>>   KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>>>   				   -D__NO_FORTIFY \
>>>   				   $(call cc-option,-ffreestanding) \
>>> -				   $(call cc-option,-fno-stack-protector)
>>> +				   $(call cc-option,-fno-stack-protector) \
>>> +				   $(DISABLE_STACKLEAK_PLUGIN)
>>>   
>>>   GCOV_PROFILE			:= n
>>>   KASAN_SANITIZE			:= n
>>
>> I believe we'll also need to do this for the KVM hyp code in arch/arm64/kvm/hyp/.
> 
> Could you please give more details on that? Why STACKLEAK breaks it?
> 

For reference, I originally added this for the efistub because
it would not compile. I did compile this against my Fedora tree
which has KVM enabled.

> Thanks a lot!
> 
> Best regards,
> Alexander
> 

Thanks,
Laura
Alexander Popov May 4, 2018, 8:30 a.m. UTC | #11
On 03.05.2018 22:09, Laura Abbott wrote:
> On 05/03/2018 10:33 AM, Alexander Popov wrote:
>> On 03.05.2018 10:19, Mark Rutland wrote:
>>> On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:
>>>> +	/* Reset the lowest_stack value for the next syscall */
>>>> +	current->thread.lowest_stack = current_stack_pointer;
>>
>> Laura, that might be wrong and introduce a huge performance impact.
>>
>> I think lowest_stack should be reset similarly to the original version.
>>
> 
> Sorry, I'm not understanding here. What's the performance impact and
> what do you mean by original version?

I meant the code for x86:
	/* Reset the lowest_stack value for the next syscall */
	current->thread.lowest_stack = current_top_of_stack() - 256;

...Now that I'm writing up the performance impact, I see that I was wrong
about "huge". Excuse me.

Let me describe the implications of this code change.

So we are at the end of a syscall and have just erased the used part of the
kernel stack. The current stack pointer is near the top of the stack: on x86_64
I see that it is the stack top minus 56 bytes (just before switching onto the
trampoline stack).

I took the idea of resetting lowest_stack to the stack top minus 256 from the
PaX Team's original code. It should give a speedup when lowest_stack is not
updated during a syscall (a lot of functions are not instrumented), since we
then start searching for the poison value from that reasonable point.

As for the common erase_kstack() code, this change can break x86, because there
the function can be called from the trampoline stack (which is separate from
the thread stack).
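
For the arm64 code, a reset along the same lines would look something like
this (untested, and I'm not sure which helper is preferred for the stack top
on arm64, so the expression below is only an illustration):

	/* Reset to just below the stack top, as the x86 version does */
	current->thread.lowest_stack =
			(unsigned long)task_stack_page(current) + THREAD_SIZE - 256;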

>>>> +}
>>>
>>> Once this function returns, its data is left on the stack. Is that not a problem?
>>>
>>> No strong feelings either way, but it might be worth mentioning in the commit
>>> message.
>>
>> I managed to bypass that with "register" specifier. Although it doesn't give an
>> absolute guarantee.
>>
> 
> I guess I was assuming gcc would be smart enough not to spill stuff
> on the stack. I also intentionally removed the register keyword
> since it wasn't clear gcc does much with it on a modern system? I
> could be completely off base here though so please correct me if
> I'm wrong. It probably is worth documenting what we are assuming about
> the compiler here.

I think having the register storage class specifier here is a bit better than
nothing. And yes, I'll add a comment. Right now I don't see a better solution.

>>>> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
>>>> index a34e9290a699..25dd2a14560d 100644
>>>> --- a/drivers/firmware/efi/libstub/Makefile
>>>> +++ b/drivers/firmware/efi/libstub/Makefile
>>>> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
>>>>   KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>>>>   				   -D__NO_FORTIFY \
>>>>   				   $(call cc-option,-ffreestanding) \
>>>> -				   $(call cc-option,-fno-stack-protector)
>>>> +				   $(call cc-option,-fno-stack-protector) \
>>>> +				   $(DISABLE_STACKLEAK_PLUGIN)
>>>>   
>>>>   GCOV_PROFILE			:= n
>>>>   KASAN_SANITIZE			:= n
>>>
>>> I believe we'll also need to do this for the KVM hyp code in arch/arm64/kvm/hyp/.
>>
>> Could you please give more details on that? Why STACKLEAK breaks it?
>>
> 
> For reference, I originally added this for the efistub because
> it would not compile.

I guess it was a linkage error, right?

> I did compile this against my Fedora tree which has KVM enabled.

I looked through this big article about ARM, KVM and HYP mode:
https://lwn.net/Articles/557132/

So we have a limited amount of kernel code which runs in HYP mode. Is it only
in the arch/arm64/kvm/hyp/ directory?

Mark, could you give a clue about what trouble we will have if we call
track_stack() or check_alloca() from that code?

Thanks in advance!

--
Alexander
Mark Rutland May 4, 2018, 11:09 a.m. UTC | #12
On Thu, May 03, 2018 at 08:33:38PM +0300, Alexander Popov wrote:
> Hello Mark and Laura,
> 
> Let me join the discussion. Mark, thanks for your feedback!
> 
> On 03.05.2018 10:19, Mark Rutland wrote:
> > Hi Laura,
> > 
> > On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:
> >>
> >> Implementation of stackleak based heavily on the x86 version
> >>
> >> Signed-off-by: Laura Abbott <labbott@redhat.com>
> >> ---
> >> Now written in C instead of a bunch of assembly.
> > 
> > This looks neat!
> > 
> > I have a few minor comments below.
> > 
> >> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> >> index bf825f38d206..0ceea613c65b 100644
> >> --- a/arch/arm64/kernel/Makefile
> >> +++ b/arch/arm64/kernel/Makefile
> >> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
> >>  arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
> >>  arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
> >>  
> >> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
> >> +KASAN_SANITIZE_erase.o	:= n
> > 
> > I suspect we want to avoid the full set of instrumentation suspects here, e.g.
> > GCOV, KASAN, UBSAN, and KCOV.
> 
> I've disabled KASAN instrumentation for that file on x86 because erase_kstack()
> intentionally writes to the stack and causes KASAN false positive reports.
> 
> But I didn't see any conflicts with other types of instrumentation that you
> mentioned.

The rationale is that any of these can result in implicit calls to C
functions at arbitrary points during erase_kstack(). That could
interfere with the search for poison, and/or leave data on the stack
which is not erased.

They won't result in hard failures, as KASAN would, but we should
probably avoid them regardless.

[...]

> >> +asmlinkage void erase_kstack(void)
> >> +{
> >> +	unsigned long p = current->thread.lowest_stack;
> >> +	unsigned long boundary = p & ~(THREAD_SIZE - 1);
> >> +	unsigned long poison = 0;
> >> +	const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
> >> +							sizeof(unsigned long);
> >> +
> >> +	/*
> >> +	 * Let's search for the poison value in the stack.
> >> +	 * Start from the lowest_stack and go to the bottom.
> >> +	 */
> >> +	while (p > boundary && poison <= check_depth) {
> >> +		if (*(unsigned long *)p == STACKLEAK_POISON)
> >> +			poison++;
> >> +		else
> >> +			poison = 0;
> >> +
> >> +		p -= sizeof(unsigned long);
> >> +	}
> >> +
> >> +	/*
> >> +	 * One long int at the bottom of the thread stack is reserved and
> >> +	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
> >> +	 */
> >> +	if (p == boundary)
> >> +		p += sizeof(unsigned long);
> > 
> > I wonder if end_of_stack() should be taught about CONFIG_SCHED_STACK_END_CHECK,
> > given that's supposed to return the last *usable* long on the stack, and we
> > don't account for this elsewhere.
> 
> I would be afraid to change the meaning of end_of_stack()... Currently it
> considers that magic long as usable (include/linux/sched/task_stack.h):
> 
> #define task_stack_end_corrupted(task) \
> 		(*(end_of_stack(task)) != STACK_END_MAGIC)
> 
> 
> > If we did, then IIUC we could do:
> > 
> > 	unsigned long boundary = (unsigned long)end_of_stack(current);
> > 
> > ... at the start of the function, and not have to worry about this explicitly.
> 
> I should mention that erase_kstack() can be called from x86 trampoline stack.
> That's why the boundary is calculated from the lowest_stack.

Ok. Under what circumstances does that happen?

It seems a little scary that current::thread::lowest_stack might not be
on current's task stack. Is that reset when transitioning to/from the
trampoline stack?

[...]

> >> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> >> +void __used check_alloca(unsigned long size)
> >> +{
> >> +	unsigned long sp, stack_left;
> >> +
> >> +	sp = current_stack_pointer;
> >> +
> >> +	stack_left = sp & (THREAD_SIZE - 1);
> >> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
> >> +}
> > 
> > Is this arbitrary, or is there something special about 256?
> > 
> > Even if this is arbitrary, can we give it some mnemonic?
> 
> It's just a reasonable number. We can introduce a macro for it.

I'm just not sure I see the point in the offset, given things like
VMAP_STACK exist. BUG_ON() handling will likely require *more* than 256
bytes of stack, so it seems superfluous, as we'd be relying on stack
overflow detection at that point.

I can see that we should take the CONFIG_SCHED_STACK_END_CHECK offset
into account, though.

> >> +EXPORT_SYMBOL(check_alloca);
> >> +#endif
> >> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
> >> index a34e9290a699..25dd2a14560d 100644
> >> --- a/drivers/firmware/efi/libstub/Makefile
> >> +++ b/drivers/firmware/efi/libstub/Makefile
> >> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
> >>  KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
> >>  				   -D__NO_FORTIFY \
> >>  				   $(call cc-option,-ffreestanding) \
> >> -				   $(call cc-option,-fno-stack-protector)
> >> +				   $(call cc-option,-fno-stack-protector) \
> >> +				   $(DISABLE_STACKLEAK_PLUGIN)
> >>  
> >>  GCOV_PROFILE			:= n
> >>  KASAN_SANITIZE			:= n
> > 
> > I believe we'll also need to do this for the KVM hyp code in arch/arm64/kvm/hyp/.
> 
> Could you please give more details on that? Why STACKLEAK breaks it?

In the hyp/EL2 exception level, we only map the hyp text, and not the
rest of the kernel. So erase_kstack and check_alloca won't be mapped,
and any attempt to branch to them will fault.

Even if it were mapped, things like BUG_ON(), get_current(), etc do not
work at hyp.

Additionally, the hyp code is mapped as a different virtual address from
the rest of the kernel, so if any of the STACKLEAK code happens to use
an absolute address, this will not work correctly.

Thanks,
Mark.
Mark Rutland May 4, 2018, 11:16 a.m. UTC | #13
On Thu, May 03, 2018 at 12:00:26PM -0700, Laura Abbott wrote:
> On 05/03/2018 12:19 AM, Mark Rutland wrote:
> > On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:

> > > +asmlinkage void erase_kstack(void)
> > > +{

> > > +
> > > +	/*
> > > +	 * So let's write the poison value to the kernel stack.
> > > +	 * Start from the address in p and move up till the new boundary.
> > > +	 */
> > > +	boundary = current_stack_pointer;
> > 
> > I worry a little that the compiler can move the SP during a function's
> > lifetime, but maybe that's only the case when there are VLAs, or something like
> > that?
> 
> I think that's true and a risk we take writing this in C. Here's
> the disassembly on gcc-7.3.1:
> 
> ffff00000809d4d8 <erase_kstack>:
> ffff00000809d4d8:       a9bf7bfd        stp     x29, x30, [sp, #-16]!
> ffff00000809d4dc:       d5384100        mrs     x0, sp_el0
> ffff00000809d4e0:       910003fd        mov     x29, sp
> ffff00000809d4e4:       f946e400        ldr     x0, [x0, #3528]
> ffff00000809d4e8:       9272c404        and     x4, x0, #0xffffffffffffc000
> ffff00000809d4ec:       eb04001f        cmp     x0, x4
> ffff00000809d4f0:       540002c9        b.ls    ffff00000809d548 <erase_kstack+0x70>  // b.plast
> ffff00000809d4f4:       d2800003        mov     x3, #0x0                        // #0
> ffff00000809d4f8:       9297ddc5        mov     x5, #0xffffffffffff4111         // #-48879
> ffff00000809d4fc:       14000008        b       ffff00000809d51c <erase_kstack+0x44>
> ffff00000809d500:       d1002000        sub     x0, x0, #0x8
> ffff00000809d504:       52800022        mov     w2, #0x1                        // #1
> ffff00000809d508:       eb00009f        cmp     x4, x0
> ffff00000809d50c:       d2800003        mov     x3, #0x0                        // #0
> ffff00000809d510:       1a9f27e1        cset    w1, cc  // cc = lo, ul, last
> ffff00000809d514:       6a01005f        tst     w2, w1
> ffff00000809d518:       54000180        b.eq    ffff00000809d548 <erase_kstack+0x70>  // b.none
> ffff00000809d51c:       f9400001        ldr     x1, [x0]
> ffff00000809d520:       eb05003f        cmp     x1, x5
> ffff00000809d524:       54fffee1        b.ne    ffff00000809d500 <erase_kstack+0x28>  // b.any
> ffff00000809d528:       91000463        add     x3, x3, #0x1
> ffff00000809d52c:       d1002000        sub     x0, x0, #0x8
> ffff00000809d530:       f100407f        cmp     x3, #0x10
> ffff00000809d534:       1a9f87e2        cset    w2, ls  // ls = plast
> ffff00000809d538:       eb00009f        cmp     x4, x0
> ffff00000809d53c:       1a9f27e1        cset    w1, cc  // cc = lo, ul, last
> ffff00000809d540:       6a01005f        tst     w2, w1
> ffff00000809d544:       54fffec1        b.ne    ffff00000809d51c <erase_kstack+0x44>  // b.any
> ffff00000809d548:       eb00009f        cmp     x4, x0
> ffff00000809d54c:       91002001        add     x1, x0, #0x8
> ffff00000809d550:       9a800020        csel    x0, x1, x0, eq  // eq = none
> ffff00000809d554:       910003e1        mov     x1, sp
> ffff00000809d558:       d5384102        mrs     x2, sp_el0
> ffff00000809d55c:       f906e840        str     x0, [x2, #3536]
> ffff00000809d560:       cb000023        sub     x3, x1, x0
> ffff00000809d564:       d287ffe2        mov     x2, #0x3fff                     // #16383
> ffff00000809d568:       eb02007f        cmp     x3, x2
> ffff00000809d56c:       540001a8        b.hi    ffff00000809d5a0 <erase_kstack+0xc8>  // b.pmore
> ffff00000809d570:       9297ddc2        mov     x2, #0xffffffffffff4111         // #-48879
> ffff00000809d574:       eb01001f        cmp     x0, x1
> ffff00000809d578:       54000082        b.cs    ffff00000809d588 <erase_kstack+0xb0>  // b.hs, b.nlast
> ffff00000809d57c:       f8008402        str     x2, [x0], #8
> ffff00000809d580:       eb00003f        cmp     x1, x0
> ffff00000809d584:       54ffffc8        b.hi    ffff00000809d57c <erase_kstack+0xa4>  // b.pmore
> ffff00000809d588:       910003e1        mov     x1, sp
> ffff00000809d58c:       d5384100        mrs     x0, sp_el0
> ffff00000809d590:       f906e401        str     x1, [x0, #3528]
> ffff00000809d594:       a8c17bfd        ldp     x29, x30, [sp], #16
> ffff00000809d598:       d65f03c0        ret
> ffff00000809d59c:       d503201f        nop
> ffff00000809d5a0:       d4210000        brk     #0x800
> ffff00000809d5a4:       00000000        .inst   0x00000000 ; undefined
> 
> It looks to be okay although admittedly that's subject to compiler
> whims. It might be safer to save the stack pointer almost as soon as
> we get into the function and use that?

I think that's still potentially a problem. If the compiler expands the
stack frame after we've taken a snapshot of the stack pointer, we might
end up erasing portions of the active stack frame.

Maybe we should just document that we rely on the compiler not doing that,
and if we end up seeing it in practice we rewrite this in asm? I can't
think of a simple way we can auto-detect if this happens. :/

Thanks,
Mark.
Alexander Popov May 6, 2018, 8:22 a.m. UTC | #14
On 04.05.2018 14:09, Mark Rutland wrote:
> On Thu, May 03, 2018 at 08:33:38PM +0300, Alexander Popov wrote:
>> Hello Mark and Laura,
>>
>> Let me join the discussion. Mark, thanks for your feedback!
>>
>> On 03.05.2018 10:19, Mark Rutland wrote:
>>> Hi Laura,
>>>
>>> On Wed, May 02, 2018 at 01:33:26PM -0700, Laura Abbott wrote:
>>>>
>>>> Implementation of stackleak based heavily on the x86 version
>>>>
>>>> Signed-off-by: Laura Abbott <labbott@redhat.com>
>>>> ---
>>>> Now written in C instead of a bunch of assembly.
>>>
>>> This looks neat!
>>>
>>> I have a few minor comments below.
>>>
>>>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>>>> index bf825f38d206..0ceea613c65b 100644
>>>> --- a/arch/arm64/kernel/Makefile
>>>> +++ b/arch/arm64/kernel/Makefile
>>>> @@ -55,6 +55,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
>>>>  arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
>>>>  arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
>>>>  
>>>> +arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
>>>> +KASAN_SANITIZE_erase.o	:= n
>>>
>>> I suspect we want to avoid the full set of instrumentation suspects here, e.g.
>>> GCOV, KASAN, UBSAN, and KCOV.
>>
>> I've disabled KASAN instrumentation for that file on x86 because erase_kstack()
>> intentionally writes to the stack and causes KASAN false positive reports.
>>
>> But I didn't see any conflicts with other types of instrumentation that you
>> mentioned.
> 
> The rationale is that any of these can result in implicit calls to C
> functions at arbitrary points during erase_kstack(). That could
> interfere with the search for poison, and/or leave data on the stack
> which is not erased.
> 
> They won't result in hard failures, as KASAN would, but we should
> probably avoid them regardless.

Thanks, Mark! Agreed about KCOV; I'll switch it off for that file.

And I think I should _not_ disable UBSAN for that file. I didn't introduce any
intentional UB, so if UBSAN finds anything, it will be a true positive report.

> [...]
> 
>>>> +asmlinkage void erase_kstack(void)
>>>> +{
>>>> +	unsigned long p = current->thread.lowest_stack;
>>>> +	unsigned long boundary = p & ~(THREAD_SIZE - 1);
>>>> +	unsigned long poison = 0;
>>>> +	const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
>>>> +							sizeof(unsigned long);
>>>> +
>>>> +	/*
>>>> +	 * Let's search for the poison value in the stack.
>>>> +	 * Start from the lowest_stack and go to the bottom.
>>>> +	 */
>>>> +	while (p > boundary && poison <= check_depth) {
>>>> +		if (*(unsigned long *)p == STACKLEAK_POISON)
>>>> +			poison++;
>>>> +		else
>>>> +			poison = 0;
>>>> +
>>>> +		p -= sizeof(unsigned long);
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * One long int at the bottom of the thread stack is reserved and
>>>> +	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
>>>> +	 */
>>>> +	if (p == boundary)
>>>> +		p += sizeof(unsigned long);
>>>
>>> I wonder if end_of_stack() should be taught about CONFIG_SCHED_STACK_END_CHECK,
>>> given that's supposed to return the last *usable* long on the stack, and we
>>> don't account for this elsewhere.
>>
>> I would be afraid to change the meaning of end_of_stack()... Currently it
>> considers that magic long as usable (include/linux/sched/task_stack.h):
>>
>> #define task_stack_end_corrupted(task) \
>> 		(*(end_of_stack(task)) != STACK_END_MAGIC)
>>
>>
>>> If we did, then IIUC we could do:
>>>
>>> 	unsigned long boundary = (unsigned long)end_of_stack(current);
>>>
>>> ... at the start of the function, and not have to worry about this explicitly.
>>
>> I should mention that erase_kstack() can be called from x86 trampoline stack.
>> That's why the boundary is calculated from the lowest_stack.
> 
> Ok. Under what circumstances does that happen?
> 
> It seems a little scary that current::thread::lowest_stack might not be
> on current's task stack. 

Yes, indeed. That's why I check against that; please see the BUG_ON() in
erase_kstack() for x86.

1. Calculate the boundary from the lowest_stack.
2. Search for poison between lowest_stack and boundary.
3. Now ready to write the poison.
4. Reset the boundary to current_stack_pointer if we are on the thread stack and
to current_top_of_stack otherwise (we are on the trampoline stack).
5. BUG_ON(boundary - p >= THREAD_SIZE);
6. Write poison till the boundary.
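
Roughly, steps 4-6 in code look like this (simplified, not the exact x86
source; on_thread_stack() and current_top_of_stack() are the x86 helpers):

	if (on_thread_stack())
		boundary = current_stack_pointer;
	else
		boundary = current_top_of_stack();

	BUG_ON(boundary - p >= THREAD_SIZE);

	while (p < boundary) {
		*(unsigned long *)p = STACKLEAK_POISON;
		p += sizeof(unsigned long);
	}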

> Is that reset when transitioning to/from the
> trampoline stack?

We switch from the thread stack to the trampoline stack just before returning
to userspace. Please search for "trampoline stack" in
arch/x86/entry/entry_64.S.

> [...]
> 
>>>> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>>> +void __used check_alloca(unsigned long size)
>>>> +{
>>>> +	unsigned long sp, stack_left;
>>>> +
>>>> +	sp = current_stack_pointer;
>>>> +
>>>> +	stack_left = sp & (THREAD_SIZE - 1);
>>>> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
>>>> +}
>>>
>>> Is this arbitrary, or is there something special about 256?
>>>
>>> Even if this is arbitrary, can we give it some mnemonic?
>>
>> It's just a reasonable number. We can introduce a macro for it.
> 
> I'm just not sure I see the point in the offset, given things like
> VMAP_STACK exist. BUG_ON() handling will likely require *more* than 256
> bytes of stack, so it seems superfluous, as we'd be relying on stack
> overflow detection at that point.
> 
> I can see that we should take the CONFIG_SCHED_STACK_END_CHECK offset
> into account, though.

Mark, thank you for such an important remark!

In Kconfig STACKLEAK implies but doesn't depend on VMAP_STACK. In fact x86_32
doesn't have VMAP_STACK at all but can have STACKLEAK.

[Adding Andy Lutomirski]

I've made some additional experiments: I exhaust the thread stack to have only
(MIN_STACK_LEFT - 1) bytes left and then force alloca. If VMAP_STACK is
disabled, BUG_ON() handling causes stack depth overflow, which is detected by
SCHED_STACK_END_CHECK. If VMAP_STACK is enabled, the kernel hangs on BUG_ON()
handling! Enabling CONFIG_PROVE_LOCKING gives the needed report from VMAP_STACK:

[   43.543962] lkdtm: try a large alloca of 14647 bytes (sp 18446683600580263344)...
[   43.545188] BUG: stack guard page was hit at 00000000830608b8 (stack is 000000009375e943..00000000cb7f52d9)
[   43.545189] kernel stack overflow (double-fault): 0000 [#1] SMP PTI
[   43.545189] Dumping ftrace buffer:
[   43.545190]    (ftrace buffer empty)
[   43.545190] Modules linked in: lkdtm
[   43.545192] CPU: 0 PID: 2682 Comm: sh Not tainted 4.17.0-rc3+ #23
[   43.545192] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[   43.545193] RIP: 0010:mark_lock+0xe/0x540
[   43.545193] RSP: 0018:ffffc900009c0000 EFLAGS: 00010002
[   43.545194] RAX: 000000000000000c RBX: ffff880079b3b590 RCX: 0000000000000008
[   43.545194] RDX: 0000000000000008 RSI: ffff880079b3b590 RDI: ffff880079b3ad40
[   43.545195] RBP: ffffc900009c0100 R08: 0000000000000002 R09: 0000000000000000
[   43.545195] R10: ffffc900009c0118 R11: 0000000000000000 R12: 0000000000000000
[   43.545196] R13: ffff880079b3ad40 R14: ffff880079b3ad40 R15: ffffffff810cb8d7
[   43.545196] FS:  00007f544c7d8700(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
[   43.545197] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   43.545200] CR2: ffffc900009bfff8 CR3: 0000000079194000 CR4: 00000000000006f0
[   43.545200] Call Trace:
[   43.545201]  ? vprintk_emit+0x67/0x440
[   43.545201]  __lock_acquire+0x2e0/0x13e0
[   43.545201]  ? lock_acquire+0x9d/0x1e0
[   43.545202]  lock_acquire+0x9d/0x1e0
[   43.545202]  ? vprintk_emit+0x67/0x440
[   43.545203]  _raw_spin_lock+0x20/0x30
[   43.545203]  ? vprintk_emit+0x67/0x440
[   43.545203]  vprintk_emit+0x67/0x440
[   43.545204]  ? check_alloca+0x9a/0xb0
[   43.545204]  printk+0x50/0x6f
[   43.545204]  ? __probe_kernel_read+0x34/0x60
[   43.545205]  ? check_alloca+0x9a/0xb0
[   43.545205]  report_bug+0xd3/0x110
[   43.545206]  fixup_bug.part.10+0x13/0x30
[   43.545206]  do_error_trap+0x158/0x190
[   43.545206]  ? trace_hardirqs_off_thunk+0x1a/0x1c
[   43.545207]  invalid_op+0x14/0x20
[   43.545207] RIP: 0010:check_alloca+0x9a/0xb0
[   43.545207] RSP: 0018:ffffc900009c0408 EFLAGS: 00010287
[   43.545208] RAX: 0000000000000008 RBX: 0000000000003936 RCX: 0000000000000001
[   43.545209] RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffffc900009c0408
[   43.545209] RBP: ffffc900009c3da0 R08: 0000000000000000 R09: 0000000000000000
[   43.545210] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000003936
[   43.545210] R13: 0000000001ff0610 R14: 0000000000000011 R15: ffffc900009c3f08
[   43.545210]  ? check_alloca+0x64/0xb0
[   43.545211]  do_alloca+0x55/0x71b [lkdtm]
[   43.545211]  ? noop_count+0x10/0x10
[   43.545211]  ? check_usage+0xb1/0x4d0
[   43.545212]  ? noop_count+0x10/0x10
[   43.545212]  ? check_usage+0xb1/0x4d0
[   43.545213]  ? serial8250_console_write+0x253/0x2b0
[   43.545213]  ? serial8250_console_write+0x253/0x2b0
[   43.545213]  ? __lock_acquire+0x2e0/0x13e0
[   43.545214]  ? up+0xd/0x50
[   43.545214]  ? console_unlock+0x374/0x660
[   43.545215]  ? __lock_acquire+0x2e0/0x13e0
[   43.545215]  ? retint_kernel+0x10/0x10
[   43.545215]  ? trace_hardirqs_on_caller+0xed/0x180
[   43.545216]  ? find_held_lock+0x2d/0x90
[   43.545216]  ? mark_held_locks+0x4e/0x80
[   43.545216]  ? console_unlock+0x471/0x660
[   43.545217]  ? trace_hardirqs_on_caller+0xed/0x180
[   43.545217]  ? vprintk_emit+0x235/0x440
[   43.545218]  ? get_stack_info+0x32/0x160
[   43.545218]  ? check_alloca+0x64/0xb0
[   43.545218]  ? do_alloca+0x1f/0x71b [lkdtm]
[   43.545219]  lkdtm_STACKLEAK_ALLOCA+0x8f/0xb0 [lkdtm]
[   43.545219]  direct_entry+0xc5/0x110 [lkdtm]
[   43.545220]  full_proxy_write+0x51/0x80
[   43.545220]  __vfs_write+0x49/0x180
[   43.545220]  ? rcu_read_lock_sched_held+0x53/0x60
[   43.545221]  ? rcu_sync_lockdep_assert+0x29/0x50
[   43.545221]  ? __sb_start_write+0x110/0x160
[   43.545221]  ? vfs_write+0x172/0x190
[   43.545222]  vfs_write+0xa8/0x190
[   43.545222]  ksys_write+0x50/0xc0
[   43.545223]  do_syscall_64+0x51/0x1a0
[   43.545223]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   43.545223] RIP: 0033:0x7f544c306370
[   43.545224] RSP: 002b:00007ffc223bacb8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[   43.545225] RAX: ffffffffffffffda RBX: 0000000001ff0610 RCX: 00007f544c306370
[   43.545225] RDX: 0000000000000011 RSI: 0000000001ff0610 RDI: 0000000000000001
[   43.545225] RBP: 0000000000000011 R08: 41434f4c4c415f4b R09: 00007f544c5bce90
[   43.545226] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[   43.545226] R13: 0000000000000011 R14: 7fffffffffffffff R15: 0000000000000000
[   43.545227] Code: 08 08 00 00 48 c7 c7 70 56 2d 82 5b 48 89 d1 e9 a4 48 01 00 66 0f 1f 84 00 00 00 00 00 41 57 41 56
89 d1 41 55 41 54 49 89 fd 55 <53> bb 01 00 00 00 d3 e3 48 89 f5 41 89 d4 48 83 ec 08 0f b7 46
[   43.545241] RIP: mark_lock+0xe/0x540 RSP: ffffc900009c0000
[   43.545241] ---[ end trace 63196de7418a092e ]---
[   43.545242] Kernel panic - not syncing: corrupted stack end detected inside scheduler
[   43.545242]


I can't say why VMAP_STACK report hangs during BUG_ON() handling on defconfig.
Andy, can you give a clue?

I see that MIN_STACK_LEFT = 2048 is enough for BUG_ON() handling on both x86_64
and x86_32. So I'm going to:
 - set MIN_STACK_LEFT to 2048;
 - improve the lkdtm test to cover this case.

Mark, Kees, Laura, does it sound good?

>>>> +EXPORT_SYMBOL(check_alloca);
>>>> +#endif
>>>> diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
>>>> index a34e9290a699..25dd2a14560d 100644
>>>> --- a/drivers/firmware/efi/libstub/Makefile
>>>> +++ b/drivers/firmware/efi/libstub/Makefile
>>>> @@ -20,7 +20,8 @@ cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
>>>>  KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>>>>  				   -D__NO_FORTIFY \
>>>>  				   $(call cc-option,-ffreestanding) \
>>>> -				   $(call cc-option,-fno-stack-protector)
>>>> +				   $(call cc-option,-fno-stack-protector) \
>>>> +				   $(DISABLE_STACKLEAK_PLUGIN)
>>>>  
>>>>  GCOV_PROFILE			:= n
>>>>  KASAN_SANITIZE			:= n
>>>
>>> I believe we'll also need to do this for the KVM hyp code in arch/arm64/kvm/hyp/.
>>
>> Could you please give more details on that? Why STACKLEAK breaks it?
> 
> In the hyp/EL2 exception level, we only map the hyp text, and not the
> rest of the kernel. So erase_kstack and check_alloca won't be mapped,
> and any attempt to branch to them will fault.

Here you mean track_stack() and not erase_kstack(), right?

> Even if it were mapped, things like BUG_ON(), get_current(), etc do not
> work at hyp.
> 
> Additionally, the hyp code is mapped as a different virtual address from
> the rest of the kernel, so if any of the STACKLEAK code happens to use
> an absolute address, this will not work correctly.

Thanks for the details. This rather old article [1] says:
  The code run in HYP mode is limited to a few hundred instructions and isolated
  to two assembly files: arch/arm/kvm/interrupts.S and arch/arm/kvm/interrupts_head.S.

Is all hyp code now localized in arch/arm64/kvm/hyp/?

[1]: https://lwn.net/Articles/557132/

Best regards,
Alexander
Alexander Popov May 11, 2018, 3:50 p.m. UTC | #15
Hello everyone,

On 06.05.2018 11:22, Alexander Popov wrote:
> On 04.05.2018 14:09, Mark Rutland wrote:
>>>>> +	stack_left = sp & (THREAD_SIZE - 1);
>>>>> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
>>>>
>>>> Is this arbitrary, or is there something special about 256?
>>>>
>>>> Even if this is arbitrary, can we give it some mnemonic?
>>>
>>> It's just a reasonable number. We can introduce a macro for it.
>>
>> I'm just not sure I see the point in the offset, given things like
>> VMAP_STACK exist. BUG_ON() handling will likely require *more* than 256
>> bytes of stack, so it seems superfluous, as we'd be relying on stack
>> overflow detection at that point.
>>
>> I can see that we should take the CONFIG_SCHED_STACK_END_CHECK offset
>> into account, though.
> 
> Mark, thank you for such an important remark!
> 
> In Kconfig STACKLEAK implies but doesn't depend on VMAP_STACK. In fact x86_32
> doesn't have VMAP_STACK at all but can have STACKLEAK.
> 
> [Adding Andy Lutomirski]
> 
> I've made some additional experiments: I exhaust the thread stack to have only
> (MIN_STACK_LEFT - 1) bytes left and then force alloca. If VMAP_STACK is
> disabled, BUG_ON() handling causes stack depth overflow, which is detected by
> SCHED_STACK_END_CHECK. If VMAP_STACK is enabled, the kernel hangs on BUG_ON()
> handling! Enabling CONFIG_PROVE_LOCKING gives the needed report from VMAP_STACK:

[...]

> I can't say why VMAP_STACK report hangs during BUG_ON() handling on defconfig.
> Andy, can you give a clue?
> 
> I see that MIN_STACK_LEFT = 2048 is enough for BUG_ON() handling on both x86_64
> and x86_32. So I'm going to:
>  - set MIN_STACK_LEFT to 2048;
>  - improve the lkdtm test to cover this case.
> 
> Mark, Kees, Laura, does it sound good?


Could you have a look at the following changes in check_alloca() before I send
the next version?

If VMAP_STACK is enabled and alloca causes stack depth overflow, I write to
guard page below the thread stack to cause double fault and VMAP_STACK report.

If VMAP_STACK is disabled, I use MIN_STACK_LEFT = 2048, which seems to be enough
for BUG_ON() handling both on x86_32 and x86_64. Unfortunately, I can't
guarantee that it is always enough.


 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
-#define MIN_STACK_LEFT 256
+#define MIN_STACK_LEFT 2048

 void __used check_alloca(unsigned long size)
 {
        unsigned long sp = (unsigned long)&sp;
        struct stack_info stack_info = {0};
        unsigned long visit_mask = 0;
        unsigned long stack_left;

        BUG_ON(get_stack_info(&sp, current, &stack_info, &visit_mask));

        stack_left = sp - (unsigned long)stack_info.begin;
+
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * If alloca oversteps the thread stack boundary, we touch the guard
+        * page provided by VMAP_STACK to trigger handle_stack_overflow().
+        */
+       if (size >= stack_left)
+               *(stack_info.begin - 1) = 42;
+#else
        BUG_ON(stack_left < MIN_STACK_LEFT ||
                                size >= stack_left - MIN_STACK_LEFT);
+#endif
 }
 EXPORT_SYMBOL(check_alloca);
 #endif


Looking forward to your feedback.

Best regards,
Alexander
Mark Rutland May 11, 2018, 4:13 p.m. UTC | #16
On Fri, May 11, 2018 at 06:50:09PM +0300, Alexander Popov wrote:
> Hello everyone,
> 
> On 06.05.2018 11:22, Alexander Popov wrote:
> > On 04.05.2018 14:09, Mark Rutland wrote:
> >>>>> +	stack_left = sp & (THREAD_SIZE - 1);
> >>>>> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
> >>>>
> >>>> Is this arbitrary, or is there something special about 256?
> >>>>
> >>>> Even if this is arbitrary, can we give it some mnemonic?
> >>>
> >>> It's just a reasonable number. We can introduce a macro for it.
> >>
> >> I'm just not sure I see the point in the offset, given things like
> >> VMAP_STACK exist. BUG_ON() handling will likely require *more* than 256
> >> bytes of stack, so it seems superfluous, as we'd be relying on stack
> >> overflow detection at that point.
> >>
> >> I can see that we should take the CONFIG_SCHED_STACK_END_CHECK offset
> >> into account, though.
> > 
> > Mark, thank you for such an important remark!
> > 
> > In Kconfig STACKLEAK implies but doesn't depend on VMAP_STACK. In fact x86_32
> > doesn't have VMAP_STACK at all but can have STACKLEAK.
> > 
> > [Adding Andy Lutomirski]
> > 
> > I've made some additional experiments: I exhaust the thread stack to have only
> > (MIN_STACK_LEFT - 1) bytes left and then force alloca. If VMAP_STACK is
> > disabled, BUG_ON() handling causes stack depth overflow, which is detected by
> > SCHED_STACK_END_CHECK. If VMAP_STACK is enabled, the kernel hangs on BUG_ON()
> > handling! Enabling CONFIG_PROVE_LOCKING gives the needed report from VMAP_STACK:

I can't see why CONFIG_VMAP_STACK would only work in conjunction with
CONFIG_PROVE_LOCKING.

On arm64 at least, if we overflow the stack while handling a BUG(), we
*should* trigger the overflow handler as usual, and that should work,
unless I'm missing something.

Maybe it gets part-way into panic(), sets up some state,
stack-overflows, and we get wedged because we're already in a panic?
Perhaps CONFIG_PROVE_LOCKING causes more stack to be used, so it dies a
little earlier in panic(), before setting up some state that causes
wedging.

... which sounds like something best fixed in those code paths, and not
here.

> [...]
> 
> > I can't say why VMAP_STACK report hangs during BUG_ON() handling on defconfig.
> > Andy, can you give a clue?
> > 
> > I see that MIN_STACK_LEFT = 2048 is enough for BUG_ON() handling on both x86_64
> > and x86_32. So I'm going to:
> >  - set MIN_STACK_LEFT to 2048;
> >  - improve the lkdtm test to cover this case.
> > 
> > Mark, Kees, Laura, does it sound good?
> 
> 
> Could you have a look at the following changes in check_alloca() before I send
> the next version?
> 
> If VMAP_STACK is enabled and alloca causes stack depth overflow, I write to
> guard page below the thread stack to cause double fault and VMAP_STACK report.

On arm64 at least, writing to the guard page will not itself trigger a
stack overflow, but will trigger a data abort. I suspect similar is true
on x86, if the stack pointer is sufficiently far above the guard page.

> If VMAP_STACK is disabled, I use MIN_STACK_LEFT = 2048, which seems to be enough
> for BUG_ON() handling both on x86_32 and x86_64. Unfortunately, I can't
> guarantee that it is always enough.

I don't think that we can choose a value that's guaranteed to be
sufficient for BUG() handling while also not wasting a tonne of space
under normal operation.

Let's figure out what's going wrong on x86 in the case that you mention,
and try to solve that.

Here I don't think we should reserve space at all -- it's completely
arbitrary, and as above we can't guarantee that it's sufficient anyway.

>  #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> -#define MIN_STACK_LEFT 256
> +#define MIN_STACK_LEFT 2048
> 
>  void __used check_alloca(unsigned long size)
>  {
>         unsigned long sp = (unsigned long)&sp;
>         struct stack_info stack_info = {0};
>         unsigned long visit_mask = 0;
>         unsigned long stack_left;
> 
>         BUG_ON(get_stack_info(&sp, current, &stack_info, &visit_mask));
> 
>         stack_left = sp - (unsigned long)stack_info.begin;
> +
> +#ifdef CONFIG_VMAP_STACK
> +       /*
> +        * If alloca oversteps the thread stack boundary, we touch the guard
> +        * page provided by VMAP_STACK to trigger handle_stack_overflow().
> +        */
> +       if (size >= stack_left)
> +               *(stack_info.begin - 1) = 42;
> +#else

On arm64, this won't trigger our stack overflow handler, unless the SP
is already very close to the boundary.

Please just use BUG(). If there is an issue on x86, it would be good to
solve that in the x86 code.

>         BUG_ON(stack_left < MIN_STACK_LEFT ||
>                                 size >= stack_left - MIN_STACK_LEFT);

I really don't think we should bother with this arbitrary offset at all.
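
i.e. I'd expect the check to reduce to something as simple as this
(untested sketch based on your snippet above, with the reserve dropped):

void __used check_alloca(unsigned long size)
{
	unsigned long sp = (unsigned long)&sp;
	struct stack_info stack_info = {0};
	unsigned long visit_mask = 0;
	unsigned long stack_left;

	BUG_ON(get_stack_info(&sp, current, &stack_info, &visit_mask));

	stack_left = sp - (unsigned long)stack_info.begin;

	/* No arbitrary reserve; rely on the usual overflow detection */
	BUG_ON(size >= stack_left);
}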

Thanks,
Mark.
Alexander Popov May 13, 2018, 8:40 a.m. UTC | #17
Hello Mark,

Thanks a lot for your reply!

On 11.05.2018 19:13, Mark Rutland wrote:
> On Fri, May 11, 2018 at 06:50:09PM +0300, Alexander Popov wrote:
>> On 06.05.2018 11:22, Alexander Popov wrote:
>>> On 04.05.2018 14:09, Mark Rutland wrote:
>>>>>>> +	stack_left = sp & (THREAD_SIZE - 1);
>>>>>>> +	BUG_ON(stack_left < 256 || size >= stack_left - 256);
>>>>>>
>>>>>> Is this arbitrary, or is there something special about 256?
>>>>>>
>>>>>> Even if this is arbitrary, can we give it some mnemonic?
>>>>>
>>>>> It's just a reasonable number. We can introduce a macro for it.
>>>>
>>>> I'm just not sure I see the point in the offset, given things like
>>>> VMAP_STACK exist. BUG_ON() handling will likely require *more* than 256
>>>> bytes of stack, so it seems superfluous, as we'd be relying on stack
>>>> overflow detection at that point.
>>>>
>>>> I can see that we should take the CONFIG_SCHED_STACK_END_CHECK offset
>>>> into account, though.
>>>
>>> Mark, thank you for such an important remark!
>>>
>>> In Kconfig STACKLEAK implies but doesn't depend on VMAP_STACK. In fact x86_32
>>> doesn't have VMAP_STACK at all but can have STACKLEAK.
>>>
>>> [Adding Andy Lutomirski]
>>>
>>> I've made some additional experiments: I exhaust the thread stack to have only
>>> (MIN_STACK_LEFT - 1) bytes left and then force alloca. If VMAP_STACK is
>>> disabled, BUG_ON() handling causes stack depth overflow, which is detected by
>>> SCHED_STACK_END_CHECK. If VMAP_STACK is enabled, the kernel hangs on BUG_ON()
>>> handling! Enabling CONFIG_PROVE_LOCKING gives the needed report from VMAP_STACK:
> 
> I can't see why CONFIG_VMAP_STACK would only work in conjunction with
> CONFIG_PROVE_LOCKING.
> 
> On arm64 at least, if we overflow the stack while handling a BUG(), we
> *should* trigger the overflow handler as usual, and that should work,
> unless I'm missing something.
> 
> Maybe it gets part-way into panic(), sets up some state,
> stack-overflows, and we get wedged because we're already in a panic?
> Perhaps CONFIG_PROVE_LOCKING causes more stack to be used, so it dies a
> little earlier in panic(), before setting up some state that causes
> wedging.

That seems likely. I later noticed that I had the oops=panic kernel parameter set.

> ... which sounds like something best fixed in those code paths, and not
> here.
> 
>> [...]
>>
>>> I can't say why VMAP_STACK report hangs during BUG_ON() handling on defconfig.
>>> Andy, can you give a clue?
>>>
>>> I see that MIN_STACK_LEFT = 2048 is enough for BUG_ON() handling on both x86_64
>>> and x86_32. So I'm going to:
>>>  - set MIN_STACK_LEFT to 2048;
>>>  - improve the lkdtm test to cover this case.
>>>
>>> Mark, Kees, Laura, does it sound good?
>>
>>
>> Could you have a look at the following changes in check_alloca() before I send
>> the next version?
>>
>> If VMAP_STACK is enabled and alloca causes stack depth overflow, I write to
>> guard page below the thread stack to cause double fault and VMAP_STACK report.
> 
> On arm64 at least, writing to the guard page will not itself trigger a
> stack overflow, but will trigger a data abort. I suspect similar is true
> on x86, if the stack pointer is sufficiently far above the guard page.

Yes, you are right, my mistake.

The comment about CONFIG_VMAP_STACK in arch/x86/kernel/traps.c says:
"If we overflow the stack into a guard page, the CPU will fail to deliver #PF
and will send #DF instead."

>> If VMAP_STACK is disabled, I use MIN_STACK_LEFT = 2048, which seems to be enough
>> for BUG_ON() handling both on x86_32 and x86_64. Unfortunately, I can't
>> guarantee that it is always enough.
> 
> I don't think that we can choose something that's guaranteed to be
> sufficient for BUG() handling and also not wasting a tonne of space
> under normal operation.
> 
> Let's figure out what's going wrong on x86 in the case that you mention,
> and try to solve that.
> 
> Here I don't think we should reserve space at all -- it's completely
> arbitrary, and as above we can't guarantee that it's sufficient anyway.
> 
>>  #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>> -#define MIN_STACK_LEFT 256
>> +#define MIN_STACK_LEFT 2048
>>
>>  void __used check_alloca(unsigned long size)
>>  {
>>         unsigned long sp = (unsigned long)&sp;
>>         struct stack_info stack_info = {0};
>>         unsigned long visit_mask = 0;
>>         unsigned long stack_left;
>>
>>         BUG_ON(get_stack_info(&sp, current, &stack_info, &visit_mask));
>>
>>         stack_left = sp - (unsigned long)stack_info.begin;
>> +
>> +#ifdef CONFIG_VMAP_STACK
>> +       /*
>> +        * If alloca oversteps the thread stack boundary, we touch the guard
>> +        * page provided by VMAP_STACK to trigger handle_stack_overflow().
>> +        */
>> +       if (size >= stack_left)
>> +               *(stack_info.begin - 1) = 42;
>> +#else
> 
> On arm64, this won't trigger our stack overflow handler, unless the SP
> is already very close to the boundary.
> 
> Please just use BUG(). If there is an issue on x86, it would be good to
> solve that in the x86 code.
> 
>>         BUG_ON(stack_left < MIN_STACK_LEFT ||
>>                                 size >= stack_left - MIN_STACK_LEFT);
> 
> I really don't think we should bother with this arbitrary offset at all.

Thanks. I agree with all your points.

I wrote a third lkdtm test for STACKLEAK which runs deep recursion with alloca.
If I have just BUG_ON(size >= stack_left) in check_alloca(), I get the following
nice report without any trouble:

[    8.407261] lkdtm: Performing direct entry STACKLEAK_RECURSION_WITH_ALLOCA
[    8.408641] lkdtm: checking unused part of the thread stack (15744 bytes)...
[    8.409936] lkdtm: first 744 bytes are unpoisoned
[    8.410751] lkdtm: the rest of the thread stack is properly erased
[    8.411760] lkdtm: try to overflow the thread stack using recursion & alloca
[    8.412914] BUG: stack guard page was hit at 00000000b993c2bc (stack is 00000000764adcd4..000000005b443f11)
[    8.414471] kernel stack overflow (double-fault): 0000 [#1] SMP PTI
[    8.415409] Dumping ftrace buffer:
[    8.415907]    (ftrace buffer empty)
[    8.416404] Modules linked in: lkdtm
[    8.416905] CPU: 0 PID: 2664 Comm: sh Not tainted 4.17.0-rc3+ #39
[    8.417766] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[    8.419088] RIP: 0010:do_error_trap+0x31/0x130
[    8.419647] RSP: 0018:ffffc900009b3fc0 EFLAGS: 00010046
[    8.420263] RAX: 0000000000000000 RBX: ffffc900009b4078 RCX: 0000000000000006
[    8.421322] RDX: ffffffff81fdbe4d RSI: 0000000000000000 RDI: ffffc900009b4078
[    8.422837] RBP: 0000000000000006 R08: 0000000000000004 R09: 0000000000000000
[    8.425095] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000004
[    8.427365] R13: ffffffff81fdbe4d R14: 0000000000000000 R15: 0000000000000000
[    8.430111] FS:  00007f7c340c1700(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
[    8.432515] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    8.433132] CR2: ffffc900009b3fb8 CR3: 000000007b330000 CR4: 00000000000006f0
[    8.433904] Call Trace:
[    8.434180]  invalid_op+0x14/0x20
[    8.434546] RIP: 0010:check_alloca+0x8e/0xa0
[    8.434995] RSP: 0018:ffffc900009b4128 EFLAGS: 00010283
[    8.435555] RAX: 0000000000000128 RBX: 0000000000000190 RCX: 0000000000000001
[    8.436479] RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffffc900009b4128
[    8.437871] RBP: ffffc900009b4180 R08: 000000000000018f R09: 0000000000000007
[    8.438661] R10: 0000000000000000 R11: 0000000000000030 R12: ffff88007a626000
[    8.439433] R13: 0000000001cf5610 R14: 0000000000000020 R15: ffffc900009b7f08
[    8.440329]  ? check_alloca+0x64/0xa0
[    8.440845]  do_alloca+0x20/0x60 [lkdtm]
[    8.441937]  recursion+0xa0/0xd0 [lkdtm]
[    8.443370]  ? vsnprintf+0xf2/0x4b0
[    8.444289]  ? get_stack_info+0x32/0x160
[    8.445359]  ? check_alloca+0x64/0xa0
[    8.445995]  ? do_alloca+0x20/0x60 [lkdtm]
[    8.446449]  recursion+0xbb/0xd0 [lkdtm]
[    8.446881]  ? vsnprintf+0xf2/0x4b0
[    8.447259]  ? get_stack_info+0x32/0x160
[    8.447693]  ? check_alloca+0x64/0xa0
[    8.448088]  ? do_alloca+0x20/0x60 [lkdtm]
[    8.448539]  recursion+0xbb/0xd0 [lkdtm]
...
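
For reference, the test itself is not included in this message; a minimal
sketch of what such a recursion-with-alloca case might look like (hypothetical
names and sizes, not the actual lkdtm code) is:

#include <linux/kernel.h>

/*
 * Sketch only: each call allocates a variable-length array (i.e. alloca),
 * which the STACKLEAK plugin instruments with a call to check_alloca().
 */
static noinline void recurse_with_alloca(int depth, unsigned long size)
{
	volatile char buf[size];

	buf[0] = 42;
	if (depth > 0)
		recurse_with_alloca(depth - 1, size);
	/* read buf back so the VLA cannot be optimised away */
	(void)buf[0];
}

static void stackleak_recursion_test(void)
{
	/* ~512 KB of cumulative stack demand, far beyond THREAD_SIZE */
	recurse_with_alloca(512, 1024);
}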

It seems that previously I was very "lucky" to accidentally have those MIN_STACK_LEFT,
call trace depth and oops=panic together to experience a hang on stack overflow
during BUG().


When I run my test in a loop _without_ VMAP_STACK, I manage to corrupt the neighbour
processes with BUG() handling overstepping the stack boundary. It's a pity, but
I have an idea.

In kernel/sched/core.c we already have:

#ifdef CONFIG_SCHED_STACK_END_CHECK
  	if (task_stack_end_corrupted(prev))
		panic("corrupted stack end detected inside scheduler\n");
#endif

So what would you think if I do the following in check_alloca():

	if (size >= stack_left) {
#if !defined(CONFIG_VMAP_STACK) && defined(CONFIG_SCHED_STACK_END_CHECK)
		panic("alloca over the kernel stack boundary\n");
#else
		BUG();
#endif

I think that fits well with the CONFIG_SCHED_STACK_END_CHECK policy.
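
Pieced together from the snippets in this thread, the whole x86 function would
then look roughly like this (just a sketch, not the final patch; the arbitrary
MIN_STACK_LEFT offset is dropped as Mark suggested):

void __used check_alloca(unsigned long size)
{
	unsigned long sp = (unsigned long)&sp;
	struct stack_info stack_info = {0};
	unsigned long visit_mask = 0;
	unsigned long stack_left;

	BUG_ON(get_stack_info(&sp, current, &stack_info, &visit_mask));
	stack_left = sp - (unsigned long)stack_info.begin;

	if (size >= stack_left) {
#if !defined(CONFIG_VMAP_STACK) && defined(CONFIG_SCHED_STACK_END_CHECK)
		/* No guard page to catch us, but the user accepted panic() */
		panic("alloca over the kernel stack boundary\n");
#else
		BUG();
#endif
	}
}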

Best regards,
Alexander
Mark Rutland May 14, 2018, 5:15 a.m. UTC | #18
On Sun, May 13, 2018 at 11:40:07AM +0300, Alexander Popov wrote:
> It seems that previously I was very "lucky" to accidentally have those MIN_STACK_LEFT,
> call trace depth and oops=panic together to experience a hang on stack overflow
> during BUG().
> 
> 
> When I run my test in a loop _without_ VMAP_STACK, I manage to corrupt the neighbour
> processes with BUG() handling overstepping the stack boundary. It's a pity, but
> I have an idea.

I think that in the absence of VMAP_STACK, there will always be cases where we
*could* corrupt a neighbouring stack, but I agree that trying to minimize that
possibility would be good.

> In kernel/sched/core.c we already have:
> 
> #ifdef CONFIG_SCHED_STACK_END_CHECK
>   	if (task_stack_end_corrupted(prev))
> 		panic("corrupted stack end detected inside scheduler\n");
> #endif
> 
> So what would you think if I do the following in check_alloca():
> 
> 	if (size >= stack_left) {
> #if !defined(CONFIG_VMAP_STACK) && defined(CONFIG_SCHED_STACK_END_CHECK)
> 		panic("alloca over the kernel stack boundary\n");
> #else
> 		BUG();
> #endif

Given this is already out-of-line, how about we always use panic(), regardless
of VMAP_STACK and SCHED_STACK_END_CHECK? i.e. just

	if (unlikely(size >= stack_left))
		panic("alloca over the kernel stack boundary");

If we have VMAP_STACK selected, and overflow during the panic, it's the same as
if we overflowed during the BUG(). It's likely that panic() will use less stack
space than BUG(), and the compiler can put the call in a slow path that
shouldn't affect most calls, so in all cases it's likely preferable.

Thanks,
Mark.
Alexander Popov May 14, 2018, 9:35 a.m. UTC | #19
On 14.05.2018 08:15, Mark Rutland wrote:
> On Sun, May 13, 2018 at 11:40:07AM +0300, Alexander Popov wrote:
>> It seems that previously I was very "lucky" to accidentally have those MIN_STACK_LEFT,
>> call trace depth and oops=panic together to experience a hang on stack overflow
>> during BUG().
>>
>>
>> When I run my test in a loop _without_ VMAP_STACK, I manage to corrupt the neighbour
>> processes with BUG() handling overstepping the stack boundary. It's a pity, but
>> I have an idea.
> 
> I think that in the absence of VMAP_STACK, there will always be cases where we
> *could* corrupt a neighbouring stack, but I agree that trying to minimize that
> possibility would be good.

Ok!

>> In kernel/sched/core.c we already have:
>>
>> #ifdef CONFIG_SCHED_STACK_END_CHECK
>>   	if (task_stack_end_corrupted(prev))
>> 		panic("corrupted stack end detected inside scheduler\n");
>> #endif
>>
>> So what would you think if I do the following in check_alloca():
>>
>> 	if (size >= stack_left) {
>> #if !defined(CONFIG_VMAP_STACK) && defined(CONFIG_SCHED_STACK_END_CHECK)
>> 		panic("alloca over the kernel stack boundary\n");
>> #else
>> 		BUG();
>> #endif
> 
> Given this is already out-of-line, how about we always use panic(), regardless
> of VMAP_STACK and SCHED_STACK_END_CHECK? i.e. just
> 
> 	if (unlikely(size >= stack_left))
> 		panic("alloca over the kernel stack boundary");
> 
> If we have VMAP_STACK selected, and overflow during the panic, it's the same as
> if we overflowed during the BUG(). It's likely that panic() will use less stack
> space than BUG(), and the compiler can put the call in a slow path that
> shouldn't affect most calls, so in all cases it's likely preferable.

I'm sure that maintainers and Linus will strongly dislike my patch if I always
use panic() here. panic() kills the whole kernel and we shouldn't use it when we
can safely continue to work.

Let me describe my logic. So let's have size >= stack_left on a thread stack.

1. If CONFIG_VMAP_STACK is enabled, we can safely use BUG(). Even if BUG()
handling overflows the thread stack into the guard page, handle_stack_overflow()
is called and the neighbour memory is not corrupted. The kernel can proceed to live.

2. If CONFIG_VMAP_STACK is disabled, BUG() handling can corrupt the neighbour
kernel memory and cause the undefined behaviour of the whole kernel. I see it on
my lkdtm test. That is a cogent reason for panic().

2.a. If CONFIG_SCHED_STACK_END_CHECK is enabled, the kernel already does panic()
when STACK_END_MAGIC is corrupted. So we will _not_ break the safety policy if
we do panic() in a similar situation in check_alloca().

2.b. If CONFIG_SCHED_STACK_END_CHECK is disabled, the user has some real reasons
not to do panic() when the kernel stack is corrupted. So we should not do it in
check_alloca() as well, just use BUG() and hope for the best.

That logic can be expressed this way:

	if (size >= stack_left) {
#if !defined(CONFIG_VMAP_STACK) && defined(CONFIG_SCHED_STACK_END_CHECK)
		panic("alloca over the kernel stack boundary\n");
#else
		BUG();
#endif

I think I should add a proper comment to describe it.
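
For example, something along these lines (wording is only a sketch):

	if (size >= stack_left) {
		/*
		 * Stack depth overflow detected.
		 *
		 * With CONFIG_VMAP_STACK, BUG() is safe: if its handling
		 * overflows the thread stack into the guard page, the stack
		 * overflow handler runs and neighbouring memory stays intact.
		 *
		 * Without CONFIG_VMAP_STACK, BUG() handling may overstep the
		 * stack boundary and corrupt neighbouring memory, so follow
		 * the CONFIG_SCHED_STACK_END_CHECK policy and panic() if that
		 * option is enabled; otherwise fall back to BUG() and hope
		 * for the best.
		 */
#if !defined(CONFIG_VMAP_STACK) && defined(CONFIG_SCHED_STACK_END_CHECK)
		panic("alloca over the kernel stack boundary\n");
#else
		BUG();
#endif
	}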

Thank you.

Best regards,
Alexander
Mark Rutland May 14, 2018, 10:06 a.m. UTC | #20
On Mon, May 14, 2018 at 12:35:25PM +0300, Alexander Popov wrote:
> On 14.05.2018 08:15, Mark Rutland wrote:
> > On Sun, May 13, 2018 at 11:40:07AM +0300, Alexander Popov wrote:
> >> So what would you think if I do the following in check_alloca():
> >>
> >> 	if (size >= stack_left) {
> >> #if !defined(CONFIG_VMAP_STACK) && defined(CONFIG_SCHED_STACK_END_CHECK)
> >> 		panic("alloca over the kernel stack boundary\n");
> >> #else
> >> 		BUG();
> >> #endif
> > 
> > Given this is already out-of-line, how about we always use panic(), regardless
> > of VMAP_STACK and SCHED_STACK_END_CHECK? i.e. just
> > 
> > 	if (unlikely(size >= stack_left))
> > 		panic("alloca over the kernel stack boundary");
> > 
> > If we have VMAP_STACK selected, and overflow during the panic, it's the same as
> > if we overflowed during the BUG(). It's likely that panic() will use less stack
> > space than BUG(), and the compiler can put the call in a slow path that
> > shouldn't affect most calls, so in all cases it's likely preferable.
> 
> I'm sure that maintainers and Linus will strongly dislike my patch if I always
> use panic() here. panic() kills the whole kernel and we shouldn't use it when we
> can safely continue to work.
> 
> Let me describe my logic. So let's have size >= stack_left on a thread stack.
> 
> 1. If CONFIG_VMAP_STACK is enabled, we can safely use BUG(). Even if BUG()
> handling overflows the thread stack into the guard page, handle_stack_overflow()
> is called and the neighbour memory is not corrupted. The kernel can proceed to live.

On arm64 with CONFIG_VMAP_STACK, a stack overflow will result in a
panic(). My understanding was that the same is true on x86.

> 2. If CONFIG_VMAP_STACK is disabled, BUG() handling can corrupt the neighbour
> kernel memory and cause the undefined behaviour of the whole kernel. I see it on
> my lkdtm test. That is a cogent reason for panic().

In this case, panic() can also corrupt the neighbour stack, and could
also fail.

When CONFIG_VMAP_STACK is not selected, a stack overflow simply cannot
be handled reliably -- while panic() may be more likely to succeed, it
is not guaranteed to.

> 2.a. If CONFIG_SCHED_STACK_END_CHECK is enabled, the kernel already does panic()
> when STACK_END_MAGIC is corrupted. So we will _not_ break the safety policy if
> we do panic() in a similar situation in check_alloca().

Sure, I'm certainly happy with panic() here.

> 2.b. If CONFIG_SCHED_STACK_END_CHECK is disabled, the user has some real reasons
> not to do panic() when the kernel stack is corrupted. 

I believe that CONFIG_SCHED_STACK_END_CHECK is seen as a debug feature,
and hence people don't select it. I strongly doubt that people have
reasons to disable it other than not wanting the overhead associated
with debug features.

I think it is reasonable to panic() here even with CONFIG_VMAP_STACK
selected.

> So we should not do it in check_alloca() as well, just use BUG() and
> hope for the best.

Regardless of whether we BUG() or panic(), we're hoping for the best.

Consistently using panic() here will keep things simpler, so any failure
reported will be easier to reason about, and easier to debug.

Thanks,
Mark.
Alexander Popov May 14, 2018, 1:53 p.m. UTC | #21
On 14.05.2018 13:06, Mark Rutland wrote:
> On Mon, May 14, 2018 at 12:35:25PM +0300, Alexander Popov wrote:
>> On 14.05.2018 08:15, Mark Rutland wrote:
>>> On Sun, May 13, 2018 at 11:40:07AM +0300, Alexander Popov wrote:
>>>> So what would you think if I do the following in check_alloca():
>>>>
>>>> 	if (size >= stack_left) {
>>>> #if !defined(CONFIG_VMAP_STACK) && defined(CONFIG_SCHED_STACK_END_CHECK)
>>>> 		panic("alloca over the kernel stack boundary\n");
>>>> #else
>>>> 		BUG();
>>>> #endif
>>>
>>> Given this is already out-of-line, how about we always use panic(), regardless
>>> of VMAP_STACK and SCHED_STACK_END_CHECK? i.e. just
>>>
>>> 	if (unlikely(size >= stack_left))
>>> 		panic("alloca over the kernel stack boundary");
>>>
>>> If we have VMAP_STACK selected, and overflow during the panic, it's the same as
>>> if we overflowed during the BUG(). It's likely that panic() will use less stack
>>> space than BUG(), and the compiler can put the call in a slow path that
>>> shouldn't affect most calls, so in all cases it's likely preferable.
>>
>> I'm sure that maintainers and Linus will strongly dislike my patch if I always
>> use panic() here. panic() kills the whole kernel and we shouldn't use it when we
>> can safely continue to work.
>>
>> Let me describe my logic. So let's have size >= stack_left on a thread stack.
>>
>> 1. If CONFIG_VMAP_STACK is enabled, we can safely use BUG(). Even if BUG()
>> handling overflows the thread stack into the guard page, handle_stack_overflow()
>> is called and the neighbour memory is not corrupted. The kernel can proceed to live.
> 
> On arm64 with CONFIG_VMAP_STACK, a stack overflow will result in a
> panic(). My understanding was that the same is true on x86.

No, x86 CONFIG_VMAP_STACK only kills the offending process. I see it in my deep
recursion test: the kernel continues to run. handle_stack_overflow() in
arch/x86/kernel/traps.c calls die().

>> 2. If CONFIG_VMAP_STACK is disabled, BUG() handling can corrupt the neighbour
>> kernel memory and cause the undefined behaviour of the whole kernel. I see it on
>> my lkdtm test. That is a cogent reason for panic().
> 
> In this case, panic() can also corrupt the neighbour stack, and could
> also fail.
> 
> When CONFIG_VMAP_STACK is not selected, a stack overflow simply cannot
> be handled reliably -- while panic() may be more likely to succeed, it
> is not guaranteed to.
>
>> 2.a. If CONFIG_SCHED_STACK_END_CHECK is enabled, the kernel already does panic()
>> when STACK_END_MAGIC is corrupted. So we will _not_ break the safety policy if
>> we do panic() in a similar situation in check_alloca().
> 
> Sure, I'm certainly happy with panic() here.

Ok!

>> 2.b. If CONFIG_SCHED_STACK_END_CHECK is disabled, the user has some real reasons
>> not to do panic() when the kernel stack is corrupted. 
> 
> I believe that CONFIG_SCHED_STACK_END_CHECK is seen as a debug feature,
> and hence people don't select it. 

I see CONFIG_SCHED_STACK_END_CHECK enabled by default in Ubuntu config...

> I strongly doubt that people have
> reasons to disable it other than not wanting the overhead associated
> with debug features.

I think it's not a question of performance here. There are cases when a system
must live as long as possible (even partially corrupted) and must not die
entirely. Oops is ok for those systems, but panic (full DoS) is not.

> I think it is reasonable to panic() here even with CONFIG_VMAP_STACK
> selected.

It's too tough for CONFIG_VMAP_STACK on x86 - the system can proceed to live.
Anyway, the check_alloca() code will not be shared between x86 and arm64, I've
described the reasons in this thread. So I can have BUG() for CONFIG_VMAP_STACK
on x86 and Laura can consistently use panic() on arm64.
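
If it ends up that way, the arm64 side could stay as simple as the
check_alloca() in this patch; a sketch with the arbitrary 256-byte offset
dropped and panic() used, as discussed:

void __used check_alloca(unsigned long size)
{
	unsigned long stack_left = current_stack_pointer & (THREAD_SIZE - 1);

	if (unlikely(size >= stack_left))
		panic("alloca over the kernel stack boundary\n");
}
EXPORT_SYMBOL(check_alloca);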

>> So we should not do it in check_alloca() as well, just use BUG() and
>> hope for the best.
> 
> Regardless of whether we BUG() or panic(), we're hoping for the best.
> 
> Consistently using panic() here will keep things simpler, so any failure
> reported will be easier to reason about, and easier to debug.

Let me keep BUG() for !CONFIG_SCHED_STACK_END_CHECK. I'm wary of using panic()
by default; let the distro/user decide this. I remember very well how I was
shouted at when this one was merged:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ce6fa91b93630396ca220c33dd38ffc62686d499


Mark, I'm really grateful to you for such a nice code review!
Alexander
Mark Rutland May 14, 2018, 2:07 p.m. UTC | #22
On Mon, May 14, 2018 at 04:53:12PM +0300, Alexander Popov wrote:
> On 14.05.2018 13:06, Mark Rutland wrote:
> > I think it is reasonable to panic() here even with CONFIG_VMAP_STACK
> > selected.
> 
> It's too tough for CONFIG_VMAP_STACK on x86 - the system can proceed to live.
> Anyway, the check_alloca() code will not be shared between x86 and arm64, I've
> described the reasons in this thread. So I can have BUG() for CONFIG_VMAP_STACK
> on x86 and Laura can consistently use panic() on arm64.

If we need arch-specific implementations anyway, then that's fine by me.

> >> So we should not do it in check_alloca() as well, just use BUG() and
> >> hope for the best.
> > 
> > Regardless of whether we BUG() or panic(), we're hoping for the best.
> > 
> > Consistently using panic() here will keep things simpler, so any failure
> > reported will be easier to reason about, and easier to debug.
> 
> Let me keep BUG() for !CONFIG_SCHED_STACK_END_CHECK. I'm wary of using panic()
> by default; let the distro/user decide this. I remember very well how I was
> shouted at when this one was merged:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ce6fa91b93630396ca220c33dd38ffc62686d499

Sure; my comments needn't hold up your patches.

Thanks,
Mark.
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf4938f6d..b0221db95dc9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -92,6 +92,7 @@  config ARM64
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 767598932549..d31ab80ff647 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -124,6 +124,12 @@  struct thread_struct {
 	unsigned long		fault_address;	/* fault info */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+	unsigned long           lowest_stack;
+#ifdef CONFIG_STACKLEAK_METRICS
+	unsigned long		prev_lowest_stack;
+#endif
+#endif
 };
 
 static inline void arch_thread_struct_whitelist(unsigned long *offset,
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index bf825f38d206..0ceea613c65b 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -55,6 +55,9 @@  arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
 arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
 arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
 
+arm64-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += erase.o
+KASAN_SANITIZE_erase.o	:= n
+
 obj-y					+= $(arm64-obj-y) vdso/ probes/
 obj-m					+= $(arm64-obj-m)
 head-y					:= head.o
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ec2ee720e33e..3144f1ebdc18 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -401,6 +401,11 @@  tsk	.req	x28		// current thread_info
 
 	.text
 
+	.macro	ERASE_KSTACK
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+	bl	erase_kstack
+#endif
+	.endm
 /*
  * Exception vectors.
  */
@@ -906,6 +911,7 @@  ret_to_user:
 	cbnz	x2, work_pending
 finish_ret_to_user:
 	enable_step_tsk x1, x2
+	ERASE_KSTACK
 	kernel_exit 0
 ENDPROC(ret_to_user)
 
diff --git a/arch/arm64/kernel/erase.c b/arch/arm64/kernel/erase.c
new file mode 100644
index 000000000000..b8b5648d893b
--- /dev/null
+++ b/arch/arm64/kernel/erase.c
@@ -0,0 +1,55 @@ 
+#include <linux/bug.h>
+#include <linux/sched.h>
+#include <asm/current.h>
+#include <asm/linkage.h>
+#include <asm/processor.h>
+
+asmlinkage void erase_kstack(void)
+{
+	unsigned long p = current->thread.lowest_stack;
+	unsigned long boundary = p & ~(THREAD_SIZE - 1);
+	unsigned long poison = 0;
+	const unsigned long check_depth = STACKLEAK_POISON_CHECK_DEPTH /
+							sizeof(unsigned long);
+
+	/*
+	 * Let's search for the poison value in the stack.
+	 * Start from the lowest_stack and go to the bottom.
+	 */
+	while (p > boundary && poison <= check_depth) {
+		if (*(unsigned long *)p == STACKLEAK_POISON)
+			poison++;
+		else
+			poison = 0;
+
+		p -= sizeof(unsigned long);
+	}
+
+	/*
+	 * One long int at the bottom of the thread stack is reserved and
+	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK).
+	 */
+	if (p == boundary)
+		p += sizeof(unsigned long);
+
+#ifdef CONFIG_STACKLEAK_METRICS
+	current->thread.prev_lowest_stack = p;
+#endif
+
+	/*
+	 * So let's write the poison value to the kernel stack.
+	 * Start from the address in p and move up till the new boundary.
+	 */
+	boundary = current_stack_pointer;
+
+	BUG_ON(boundary - p >= THREAD_SIZE);
+
+	while (p < boundary) {
+		*(unsigned long *)p = STACKLEAK_POISON;
+		p += sizeof(unsigned long);
+	}
+
+	/* Reset the lowest_stack value for the next syscall */
+	current->thread.lowest_stack = current_stack_pointer;
+}
+
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f08a2ed9db0d..156fa0a0da19 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -364,6 +364,9 @@  int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
 	p->thread.cpu_context.sp = (unsigned long)childregs;
 
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+	p->thread.lowest_stack = (unsigned long)task_stack_page(p);
+#endif
 	ptrace_hw_copy_thread(p);
 
 	return 0;
@@ -493,3 +496,16 @@  void arch_setup_new_exec(void)
 {
 	current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
 }
+
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+void __used check_alloca(unsigned long size)
+{
+	unsigned long sp, stack_left;
+
+	sp = current_stack_pointer;
+
+	stack_left = sp & (THREAD_SIZE - 1);
+	BUG_ON(stack_left < 256 || size >= stack_left - 256);
+}
+EXPORT_SYMBOL(check_alloca);
+#endif
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index a34e9290a699..25dd2a14560d 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -20,7 +20,8 @@  cflags-$(CONFIG_EFI_ARMSTUB)	+= -I$(srctree)/scripts/dtc/libfdt
 KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
 				   -D__NO_FORTIFY \
 				   $(call cc-option,-ffreestanding) \
-				   $(call cc-option,-fno-stack-protector)
+				   $(call cc-option,-fno-stack-protector) \
+				   $(DISABLE_STACKLEAK_PLUGIN)
 
 GCOV_PROFILE			:= n
 KASAN_SANITIZE			:= n
diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index 8d6070fc538f..6cc0e35d3324 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -37,11 +37,14 @@  ifdef CONFIG_GCC_PLUGINS
 
   gcc-plugin-$(CONFIG_GCC_PLUGIN_STACKLEAK)	+= stackleak_plugin.so
   gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)	+= -DSTACKLEAK_PLUGIN -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE)
+  ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+    DISABLE_STACKLEAK_PLUGIN		+= -fplugin-arg-stackleak_plugin-disable
+  endif
 
   GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y))
 
   export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR
-  export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN
+  export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN DISABLE_STACKLEAK_PLUGIN
 
   ifneq ($(PLUGINCC),)
     # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication.