diff mbox series

[3/3] xen/arm: stack check instrumentation

Message ID 20240729142421.137283-4-stewart.hildebrand@amd.com (mailing list archive)
State New
Headers show
Series Stack checking on Arm | expand

Commit Message

Stewart Hildebrand July 29, 2024, 2:24 p.m. UTC
Use the -finstrument-functions option to check that the stack pointer is
valid upon each function entry. Only enable it for debug builds due to
the overhead introduced.

Use per-cpu variables to store the stack base and nesting level. The
nesting level is used to avoid invoking __cyg_profile_func_enter
recursively.

A check is added for whether per-cpu data has been initialized. Since
TPIDR_EL2 resets to an unknown value, initialize it to an invalid value.

Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
---
RFC: I only tested this on arm64. I still need to test with arm32.

RFC: Should we put this under its own config option instead of reusing
     CONFIG_DEBUG?

RFC: Is there a better value for INVALID_PER_CPU_OFFSET? We can't use 0
     because 0 is a valid value for get_per_cpu_offset().
---
 xen/arch/arm/arch.mk             |  1 +
 xen/arch/arm/arm64/head.S        |  4 +++
 xen/arch/arm/domain.c            |  3 +++
 xen/arch/arm/include/asm/page.h  |  2 ++
 xen/arch/arm/include/asm/traps.h |  8 ++++++
 xen/arch/arm/setup.c             |  4 +++
 xen/arch/arm/smpboot.c           |  3 +++
 xen/arch/arm/traps.c             | 45 ++++++++++++++++++++++++++++++++
 8 files changed, 70 insertions(+)

Comments

Jan Beulich July 29, 2024, 3:01 p.m. UTC | #1
On 29.07.2024 16:24, Stewart Hildebrand wrote:
> RFC: Should we put this under its own config option instead of reusing
>      CONFIG_DEBUG?

I'd say yes, with it merely defaulting to DEBUG. And then have this in
generic code rather than ...

> --- a/xen/arch/arm/arch.mk
> +++ b/xen/arch/arm/arch.mk
> @@ -12,6 +12,7 @@ CFLAGS-$(CONFIG_ARM_32) += -mno-unaligned-access
>  CFLAGS-$(CONFIG_ARM_64) += -mcpu=generic
>  CFLAGS-$(CONFIG_ARM_64) += -mgeneral-regs-only # No fp registers etc
>  $(call cc-option-add,CFLAGS-$(CONFIG_ARM_64),CC,-mno-outline-atomics)
> +CFLAGS-$(CONFIG_DEBUG) += -finstrument-functions

... here, with arches simply indicating whether the Kconfig is usable
(i.e. arch code had been suitably enabled).

Jan
Julien Grall July 29, 2024, 6:36 p.m. UTC | #2
Hi Stewart,

On 29/07/2024 15:24, Stewart Hildebrand wrote:
> Use the -finstrument-functions option to check that the stack pointer is

Is the feature supported by both clang and GCC? If so, which from versions?

 From README, this is what we currently support.

       - For ARM 32-bit:
         - GCC 4.9 or later
         - GNU Binutils 2.24 or later
       - For ARM 64-bit:
         - GCC 5.1 or later
         - GNU Binutils 2.24 or later

We don't mention Clang, but I would expect a clang from 4-5 years should 
still be Arm (not cross-compile as we never fully added the support).


> valid upon each function entry. Only enable it for debug builds due to
> the overhead introduced.

How much overhead is it? We use debug builds during the dev cycle, so we 
need to make sure this is not noticeable.

In any case, it would be better if this new option is under its own 
kconfig options. We can separately decide whether it is on or off by 
default when CONFIG_DEBUG=y.

> 
> Use per-cpu variables to store the stack base and nesting level. The
> nesting level is used to avoid invoking __cyg_profile_func_enter
> recursively.
> 
> A check is added for whether per-cpu data has been initialized. Since
> TPIDR_EL2 resets to an unknown value, initialize it to an invalid value.
> 
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> RFC: I only tested this on arm64. I still need to test with arm32.
> 
> RFC: Should we put this under its own config option instead of reusing
>       CONFIG_DEBUG?

See above.

> 
> RFC: Is there a better value for INVALID_PER_CPU_OFFSET? We can't use 0
>       because 0 is a valid value for get_per_cpu_offset().

I would consider to use ~0 because this is very unlikely to be used as 
an offset (at least on arm64).

> ---
>   xen/arch/arm/arch.mk             |  1 +
>   xen/arch/arm/arm64/head.S        |  4 +++
>   xen/arch/arm/domain.c            |  3 +++
>   xen/arch/arm/include/asm/page.h  |  2 ++
>   xen/arch/arm/include/asm/traps.h |  8 ++++++
>   xen/arch/arm/setup.c             |  4 +++
>   xen/arch/arm/smpboot.c           |  3 +++
>   xen/arch/arm/traps.c             | 45 ++++++++++++++++++++++++++++++++
>   8 files changed, 70 insertions(+)
> 
> diff --git a/xen/arch/arm/arch.mk b/xen/arch/arm/arch.mk
> index 022dcda19224..c39cb65d183d 100644
> --- a/xen/arch/arm/arch.mk
> +++ b/xen/arch/arm/arch.mk
> @@ -12,6 +12,7 @@ CFLAGS-$(CONFIG_ARM_32) += -mno-unaligned-access
>   CFLAGS-$(CONFIG_ARM_64) += -mcpu=generic
>   CFLAGS-$(CONFIG_ARM_64) += -mgeneral-regs-only # No fp registers etc
>   $(call cc-option-add,CFLAGS-$(CONFIG_ARM_64),CC,-mno-outline-atomics)
> +CFLAGS-$(CONFIG_DEBUG) += -finstrument-functions
>   
>   ifneq ($(filter command line environment,$(origin CONFIG_EARLY_PRINTK)),)
>       $(error You must use 'make menuconfig' to enable/disable early printk now)
> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
> index 2fa07dc3a04b..4a332b9cbc7c 100644
> --- a/xen/arch/arm/arm64/head.S
> +++ b/xen/arch/arm/arm64/head.S
> @@ -399,6 +399,10 @@ FUNC_LOCAL(cpu_init)
>            * than SP_EL0.
>            */
>           msr spsel, #1
> +

I would add a comment explaining what we are doing.

> +        ldr   x0, =INVALID_PER_CPU_OFFSET
> +        msr   tpidr_el2, x0
> +
>           ret
>   END(cpu_init)
>   
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 7cfcefd27944..93ebe6e5f8af 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -26,6 +26,7 @@
>   #include <asm/procinfo.h>
>   #include <asm/regs.h>
>   #include <asm/tee/tee.h>
> +#include <asm/traps.h>
>   #include <asm/vfp.h>
>   #include <asm/vgic.h>
>   #include <asm/vtimer.h>
> @@ -328,6 +329,8 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
>   
>       set_current(next);
>   
> +    stack_set(next->arch.stack);
> +
>       prev = __context_switch(prev, next);
>   
>       schedule_tail(prev);
> diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
> index 69f817d1e68a..6b5aaf1eb3ff 100644
> --- a/xen/arch/arm/include/asm/page.h
> +++ b/xen/arch/arm/include/asm/page.h
> @@ -7,6 +7,8 @@
>   #include <asm/lpae.h>
>   #include <asm/sysregs.h>
>   
> +#define INVALID_PER_CPU_OFFSET            _AC(0x1, UL)
> +
>   /* Shareability values for the LPAE entries */
>   #define LPAE_SH_NON_SHAREABLE 0x0
>   #define LPAE_SH_UNPREDICTALE  0x1
> diff --git a/xen/arch/arm/include/asm/traps.h b/xen/arch/arm/include/asm/traps.h
> index 9a60dbf70e5b..13e6997c2643 100644
> --- a/xen/arch/arm/include/asm/traps.h
> +++ b/xen/arch/arm/include/asm/traps.h
> @@ -118,6 +118,14 @@ static inline register_t sign_extend(const struct hsr_dabt dabt, register_t r)
>   
>   void finalize_instr_emulation(const struct instr_details *instr);
>   
> +#ifdef CONFIG_DEBUG
> +void stack_check_init(void);
> +void stack_set(unsigned char *base);
> +#else
> +static inline void stack_check_init(void) { }
> +static inline void stack_set(unsigned char *base) { }
> +#endif
> +
>   #endif /* __ASM_ARM_TRAPS__ */
>   /*
>    * Local variables:
> diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
> index 0c2fdaceaf21..07d07feff602 100644
> --- a/xen/arch/arm/setup.c
> +++ b/xen/arch/arm/setup.c
> @@ -47,6 +47,7 @@
>   #include <asm/setup.h>
>   #include <xsm/xsm.h>
>   #include <asm/acpi.h>
> +#include <asm/traps.h>
>   
>   struct bootinfo __initdata bootinfo = BOOTINFO_INIT;
>   
> @@ -733,6 +734,8 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
>       percpu_init_areas();
>       set_processor_id(0); /* needed early, for smp_processor_id() */
>   
> +    stack_check_init();
> +
>       /* Initialize traps early allow us to get backtrace when an error occurred */
>       init_traps();
>   
> @@ -924,6 +927,7 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
>        * since the static one we're running on is about to be freed. */
>       memcpy(idle_vcpu[0]->arch.cpu_info, get_cpu_info(),
>              sizeof(struct cpu_info));
> +    stack_set(idle_vcpu[0]->arch.stack);
>       switch_stack_and_jump(idle_vcpu[0]->arch.cpu_info, init_done);
>   }
>   
> diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
> index 04e363088d60..1c689f2caed7 100644
> --- a/xen/arch/arm/smpboot.c
> +++ b/xen/arch/arm/smpboot.c
> @@ -30,6 +30,7 @@
>   #include <asm/psci.h>
>   #include <asm/acpi.h>
>   #include <asm/tee/tee.h>
> +#include <asm/traps.h>
>   
>   /* Override macros from asm/page.h to make them work with mfn_t */
>   #undef virt_to_mfn
> @@ -329,6 +330,8 @@ void asmlinkage start_secondary(void)
>   
>       set_processor_id(cpuid);
>   
> +    stack_check_init();
> +
>       identify_cpu(&current_cpu_data);
>       processor_setup();
>   
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index aac6c599f878..b4890eec7ec4 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -2325,6 +2325,51 @@ void asmlinkage leave_hypervisor_to_guest(void)
>           arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2_FID, 0, NULL);
>   }
>   
> +#ifdef CONFIG_DEBUG

Loooking at the code below. Aside the stack pointer part, nothing seems 
specific to Arm. So maybe this could be moved to common code?

> +DEFINE_PER_CPU(unsigned int, stack_check_nesting);
> +DEFINE_PER_CPU(unsigned char *, stack_base);

I think this could be "const unsigned char *" as the stack should not be 
modified directly.

> +
> +void __attribute__((no_instrument_function)) stack_set(unsigned char *base)
> +{
> +    this_cpu(stack_base) = base;
> +}
> +
> +void __init __attribute__((no_instrument_function)) stack_check_init(void)
> +{
> +    this_cpu(stack_check_nesting) = 0;
> +    stack_set(init_data.stack);
> +}
> +
> +__attribute__((no_instrument_function))
> +void __cyg_profile_func_enter(void *this_fn, void *call_site)
> +{
> +    unsigned char *sp;
> +
> +    if ( get_per_cpu_offset() == INVALID_PER_CPU_OFFSET )
> +        return;
> +
> +    asm volatile ("mov %0, sp" : "=r" (sp) );
> +
> +    if ( sp < this_cpu(stack_base) ||
> +         sp > (this_cpu(stack_base) + STACK_SIZE) )

The top of the stack is used to store struct cpu_info. So you want to 
substract its size (see arch_vcpu_create()).

> +    {
> +        if ( this_cpu(stack_check_nesting) )
> +            return;
> +
> +        this_cpu(stack_check_nesting)++;

Looking at the use, it only seems to be used as "print/panic once". So 
maybe use a bool to avoid any overflow.

> +        printk("CPU %d stack pointer out of bounds (sp %#lx, stack base %#lx)\n",
> +               smp_processor_id(), (uintptr_t)sp,
> +               (uintptr_t)this_cpu(stack_base));
> +        BUG();

I would consider to call panic(). But is it safe to call any of this if 
we blew the stack? IOW, should we have a buffer?

Cheers,
Julien Grall July 29, 2024, 6:50 p.m. UTC | #3
Hi again,

On 29/07/2024 15:24, Stewart Hildebrand wrote:
> Use the -finstrument-functions option to check that the stack pointer is
> valid upon each function entry. Only enable it for debug builds due to
> the overhead introduced.
> 
> Use per-cpu variables to store the stack base and nesting level. The
> nesting level is used to avoid invoking __cyg_profile_func_enter
> recursively.
> 
> A check is added for whether per-cpu data has been initialized. Since
> TPIDR_EL2 resets to an unknown value, initialize it to an invalid value.
> 
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> RFC: I only tested this on arm64. I still need to test with arm32.
> 
> RFC: Should we put this under its own config option instead of reusing
>       CONFIG_DEBUG?
> 
> RFC: Is there a better value for INVALID_PER_CPU_OFFSET? We can't use 0
>       because 0 is a valid value for get_per_cpu_offset().
> ---
>   xen/arch/arm/arch.mk             |  1 +
>   xen/arch/arm/arm64/head.S        |  4 +++
>   xen/arch/arm/domain.c            |  3 +++
>   xen/arch/arm/include/asm/page.h  |  2 ++
>   xen/arch/arm/include/asm/traps.h |  8 ++++++
>   xen/arch/arm/setup.c             |  4 +++
>   xen/arch/arm/smpboot.c           |  3 +++
>   xen/arch/arm/traps.c             | 45 ++++++++++++++++++++++++++++++++
>   8 files changed, 70 insertions(+)
> 
> diff --git a/xen/arch/arm/arch.mk b/xen/arch/arm/arch.mk
> index 022dcda19224..c39cb65d183d 100644
> --- a/xen/arch/arm/arch.mk
> +++ b/xen/arch/arm/arch.mk
> @@ -12,6 +12,7 @@ CFLAGS-$(CONFIG_ARM_32) += -mno-unaligned-access
>   CFLAGS-$(CONFIG_ARM_64) += -mcpu=generic
>   CFLAGS-$(CONFIG_ARM_64) += -mgeneral-regs-only # No fp registers etc
>   $(call cc-option-add,CFLAGS-$(CONFIG_ARM_64),CC,-mno-outline-atomics)
> +CFLAGS-$(CONFIG_DEBUG) += -finstrument-functions
>   
>   ifneq ($(filter command line environment,$(origin CONFIG_EARLY_PRINTK)),)
>       $(error You must use 'make menuconfig' to enable/disable early printk now)
> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
> index 2fa07dc3a04b..4a332b9cbc7c 100644
> --- a/xen/arch/arm/arm64/head.S
> +++ b/xen/arch/arm/arm64/head.S
> @@ -399,6 +399,10 @@ FUNC_LOCAL(cpu_init)
>            * than SP_EL0.
>            */
>           msr spsel, #1
> +
> +        ldr   x0, =INVALID_PER_CPU_OFFSET
> +        msr   tpidr_el2, x0
> +
>           ret
>   END(cpu_init)
>   
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 7cfcefd27944..93ebe6e5f8af 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -26,6 +26,7 @@
>   #include <asm/procinfo.h>
>   #include <asm/regs.h>
>   #include <asm/tee/tee.h>
> +#include <asm/traps.h>
>   #include <asm/vfp.h>
>   #include <asm/vgic.h>
>   #include <asm/vtimer.h>
> @@ -328,6 +329,8 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
>   
>       set_current(next);
>   
> +    stack_set(next->arch.stack);
> +
>       prev = __context_switch(prev, next);
>   
>       schedule_tail(prev);
> diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
> index 69f817d1e68a..6b5aaf1eb3ff 100644
> --- a/xen/arch/arm/include/asm/page.h
> +++ b/xen/arch/arm/include/asm/page.h
> @@ -7,6 +7,8 @@
>   #include <asm/lpae.h>
>   #include <asm/sysregs.h>
>   
> +#define INVALID_PER_CPU_OFFSET            _AC(0x1, UL)
> +
>   /* Shareability values for the LPAE entries */
>   #define LPAE_SH_NON_SHAREABLE 0x0
>   #define LPAE_SH_UNPREDICTALE  0x1
> diff --git a/xen/arch/arm/include/asm/traps.h b/xen/arch/arm/include/asm/traps.h
> index 9a60dbf70e5b..13e6997c2643 100644
> --- a/xen/arch/arm/include/asm/traps.h
> +++ b/xen/arch/arm/include/asm/traps.h
> @@ -118,6 +118,14 @@ static inline register_t sign_extend(const struct hsr_dabt dabt, register_t r)
>   
>   void finalize_instr_emulation(const struct instr_details *instr);
>   
> +#ifdef CONFIG_DEBUG
> +void stack_check_init(void);
> +void stack_set(unsigned char *base);
> +#else
> +static inline void stack_check_init(void) { }
> +static inline void stack_set(unsigned char *base) { } > +#endif
> +
>   #endif /* __ASM_ARM_TRAPS__ */
>   /*
>    * Local variables:
> diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
> index 0c2fdaceaf21..07d07feff602 100644
> --- a/xen/arch/arm/setup.c
> +++ b/xen/arch/arm/setup.c
> @@ -47,6 +47,7 @@
>   #include <asm/setup.h>
>   #include <xsm/xsm.h>
>   #include <asm/acpi.h>
> +#include <asm/traps.h>
>   
>   struct bootinfo __initdata bootinfo = BOOTINFO_INIT;
>   
> @@ -733,6 +734,8 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
>       percpu_init_areas();
>       set_processor_id(0); /* needed early, for smp_processor_id() */
>   
> +    stack_check_init();
> +
>       /* Initialize traps early allow us to get backtrace when an error occurred */
>       init_traps();
>   
> @@ -924,6 +927,7 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
>        * since the static one we're running on is about to be freed. */
>       memcpy(idle_vcpu[0]->arch.cpu_info, get_cpu_info(),
>              sizeof(struct cpu_info));
> +    stack_set(idle_vcpu[0]->arch.stack);
>       switch_stack_and_jump(idle_vcpu[0]->arch.cpu_info, init_done);
>   }
>   
> diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
> index 04e363088d60..1c689f2caed7 100644
> --- a/xen/arch/arm/smpboot.c
> +++ b/xen/arch/arm/smpboot.c
> @@ -30,6 +30,7 @@
>   #include <asm/psci.h>
>   #include <asm/acpi.h>
>   #include <asm/tee/tee.h>
> +#include <asm/traps.h>
>   
>   /* Override macros from asm/page.h to make them work with mfn_t */
>   #undef virt_to_mfn
> @@ -329,6 +330,8 @@ void asmlinkage start_secondary(void)
>   
>       set_processor_id(cpuid);
>   
> +    stack_check_init();
> +
>       identify_cpu(&current_cpu_data);
>       processor_setup();
>   
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index aac6c599f878..b4890eec7ec4 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -2325,6 +2325,51 @@ void asmlinkage leave_hypervisor_to_guest(void)
>           arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2_FID, 0, NULL);
>   }
>   
> +#ifdef CONFIG_DEBUG
> +DEFINE_PER_CPU(unsigned int, stack_check_nesting);
> +DEFINE_PER_CPU(unsigned char *, stack_base);

While looking at the code, I just realized that this should be 
equivalent to current->arch.base. So do we actually need stack_base?

Cheers,
Stewart Hildebrand July 29, 2024, 7:40 p.m. UTC | #4
On 7/29/24 14:50, Julien Grall wrote:
> Hi again,
> 
> On 29/07/2024 15:24, Stewart Hildebrand wrote:
>> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
>> index aac6c599f878..b4890eec7ec4 100644
>> --- a/xen/arch/arm/traps.c
>> +++ b/xen/arch/arm/traps.c
>> @@ -2325,6 +2325,51 @@ void asmlinkage leave_hypervisor_to_guest(void)
>>           arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2_FID, 0, NULL);
>>   }
>>   +#ifdef CONFIG_DEBUG
>> +DEFINE_PER_CPU(unsigned int, stack_check_nesting);
>> +DEFINE_PER_CPU(unsigned char *, stack_base);
> 
> While looking at the code, I just realized that this should be equivalent to current->arch.base.

current->arch.stack

That's true only after the idle vcpu stacks have been allocated.

> So do we actually need stack_base?

This is a way to enable stack instrumentation much earlier during boot
when we are still using the static boot stack.
Julien Grall July 29, 2024, 8:37 p.m. UTC | #5
Hi,

On 29/07/2024 20:40, Stewart Hildebrand wrote:
> On 7/29/24 14:50, Julien Grall wrote:
>> Hi again,
>>
>> On 29/07/2024 15:24, Stewart Hildebrand wrote:
>>> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
>>> index aac6c599f878..b4890eec7ec4 100644
>>> --- a/xen/arch/arm/traps.c
>>> +++ b/xen/arch/arm/traps.c
>>> @@ -2325,6 +2325,51 @@ void asmlinkage leave_hypervisor_to_guest(void)
>>>            arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2_FID, 0, NULL);
>>>    }
>>>    +#ifdef CONFIG_DEBUG
>>> +DEFINE_PER_CPU(unsigned int, stack_check_nesting);
>>> +DEFINE_PER_CPU(unsigned char *, stack_base);
>>
>> While looking at the code, I just realized that this should be equivalent to current->arch.base.
> 
> current->arch.stack
> 
> That's true only after the idle vcpu stacks have been allocated.
> 
>> So do we actually need stack_base?
> 
> This is a way to enable stack instrumentation much earlier during boot
> when we are still using the static boot stack.

Sure. But we are only partially checking the static boot stack. I am not 
entirely sure if there is any value for that because at that point the 
stack would be barely used.

Anyway, this is only for debug build so far, so I am ok to keep it.

Cheers,
Jan Beulich July 30, 2024, 6:48 a.m. UTC | #6
On 29.07.2024 20:36, Julien Grall wrote:
> On 29/07/2024 15:24, Stewart Hildebrand wrote:
>> Use the -finstrument-functions option to check that the stack pointer is
> 
> Is the feature supported by both clang and GCC? If so, which from versions?
> 
>  From README, this is what we currently support.
> 
>        - For ARM 32-bit:
>          - GCC 4.9 or later
>          - GNU Binutils 2.24 or later
>        - For ARM 64-bit:
>          - GCC 5.1 or later
>          - GNU Binutils 2.24 or later
> 
> We don't mention Clang, but I would expect a clang from 4-5 years should 
> still be Arm (not cross-compile as we never fully added the support).

I was wondering the same, but already on patch 1 and with other architectures
also in mind, as there's common (XSM) code gaining respective attributes. For
gcc I checked that even 4.1 supports that attribute, so I expect we're fine
here (and double checking with godbolt, Clang 3.0.0 also supports the option).

Jan
Stewart Hildebrand July 30, 2024, 5:50 p.m. UTC | #7
On 7/29/24 14:36, Julien Grall wrote:
> Hi Stewart,
> 
> On 29/07/2024 15:24, Stewart Hildebrand wrote:
>> Use the -finstrument-functions option to check that the stack pointer is
> 
> Is the feature supported by both clang and GCC?

Yes, I tested both.

> If so, which from versions?

gcc since at least 1998, not sure what version specifically.

clang since v2.8 (2010)

> 
> From README, this is what we currently support.
> 
>       - For ARM 32-bit:
>         - GCC 4.9 or later
>         - GNU Binutils 2.24 or later
>       - For ARM 64-bit:
>         - GCC 5.1 or later
>         - GNU Binutils 2.24 or later
> 
> We don't mention Clang, but I would expect a clang from 4-5 years should still be Arm (not cross-compile as we never fully added the support).

There is a way cross compile with clang today, but still using gnu
linker and such. Here's my clang build rune:

make -j $(nproc) \
    clang=y \
    CC="clang --target=aarch64-none-linux-gnu -march=armv8a+nocrypto" \
    CXX="clang++ --target=aarch64-none-linux-gnu -march=armv8a+nocrypto" \
    HOSTCC=clang \
    HOSTCXX=clang++ \
    XEN_TARGET_ARCH=arm64 \
    CROSS_COMPILE=aarch64-none-linux-gnu- \
    dist-xen

> 
> 
>> valid upon each function entry. Only enable it for debug builds due to
>> the overhead introduced.
> 
> How much overhead is it? We use debug builds during the dev cycle, so we need to make sure this is not noticeable.

Xen binary size without instrumentation: 1.3M
Xen binary size with instrumentation: 1.9M

Subjectively, the Xen boot time on ZCU102 hardware seems comparable.

On qemu-system-aarch64, unfortunately, I'm seeing about a 7x slowdown in
Xen boot time. It seems page allocation operations are particularly
affected.

> 
> In any case, it would be better if this new option is under its own kconfig options. We can separately decide whether it is on or off by default when CONFIG_DEBUG=y.

Will do. Given the significant overhead when running on qemu, I think I
must reluctantly suggest that it be off by default. For CI runs with
real hardware, we could turn it on.

> 
>>
>> Use per-cpu variables to store the stack base and nesting level. The
>> nesting level is used to avoid invoking __cyg_profile_func_enter
>> recursively.
>>
>> A check is added for whether per-cpu data has been initialized. Since
>> TPIDR_EL2 resets to an unknown value, initialize it to an invalid value.
>>
>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>> ---
>> RFC: I only tested this on arm64. I still need to test with arm32.
>>
>> RFC: Should we put this under its own config option instead of reusing
>>       CONFIG_DEBUG?
> 
> See above.
> 
>>
>> RFC: Is there a better value for INVALID_PER_CPU_OFFSET? We can't use 0
>>       because 0 is a valid value for get_per_cpu_offset().
> 
> I would consider to use ~0 because this is very unlikely to be used as an offset (at least on arm64).

Yep, ~0 works, will do.

> 
>> ---
>>   xen/arch/arm/arch.mk             |  1 +
>>   xen/arch/arm/arm64/head.S        |  4 +++
>>   xen/arch/arm/domain.c            |  3 +++
>>   xen/arch/arm/include/asm/page.h  |  2 ++
>>   xen/arch/arm/include/asm/traps.h |  8 ++++++
>>   xen/arch/arm/setup.c             |  4 +++
>>   xen/arch/arm/smpboot.c           |  3 +++
>>   xen/arch/arm/traps.c             | 45 ++++++++++++++++++++++++++++++++
>>   8 files changed, 70 insertions(+)
>>
>> diff --git a/xen/arch/arm/arch.mk b/xen/arch/arm/arch.mk
>> index 022dcda19224..c39cb65d183d 100644
>> --- a/xen/arch/arm/arch.mk
>> +++ b/xen/arch/arm/arch.mk
>> @@ -12,6 +12,7 @@ CFLAGS-$(CONFIG_ARM_32) += -mno-unaligned-access
>>   CFLAGS-$(CONFIG_ARM_64) += -mcpu=generic
>>   CFLAGS-$(CONFIG_ARM_64) += -mgeneral-regs-only # No fp registers etc
>>   $(call cc-option-add,CFLAGS-$(CONFIG_ARM_64),CC,-mno-outline-atomics)
>> +CFLAGS-$(CONFIG_DEBUG) += -finstrument-functions
>>     ifneq ($(filter command line environment,$(origin CONFIG_EARLY_PRINTK)),)
>>       $(error You must use 'make menuconfig' to enable/disable early printk now)
>> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
>> index 2fa07dc3a04b..4a332b9cbc7c 100644
>> --- a/xen/arch/arm/arm64/head.S
>> +++ b/xen/arch/arm/arm64/head.S
>> @@ -399,6 +399,10 @@ FUNC_LOCAL(cpu_init)
>>            * than SP_EL0.
>>            */
>>           msr spsel, #1
>> +
> 
> I would add a comment explaining what we are doing.

Will do.

> 
>> +        ldr   x0, =INVALID_PER_CPU_OFFSET
>> +        msr   tpidr_el2, x0
>> +
>>           ret
>>   END(cpu_init)
>>   diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>> index 7cfcefd27944..93ebe6e5f8af 100644
>> --- a/xen/arch/arm/domain.c
>> +++ b/xen/arch/arm/domain.c
>> @@ -26,6 +26,7 @@
>>   #include <asm/procinfo.h>
>>   #include <asm/regs.h>
>>   #include <asm/tee/tee.h>
>> +#include <asm/traps.h>
>>   #include <asm/vfp.h>
>>   #include <asm/vgic.h>
>>   #include <asm/vtimer.h>
>> @@ -328,6 +329,8 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
>>         set_current(next);
>>   +    stack_set(next->arch.stack);
>> +
>>       prev = __context_switch(prev, next);
>>         schedule_tail(prev);
>> diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
>> index 69f817d1e68a..6b5aaf1eb3ff 100644
>> --- a/xen/arch/arm/include/asm/page.h
>> +++ b/xen/arch/arm/include/asm/page.h
>> @@ -7,6 +7,8 @@
>>   #include <asm/lpae.h>
>>   #include <asm/sysregs.h>
>>   +#define INVALID_PER_CPU_OFFSET            _AC(0x1, UL)
>> +
>>   /* Shareability values for the LPAE entries */
>>   #define LPAE_SH_NON_SHAREABLE 0x0
>>   #define LPAE_SH_UNPREDICTALE  0x1
>> diff --git a/xen/arch/arm/include/asm/traps.h b/xen/arch/arm/include/asm/traps.h
>> index 9a60dbf70e5b..13e6997c2643 100644
>> --- a/xen/arch/arm/include/asm/traps.h
>> +++ b/xen/arch/arm/include/asm/traps.h
>> @@ -118,6 +118,14 @@ static inline register_t sign_extend(const struct hsr_dabt dabt, register_t r)
>>     void finalize_instr_emulation(const struct instr_details *instr);
>>   +#ifdef CONFIG_DEBUG
>> +void stack_check_init(void);
>> +void stack_set(unsigned char *base);
>> +#else
>> +static inline void stack_check_init(void) { }
>> +static inline void stack_set(unsigned char *base) { }
>> +#endif
>> +
>>   #endif /* __ASM_ARM_TRAPS__ */
>>   /*
>>    * Local variables:
>> diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
>> index 0c2fdaceaf21..07d07feff602 100644
>> --- a/xen/arch/arm/setup.c
>> +++ b/xen/arch/arm/setup.c
>> @@ -47,6 +47,7 @@
>>   #include <asm/setup.h>
>>   #include <xsm/xsm.h>
>>   #include <asm/acpi.h>
>> +#include <asm/traps.h>
>>     struct bootinfo __initdata bootinfo = BOOTINFO_INIT;
>>   @@ -733,6 +734,8 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
>>       percpu_init_areas();
>>       set_processor_id(0); /* needed early, for smp_processor_id() */
>>   +    stack_check_init();
>> +
>>       /* Initialize traps early allow us to get backtrace when an error occurred */
>>       init_traps();
>>   @@ -924,6 +927,7 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
>>        * since the static one we're running on is about to be freed. */
>>       memcpy(idle_vcpu[0]->arch.cpu_info, get_cpu_info(),
>>              sizeof(struct cpu_info));
>> +    stack_set(idle_vcpu[0]->arch.stack);
>>       switch_stack_and_jump(idle_vcpu[0]->arch.cpu_info, init_done);
>>   }
>>   diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
>> index 04e363088d60..1c689f2caed7 100644
>> --- a/xen/arch/arm/smpboot.c
>> +++ b/xen/arch/arm/smpboot.c
>> @@ -30,6 +30,7 @@
>>   #include <asm/psci.h>
>>   #include <asm/acpi.h>
>>   #include <asm/tee/tee.h>
>> +#include <asm/traps.h>
>>     /* Override macros from asm/page.h to make them work with mfn_t */
>>   #undef virt_to_mfn
>> @@ -329,6 +330,8 @@ void asmlinkage start_secondary(void)
>>         set_processor_id(cpuid);
>>   +    stack_check_init();
>> +
>>       identify_cpu(&current_cpu_data);
>>       processor_setup();
>>   diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
>> index aac6c599f878..b4890eec7ec4 100644
>> --- a/xen/arch/arm/traps.c
>> +++ b/xen/arch/arm/traps.c
>> @@ -2325,6 +2325,51 @@ void asmlinkage leave_hypervisor_to_guest(void)
>>           arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2_FID, 0, NULL);
>>   }
>>   +#ifdef CONFIG_DEBUG
> 
> Loooking at the code below. Aside the stack pointer part, nothing seems specific to Arm. So maybe this could be moved to common code?

Yes, I suppose so.

> 
>> +DEFINE_PER_CPU(unsigned int, stack_check_nesting);
>> +DEFINE_PER_CPU(unsigned char *, stack_base);
> 
> I think this could be "const unsigned char *" as the stack should not be modified directly.

Every time there's a vcpu context switch we will have a new stack.

> 
>> +
>> +void __attribute__((no_instrument_function)) stack_set(unsigned char *base)
>> +{
>> +    this_cpu(stack_base) = base;
>> +}
>> +
>> +void __init __attribute__((no_instrument_function)) stack_check_init(void)
>> +{
>> +    this_cpu(stack_check_nesting) = 0;
>> +    stack_set(init_data.stack);
>> +}
>> +
>> +__attribute__((no_instrument_function))
>> +void __cyg_profile_func_enter(void *this_fn, void *call_site)
>> +{
>> +    unsigned char *sp;
>> +
>> +    if ( get_per_cpu_offset() == INVALID_PER_CPU_OFFSET )
>> +        return;
>> +
>> +    asm volatile ("mov %0, sp" : "=r" (sp) );
>> +
>> +    if ( sp < this_cpu(stack_base) ||
>> +         sp > (this_cpu(stack_base) + STACK_SIZE) )
> 
> The top of the stack is used to store struct cpu_info. So you want to substract its size (see arch_vcpu_create()).

Will do.

> 
>> +    {
>> +        if ( this_cpu(stack_check_nesting) )
>> +            return;
>> +
>> +        this_cpu(stack_check_nesting)++;
> 
> Looking at the use, it only seems to be used as "print/panic once". So maybe use a bool to avoid any overflow.

It will only ever be incremented once. I'll still change it to a bool,
this should make it more obvious.

> 
>> +        printk("CPU %d stack pointer out of bounds (sp %#lx, stack base %#lx)\n",
>> +               smp_processor_id(), (uintptr_t)sp,
>> +               (uintptr_t)this_cpu(stack_base));
>> +        BUG();
> 
> I would consider to call panic().

panic() alone doesn't show the stack trace / call trace.

> But is it safe to call any of this if we blew the stack?

Nope, it sure isn't!

> IOW, should we have a buffer?

Yes. After some experimentation, I found that this printk and a WARN
(similar to BUG, but resumes execution and allows me to collect these
metrics) uses approximately 1632 bytes of stack. Assuming BUG uses a
similar amount of stack as WARN, and adding in a comfortable margin for
error, I'll add a 4096 byte buffer (i.e. invoke the print/BUG with 4096
bytes remaining on the stack).
Stewart Hildebrand July 30, 2024, 5:59 p.m. UTC | #8
On 7/29/24 16:37, Julien Grall wrote:
> Hi,
> 
> On 29/07/2024 20:40, Stewart Hildebrand wrote:
>> On 7/29/24 14:50, Julien Grall wrote:
>>> Hi again,
>>>
>>> On 29/07/2024 15:24, Stewart Hildebrand wrote:
>>>> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
>>>> index aac6c599f878..b4890eec7ec4 100644
>>>> --- a/xen/arch/arm/traps.c
>>>> +++ b/xen/arch/arm/traps.c
>>>> @@ -2325,6 +2325,51 @@ void asmlinkage leave_hypervisor_to_guest(void)
>>>>            arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2_FID, 0, NULL);
>>>>    }
>>>>    +#ifdef CONFIG_DEBUG
>>>> +DEFINE_PER_CPU(unsigned int, stack_check_nesting);
>>>> +DEFINE_PER_CPU(unsigned char *, stack_base);
>>>
>>> While looking at the code, I just realized that this should be equivalent to current->arch.base.
>>
>> current->arch.stack
>>
>> That's true only after the idle vcpu stacks have been allocated.
>>
>>> So do we actually need stack_base?
>>
>> This is a way to enable stack instrumentation much earlier during boot
>> when we are still using the static boot stack.
> 
> Sure. But we are only partially checking the static boot stack.

The stack checking begins just a few lines into C world, after the
percpu_init_areas(), set_processor_id(), and stack_check_init() calls in
arch/arm/setup.c:start_xen().

> I am not entirely sure if there is any value for that because at that point the stack would be barely used.

The entirety of start_xen() uses the boot stack, and it makes plenty of
device tree parsing calls (where there is recursion) and performs domain
creation, which hits the stack significantly.

arch/arm/dom0less-build.c: In function ‘construct_domU’:
arch/arm/dom0less-build.c:742:19: warning: stack usage is 7824 bytes [-Wstack-usage=]
  742 | static int __init construct_domU(struct domain *d,
      |                   ^~~~~~~~~~~~~~

arch/arm/domain_build.c: In function ‘make_memory_node’:
arch/arm/domain_build.c:788:12: warning: stack usage is 4720 bytes [-Wstack-usage=]
  788 | int __init make_memory_node(const struct kernel_info *kinfo, int addrcells,
      |            ^~~~~~~~~~~~~~~~

arch/arm/domain_build.c: In function ‘construct_dom0’:
arch/arm/domain_build.c:2120:19: warning: stack usage is 7776 bytes [-Wstack-usage=]
 2120 | static int __init construct_dom0(struct domain *d)
      |                   ^~~~~~~~~~~~~~

> 
> Anyway, this is only for debug build so far, so I am ok to keep it.
> 
> Cheers,
>
Julien Grall July 30, 2024, 6:07 p.m. UTC | #9
Hi,

On 30/07/2024 18:50, Stewart Hildebrand wrote:
> On 7/29/24 14:36, Julien Grall wrote:
>> Hi Stewart,
>>
>> On 29/07/2024 15:24, Stewart Hildebrand wrote:
>>> Use the -finstrument-functions option to check that the stack pointer is
>>
>> Is the feature supported by both clang and GCC?
> 
> Yes, I tested both.
> 
>> If so, which from versions?
> 
> gcc since at least 1998, not sure what version specifically.
> 
> clang since v2.8 (2010)

That's quite old :). Thanks for checking.

>>
>> In any case, it would be better if this new option is under its own kconfig options. We can separately decide whether it is on or off by default when CONFIG_DEBUG=y.
> 
> Will do. Given the significant overhead when running on qemu, I think I
> must reluctantly suggest that it be off by default. For CI runs with
> real hardware, we could turn it on.

That makes sense.

>>> Use per-cpu variables to store the stack base and nesting level. The
>>> nesting level is used to avoid invoking __cyg_profile_func_enter
>>> recursively.
>>>
>>> A check is added for whether per-cpu data has been initialized. Since
>>> TPIDR_EL2 resets to an unknown value, initialize it to an invalid value.
>>>
>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>> ---
>>> RFC: I only tested this on arm64. I still need to test with arm32.
>>>
>>> RFC: Should we put this under its own config option instead of reusing
>>>        CONFIG_DEBUG?
>>
>> See above.
>>
>>>
>>> RFC: Is there a better value for INVALID_PER_CPU_OFFSET? We can't use 0
>>>        because 0 is a valid value for get_per_cpu_offset().
>>
>> I would consider to use ~0 because this is very unlikely to be used as an offset (at least on arm64).
> 
> Yep, ~0 works, will do.
> 
>>
>>> ---
>>>    xen/arch/arm/arch.mk             |  1 +
>>>    xen/arch/arm/arm64/head.S        |  4 +++
>>>    xen/arch/arm/domain.c            |  3 +++
>>>    xen/arch/arm/include/asm/page.h  |  2 ++
>>>    xen/arch/arm/include/asm/traps.h |  8 ++++++
>>>    xen/arch/arm/setup.c             |  4 +++
>>>    xen/arch/arm/smpboot.c           |  3 +++
>>>    xen/arch/arm/traps.c             | 45 ++++++++++++++++++++++++++++++++
>>>    8 files changed, 70 insertions(+)
>>>
>>> diff --git a/xen/arch/arm/arch.mk b/xen/arch/arm/arch.mk
>>> index 022dcda19224..c39cb65d183d 100644
>>> --- a/xen/arch/arm/arch.mk
>>> +++ b/xen/arch/arm/arch.mk
>>> @@ -12,6 +12,7 @@ CFLAGS-$(CONFIG_ARM_32) += -mno-unaligned-access
>>>    CFLAGS-$(CONFIG_ARM_64) += -mcpu=generic
>>>    CFLAGS-$(CONFIG_ARM_64) += -mgeneral-regs-only # No fp registers etc
>>>    $(call cc-option-add,CFLAGS-$(CONFIG_ARM_64),CC,-mno-outline-atomics)
>>> +CFLAGS-$(CONFIG_DEBUG) += -finstrument-functions
>>>      ifneq ($(filter command line environment,$(origin CONFIG_EARLY_PRINTK)),)
>>>        $(error You must use 'make menuconfig' to enable/disable early printk now)
>>> diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
>>> index 2fa07dc3a04b..4a332b9cbc7c 100644
>>> --- a/xen/arch/arm/arm64/head.S
>>> +++ b/xen/arch/arm/arm64/head.S
>>> @@ -399,6 +399,10 @@ FUNC_LOCAL(cpu_init)
>>>             * than SP_EL0.
>>>             */
>>>            msr spsel, #1
>>> +
>>
>> I would add a comment explaining what we are doing.
> 
> Will do.
> 
>>
>>> +        ldr   x0, =INVALID_PER_CPU_OFFSET
>>> +        msr   tpidr_el2, x0
>>> +
>>>            ret
>>>    END(cpu_init)
>>>    diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>>> index 7cfcefd27944..93ebe6e5f8af 100644
>>> --- a/xen/arch/arm/domain.c
>>> +++ b/xen/arch/arm/domain.c
>>> @@ -26,6 +26,7 @@
>>>    #include <asm/procinfo.h>
>>>    #include <asm/regs.h>
>>>    #include <asm/tee/tee.h>
>>> +#include <asm/traps.h>
>>>    #include <asm/vfp.h>
>>>    #include <asm/vgic.h>
>>>    #include <asm/vtimer.h>
>>> @@ -328,6 +329,8 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
>>>          set_current(next);
>>>    +    stack_set(next->arch.stack);
>>> +
>>>        prev = __context_switch(prev, next);
>>>          schedule_tail(prev);
>>> diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
>>> index 69f817d1e68a..6b5aaf1eb3ff 100644
>>> --- a/xen/arch/arm/include/asm/page.h
>>> +++ b/xen/arch/arm/include/asm/page.h
>>> @@ -7,6 +7,8 @@
>>>    #include <asm/lpae.h>
>>>    #include <asm/sysregs.h>
>>>    +#define INVALID_PER_CPU_OFFSET            _AC(0x1, UL)
>>> +
>>>    /* Shareability values for the LPAE entries */
>>>    #define LPAE_SH_NON_SHAREABLE 0x0
>>>    #define LPAE_SH_UNPREDICTALE  0x1
>>> diff --git a/xen/arch/arm/include/asm/traps.h b/xen/arch/arm/include/asm/traps.h
>>> index 9a60dbf70e5b..13e6997c2643 100644
>>> --- a/xen/arch/arm/include/asm/traps.h
>>> +++ b/xen/arch/arm/include/asm/traps.h
>>> @@ -118,6 +118,14 @@ static inline register_t sign_extend(const struct hsr_dabt dabt, register_t r)
>>>      void finalize_instr_emulation(const struct instr_details *instr);
>>>    +#ifdef CONFIG_DEBUG
>>> +void stack_check_init(void);
>>> +void stack_set(unsigned char *base);
>>> +#else
>>> +static inline void stack_check_init(void) { }
>>> +static inline void stack_set(unsigned char *base) { }
>>> +#endif
>>> +
>>>    #endif /* __ASM_ARM_TRAPS__ */
>>>    /*
>>>     * Local variables:
>>> diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
>>> index 0c2fdaceaf21..07d07feff602 100644
>>> --- a/xen/arch/arm/setup.c
>>> +++ b/xen/arch/arm/setup.c
>>> @@ -47,6 +47,7 @@
>>>    #include <asm/setup.h>
>>>    #include <xsm/xsm.h>
>>>    #include <asm/acpi.h>
>>> +#include <asm/traps.h>
>>>      struct bootinfo __initdata bootinfo = BOOTINFO_INIT;
>>>    @@ -733,6 +734,8 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
>>>        percpu_init_areas();
>>>        set_processor_id(0); /* needed early, for smp_processor_id() */
>>>    +    stack_check_init();
>>> +
>>>        /* Initialize traps early allow us to get backtrace when an error occurred */
>>>        init_traps();
>>>    @@ -924,6 +927,7 @@ void asmlinkage __init start_xen(unsigned long boot_phys_offset,
>>>         * since the static one we're running on is about to be freed. */
>>>        memcpy(idle_vcpu[0]->arch.cpu_info, get_cpu_info(),
>>>               sizeof(struct cpu_info));
>>> +    stack_set(idle_vcpu[0]->arch.stack);
>>>        switch_stack_and_jump(idle_vcpu[0]->arch.cpu_info, init_done);
>>>    }
>>>    diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
>>> index 04e363088d60..1c689f2caed7 100644
>>> --- a/xen/arch/arm/smpboot.c
>>> +++ b/xen/arch/arm/smpboot.c
>>> @@ -30,6 +30,7 @@
>>>    #include <asm/psci.h>
>>>    #include <asm/acpi.h>
>>>    #include <asm/tee/tee.h>
>>> +#include <asm/traps.h>
>>>      /* Override macros from asm/page.h to make them work with mfn_t */
>>>    #undef virt_to_mfn
>>> @@ -329,6 +330,8 @@ void asmlinkage start_secondary(void)
>>>          set_processor_id(cpuid);
>>>    +    stack_check_init();
>>> +
>>>        identify_cpu(&current_cpu_data);
>>>        processor_setup();
>>>    diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
>>> index aac6c599f878..b4890eec7ec4 100644
>>> --- a/xen/arch/arm/traps.c
>>> +++ b/xen/arch/arm/traps.c
>>> @@ -2325,6 +2325,51 @@ void asmlinkage leave_hypervisor_to_guest(void)
>>>            arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2_FID, 0, NULL);
>>>    }
>>>    +#ifdef CONFIG_DEBUG
>>
>> Loooking at the code below. Aside the stack pointer part, nothing seems specific to Arm. So maybe this could be moved to common code?
> 
> Yes, I suppose so.
> 
>>
>>> +DEFINE_PER_CPU(unsigned int, stack_check_nesting);
>>> +DEFINE_PER_CPU(unsigned char *, stack_base);
>>
>> I think this could be "const unsigned char *" as the stack should not be modified directly.
> 
> Every time there's a vcpu context switch we will have a new stack.

I am not sure I follow. "const unsigned char *" should still allow you 
to update stack_base. It will just prevent anyone to try to write to 
modify the stack via stack_base.

> 
>>
>>> +
>>> +void __attribute__((no_instrument_function)) stack_set(unsigned char *base)
>>> +{
>>> +    this_cpu(stack_base) = base;
>>> +}
>>> +
>>> +void __init __attribute__((no_instrument_function)) stack_check_init(void)
>>> +{
>>> +    this_cpu(stack_check_nesting) = 0;
>>> +    stack_set(init_data.stack);
>>> +}
>>> +
>>> +__attribute__((no_instrument_function))
>>> +void __cyg_profile_func_enter(void *this_fn, void *call_site)
>>> +{
>>> +    unsigned char *sp;
>>> +
>>> +    if ( get_per_cpu_offset() == INVALID_PER_CPU_OFFSET )
>>> +        return;
>>> +
>>> +    asm volatile ("mov %0, sp" : "=r" (sp) );
>>> +
>>> +    if ( sp < this_cpu(stack_base) ||
>>> +         sp > (this_cpu(stack_base) + STACK_SIZE) )
>>
>> The top of the stack is used to store struct cpu_info. So you want to substract its size (see arch_vcpu_create()).
> 
> Will do.
> 
>>
>>> +    {
>>> +        if ( this_cpu(stack_check_nesting) )
>>> +            return;
>>> +
>>> +        this_cpu(stack_check_nesting)++;
>>
>> Looking at the use, it only seems to be used as "print/panic once". So maybe use a bool to avoid any overflow.
> 
> It will only ever be incremented once. I'll still change it to a bool,
> this should make it more obvious.
> 
>>
>>> +        printk("CPU %d stack pointer out of bounds (sp %#lx, stack base %#lx)\n",
>>> +               smp_processor_id(), (uintptr_t)sp,
>>> +               (uintptr_t)this_cpu(stack_base));
>>> +        BUG();
>>
>> I would consider to call panic().
> 
> panic() alone doesn't show the stack trace / call trace.

Ah good point. But TBH, I have never really understood why panic() 
didn't return a call stack. There are a few places where I found 
beneficial when debugging.

Anyway, I guess this could be handled separately.

> 
>> But is it safe to call any of this if we blew the stack?
> 
> Nope, it sure isn't!
> 
>> IOW, should we have a buffer?
> 
> Yes. After some experimentation, I found that this printk and a WARN
> (similar to BUG, but resumes execution and allows me to collect these
> metrics) uses approximately 1632 bytes of stack. Assuming BUG uses a
> similar amount of stack as WARN, and adding in a comfortable margin for
> error, I'll add a 4096 byte buffer (i.e. invoke the print/BUG with 4096
> bytes remaining on the stack).

AFAICT, the stack on Arm is 32KB. So we 1/8 of the stack as a buffer. Do 
you know the current stack use in a normal setup (e.g. boot a guest)?

Anyway, so long the feature is not enabled in production, then it might 
be ok to steal 4KB. We could increase the stack if we see any issue.

Cheers,
Stewart Hildebrand July 30, 2024, 8:12 p.m. UTC | #10
On 7/30/24 14:07, Julien Grall wrote:
> Hi,
> 
> On 30/07/2024 18:50, Stewart Hildebrand wrote:
>> On 7/29/24 14:36, Julien Grall wrote:
>>> Hi Stewart,
>>>
>>> On 29/07/2024 15:24, Stewart Hildebrand wrote:
>>>> +DEFINE_PER_CPU(unsigned int, stack_check_nesting);
>>>> +DEFINE_PER_CPU(unsigned char *, stack_base);
>>>
>>> I think this could be "const unsigned char *" as the stack should not be modified directly.
>>
>> Every time there's a vcpu context switch we will have a new stack.
> 
> I am not sure I follow. "const unsigned char *" should still allow you to update stack_base. It will just prevent anyone to try to write to modify the stack via stack_base.

Ah, of course. You're right. I'll change it to "const unsigned char *".

> 
>>
>>>
>>>> +
>>>> +void __attribute__((no_instrument_function)) stack_set(unsigned char *base)
>>>> +{
>>>> +    this_cpu(stack_base) = base;
>>>> +}
>>>> +
>>>> +void __init __attribute__((no_instrument_function)) stack_check_init(void)
>>>> +{
>>>> +    this_cpu(stack_check_nesting) = 0;
>>>> +    stack_set(init_data.stack);
>>>> +}
>>>> +
>>>> +__attribute__((no_instrument_function))
>>>> +void __cyg_profile_func_enter(void *this_fn, void *call_site)
>>>> +{
>>>> +    unsigned char *sp;
>>>> +
>>>> +    if ( get_per_cpu_offset() == INVALID_PER_CPU_OFFSET )
>>>> +        return;
>>>> +
>>>> +    asm volatile ("mov %0, sp" : "=r" (sp) );
>>>> +
>>>> +    if ( sp < this_cpu(stack_base) ||
>>>> +         sp > (this_cpu(stack_base) + STACK_SIZE) )
>>>
>>> The top of the stack is used to store struct cpu_info. So you want to substract its size (see arch_vcpu_create()).
>>
>> Will do.
>>
>>>
>>>> +    {
>>>> +        if ( this_cpu(stack_check_nesting) )
>>>> +            return;
>>>> +
>>>> +        this_cpu(stack_check_nesting)++;
>>>
>>> Looking at the use, it only seems to be used as "print/panic once". So maybe use a bool to avoid any overflow.
>>
>> It will only ever be incremented once. I'll still change it to a bool,
>> this should make it more obvious.
>>
>>>
>>>> +        printk("CPU %d stack pointer out of bounds (sp %#lx, stack base %#lx)\n",
>>>> +               smp_processor_id(), (uintptr_t)sp,
>>>> +               (uintptr_t)this_cpu(stack_base));
>>>> +        BUG();
>>>
>>> I would consider to call panic().
>>
>> panic() alone doesn't show the stack trace / call trace.
> 
> Ah good point. But TBH, I have never really understood why panic() didn't return a call stack. There are a few places where I found beneficial when debugging.
> 
> Anyway, I guess this could be handled separately.
> 
>>
>>> But is it safe to call any of this if we blew the stack?
>>
>> Nope, it sure isn't!
>>
>>> IOW, should we have a buffer?
>>
>> Yes. After some experimentation, I found that this printk and a WARN
>> (similar to BUG, but resumes execution and allows me to collect these
>> metrics) uses approximately 1632 bytes of stack. Assuming BUG uses a
>> similar amount of stack as WARN, and adding in a comfortable margin for
>> error, I'll add a 4096 byte buffer (i.e. invoke the print/BUG with 4096
>> bytes remaining on the stack).
> 
> AFAICT, the stack on Arm is 32KB. So we 1/8 of the stack as a buffer. Do you know the current stack use in a normal setup (e.g. boot a guest)?

In my particular test case simply booting a dom0, it uses about 14k of
stack. Of course this could vary with booting dom0less domUs, complexity
of device tree parsing, etc.

> 
> Anyway, so long the feature is not enabled in production, then it might be ok to steal 4KB. We could increase the stack if we see any issue.
> 
> Cheers,
>
diff mbox series

Patch

diff --git a/xen/arch/arm/arch.mk b/xen/arch/arm/arch.mk
index 022dcda19224..c39cb65d183d 100644
--- a/xen/arch/arm/arch.mk
+++ b/xen/arch/arm/arch.mk
@@ -12,6 +12,7 @@  CFLAGS-$(CONFIG_ARM_32) += -mno-unaligned-access
 CFLAGS-$(CONFIG_ARM_64) += -mcpu=generic
 CFLAGS-$(CONFIG_ARM_64) += -mgeneral-regs-only # No fp registers etc
 $(call cc-option-add,CFLAGS-$(CONFIG_ARM_64),CC,-mno-outline-atomics)
+CFLAGS-$(CONFIG_DEBUG) += -finstrument-functions
 
 ifneq ($(filter command line environment,$(origin CONFIG_EARLY_PRINTK)),)
     $(error You must use 'make menuconfig' to enable/disable early printk now)
diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
index 2fa07dc3a04b..4a332b9cbc7c 100644
--- a/xen/arch/arm/arm64/head.S
+++ b/xen/arch/arm/arm64/head.S
@@ -399,6 +399,10 @@  FUNC_LOCAL(cpu_init)
          * than SP_EL0.
          */
         msr spsel, #1
+
+        ldr   x0, =INVALID_PER_CPU_OFFSET
+        msr   tpidr_el2, x0
+
         ret
 END(cpu_init)
 
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 7cfcefd27944..93ebe6e5f8af 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -26,6 +26,7 @@ 
 #include <asm/procinfo.h>
 #include <asm/regs.h>
 #include <asm/tee/tee.h>
+#include <asm/traps.h>
 #include <asm/vfp.h>
 #include <asm/vgic.h>
 #include <asm/vtimer.h>
@@ -328,6 +329,8 @@  void context_switch(struct vcpu *prev, struct vcpu *next)
 
     set_current(next);
 
+    stack_set(next->arch.stack);
+
     prev = __context_switch(prev, next);
 
     schedule_tail(prev);
diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
index 69f817d1e68a..6b5aaf1eb3ff 100644
--- a/xen/arch/arm/include/asm/page.h
+++ b/xen/arch/arm/include/asm/page.h
@@ -7,6 +7,8 @@ 
 #include <asm/lpae.h>
 #include <asm/sysregs.h>
 
+#define INVALID_PER_CPU_OFFSET            _AC(0x1, UL)
+
 /* Shareability values for the LPAE entries */
 #define LPAE_SH_NON_SHAREABLE 0x0
 #define LPAE_SH_UNPREDICTALE  0x1
diff --git a/xen/arch/arm/include/asm/traps.h b/xen/arch/arm/include/asm/traps.h
index 9a60dbf70e5b..13e6997c2643 100644
--- a/xen/arch/arm/include/asm/traps.h
+++ b/xen/arch/arm/include/asm/traps.h
@@ -118,6 +118,14 @@  static inline register_t sign_extend(const struct hsr_dabt dabt, register_t r)
 
 void finalize_instr_emulation(const struct instr_details *instr);
 
+#ifdef CONFIG_DEBUG
+void stack_check_init(void);
+void stack_set(unsigned char *base);
+#else
+static inline void stack_check_init(void) { }
+static inline void stack_set(unsigned char *base) { }
+#endif
+
 #endif /* __ASM_ARM_TRAPS__ */
 /*
  * Local variables:
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index 0c2fdaceaf21..07d07feff602 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -47,6 +47,7 @@ 
 #include <asm/setup.h>
 #include <xsm/xsm.h>
 #include <asm/acpi.h>
+#include <asm/traps.h>
 
 struct bootinfo __initdata bootinfo = BOOTINFO_INIT;
 
@@ -733,6 +734,8 @@  void asmlinkage __init start_xen(unsigned long boot_phys_offset,
     percpu_init_areas();
     set_processor_id(0); /* needed early, for smp_processor_id() */
 
+    stack_check_init();
+
     /* Initialize traps early allow us to get backtrace when an error occurred */
     init_traps();
 
@@ -924,6 +927,7 @@  void asmlinkage __init start_xen(unsigned long boot_phys_offset,
      * since the static one we're running on is about to be freed. */
     memcpy(idle_vcpu[0]->arch.cpu_info, get_cpu_info(),
            sizeof(struct cpu_info));
+    stack_set(idle_vcpu[0]->arch.stack);
     switch_stack_and_jump(idle_vcpu[0]->arch.cpu_info, init_done);
 }
 
diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
index 04e363088d60..1c689f2caed7 100644
--- a/xen/arch/arm/smpboot.c
+++ b/xen/arch/arm/smpboot.c
@@ -30,6 +30,7 @@ 
 #include <asm/psci.h>
 #include <asm/acpi.h>
 #include <asm/tee/tee.h>
+#include <asm/traps.h>
 
 /* Override macros from asm/page.h to make them work with mfn_t */
 #undef virt_to_mfn
@@ -329,6 +330,8 @@  void asmlinkage start_secondary(void)
 
     set_processor_id(cpuid);
 
+    stack_check_init();
+
     identify_cpu(&current_cpu_data);
     processor_setup();
 
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index aac6c599f878..b4890eec7ec4 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -2325,6 +2325,51 @@  void asmlinkage leave_hypervisor_to_guest(void)
         arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2_FID, 0, NULL);
 }
 
+#ifdef CONFIG_DEBUG
+DEFINE_PER_CPU(unsigned int, stack_check_nesting);
+DEFINE_PER_CPU(unsigned char *, stack_base);
+
+void __attribute__((no_instrument_function)) stack_set(unsigned char *base)
+{
+    this_cpu(stack_base) = base;
+}
+
+void __init __attribute__((no_instrument_function)) stack_check_init(void)
+{
+    this_cpu(stack_check_nesting) = 0;
+    stack_set(init_data.stack);
+}
+
+__attribute__((no_instrument_function))
+void __cyg_profile_func_enter(void *this_fn, void *call_site)
+{
+    unsigned char *sp;
+
+    if ( get_per_cpu_offset() == INVALID_PER_CPU_OFFSET )
+        return;
+
+    asm volatile ("mov %0, sp" : "=r" (sp) );
+
+    if ( sp < this_cpu(stack_base) ||
+         sp > (this_cpu(stack_base) + STACK_SIZE) )
+    {
+        if ( this_cpu(stack_check_nesting) )
+            return;
+
+        this_cpu(stack_check_nesting)++;
+        printk("CPU %d stack pointer out of bounds (sp %#lx, stack base %#lx)\n",
+               smp_processor_id(), (uintptr_t)sp,
+               (uintptr_t)this_cpu(stack_base));
+        BUG();
+    }
+}
+
+__attribute__((no_instrument_function))
+void __cyg_profile_func_exit(void *this_fn, void *call_site)
+{
+}
+#endif
+
 /*
  * Local variables:
  * mode: C