
arm64: kernel: Use a separate stack for irq interrupts.

Message ID 1441636584-23174-1-git-send-email-james.morse@arm.com (mailing list archive)
State New, archived

Commit Message

James Morse Sept. 7, 2015, 2:36 p.m. UTC
Having to handle interrupts on top of an existing kernel stack means the
kernel stack must be large enough to accommodate both the maximum kernel
usage, and the maximum irq handler usage. Switching to a different stack
when processing irqs allows us to make the stack size smaller.

Maximum kernel stack usage (running ltp and generating usb+ethernet
interrupts) was 7256 bytes. With this patch, the same workload gives
a maximum stack usage of 5816 bytes.

Signed-off-by: James Morse <james.morse@arm.com>
---
 arch/arm64/include/asm/irq.h         | 12 +++++++++
 arch/arm64/include/asm/thread_info.h |  8 ++++--
 arch/arm64/kernel/entry.S            | 33 ++++++++++++++++++++---
 arch/arm64/kernel/irq.c              | 52 ++++++++++++++++++++++++++++++++++++
 arch/arm64/kernel/smp.c              |  4 +++
 arch/arm64/kernel/stacktrace.c       |  4 ++-
 6 files changed, 107 insertions(+), 6 deletions(-)

Comments

Jungseok Lee Sept. 7, 2015, 3:48 p.m. UTC | #1
On Sep 7, 2015, at 11:36 PM, James Morse wrote:

Hi James,

> Having to handle interrupts on top of an existing kernel stack means the
> kernel stack must be large enough to accommodate both the maximum kernel
> usage, and the maximum irq handler usage. Switching to a different stack
> when processing irqs allows us to make the stack size smaller.
> 
> Maximum kernel stack usage (running ltp and generating usb+ethernet
> interrupts) was 7256 bytes. With this patch, the same workload gives
> a maximum stack usage of 5816 bytes.

I'd like to know how to measure the max stack depth.
AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack
region and find or track down an untouched region? 

I will leave comments after reading and playing with this change carefully.

Best Regards
Jungseok Lee

> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> arch/arm64/include/asm/irq.h         | 12 +++++++++
> arch/arm64/include/asm/thread_info.h |  8 ++++--
> arch/arm64/kernel/entry.S            | 33 ++++++++++++++++++++---
> arch/arm64/kernel/irq.c              | 52 ++++++++++++++++++++++++++++++++++++
> arch/arm64/kernel/smp.c              |  4 +++
> arch/arm64/kernel/stacktrace.c       |  4 ++-
> 6 files changed, 107 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
> index bbb251b14746..050d4196c736 100644
> --- a/arch/arm64/include/asm/irq.h
> +++ b/arch/arm64/include/asm/irq.h
> @@ -2,14 +2,20 @@
> #define __ASM_IRQ_H
> 
> #include <linux/irqchip/arm-gic-acpi.h>
> +#include <linux/percpu.h>
> 
> #include <asm-generic/irq.h>
> +#include <asm/thread_info.h>
> +
> +DECLARE_PER_CPU(unsigned long, irq_sp);
> 
> struct pt_regs;
> 
> extern void migrate_irqs(void);
> extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
> 
> +extern int alloc_irq_stack(unsigned int cpu);
> +
> static inline void acpi_irq_init(void)
> {
> 	/*
> @@ -21,4 +27,10 @@ static inline void acpi_irq_init(void)
> }
> #define acpi_irq_init acpi_irq_init
> 
> +static inline bool is_irq_stack(unsigned long sp)
> +{
> +	struct thread_info *ti = get_thread_info(sp);
> +	return (get_thread_info(per_cpu(irq_sp, ti->cpu)) == ti);
> +}
> +
> #endif
> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> index dcd06d18a42a..b906254fc400 100644
> --- a/arch/arm64/include/asm/thread_info.h
> +++ b/arch/arm64/include/asm/thread_info.h
> @@ -69,12 +69,16 @@ register unsigned long current_stack_pointer asm ("sp");
> /*
>  * how to get the thread information struct from C
>  */
> +static inline struct thread_info *get_thread_info(unsigned long sp)
> +{
> +	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
> +}
> +
> static inline struct thread_info *current_thread_info(void) __attribute_const__;
> 
> static inline struct thread_info *current_thread_info(void)
> {
> -	return (struct thread_info *)
> -		(current_stack_pointer & ~(THREAD_SIZE - 1));
> +	return get_thread_info(current_stack_pointer);
> }
> 
> #define thread_saved_pc(tsk)	\
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index e16351819fed..d42371f3f5a1 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -190,10 +190,37 @@ tsk	.req	x28		// current thread_info
>  * Interrupt handling.
>  */
> 	.macro	irq_handler
> -	adrp	x1, handle_arch_irq
> -	ldr	x1, [x1, #:lo12:handle_arch_irq]
> -	mov	x0, sp
> +	mrs	x21, tpidr_el1
> +	adr_l	x20, irq_sp
> +	add	x20, x20, x21
> +
> +	ldr	x21, [x20]
> +	mov	x20, sp
> +
> +	mov	x0, x21
> +	mov	x1, x20
> +	bl	irq_copy_thread_info
> +
> +	/* test for recursive use of irq_sp */
> +	cbz	w0, 1f
> +	mrs	x30, elr_el1
> +	mov	sp, x21
> +
> +	/*
> +	 * Create a fake stack frame to bump unwind_frame() onto the original
> +	 * stack. This relies on x29 not being clobbered by kernel_entry().
> +	 */
> +	push	x29, x30
> +
> +1:	ldr_l	x1, handle_arch_irq
> +	mov     x0, x20
> 	blr	x1
> +
> +	mov	x0, x20
> +	mov	x1, x21
> +	bl	irq_copy_thread_info
> +	mov	sp, x20
> +
> 	.endm
> 
> 	.text
> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
> index 463fa2e7e34c..10b57a006da8 100644
> --- a/arch/arm64/kernel/irq.c
> +++ b/arch/arm64/kernel/irq.c
> @@ -26,11 +26,14 @@
> #include <linux/smp.h>
> #include <linux/init.h>
> #include <linux/irqchip.h>
> +#include <linux/percpu.h>
> #include <linux/seq_file.h>
> #include <linux/ratelimit.h>
> 
> unsigned long irq_err_count;
> 
> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0;
> +
> int arch_show_interrupts(struct seq_file *p, int prec)
> {
> #ifdef CONFIG_SMP
> @@ -55,6 +58,10 @@ void __init init_IRQ(void)
> 	irqchip_init();
> 	if (!handle_arch_irq)
> 		panic("No interrupt controller found.");
> +
> +	/* Allocate an irq stack for the boot cpu */
> +	if (alloc_irq_stack(smp_processor_id()))
> +		panic("Failed to allocate irq stack for boot cpu.");
> }
> 
> #ifdef CONFIG_HOTPLUG_CPU
> @@ -117,3 +124,48 @@ void migrate_irqs(void)
> 	local_irq_restore(flags);
> }
> #endif /* CONFIG_HOTPLUG_CPU */
> +
> +/* Allocate an irq_stack for a cpu that is about to be brought up. */
> +int alloc_irq_stack(unsigned int cpu)
> +{
> +	struct page *irq_stack_page;
> +	union thread_union *irq_stack;
> +
> +	/* reuse stack allocated previously */
> +	if (per_cpu(irq_sp, cpu))
> +		return 0;
> +
> +	irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
> +	if (!irq_stack_page) {
> +		pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n",
> +			smp_processor_id(), cpu);
> +		return -ENOMEM;
> +	}
> +	irq_stack = page_address(irq_stack_page);
> +
> +	per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack
> +			       + THREAD_START_SP;
> +
> +	return 0;
> +}
> +
> +/*
> + * Copy struct thread_info between two stacks, and update current->stack.
> + * This is used when moving to the irq exception stack.
> + * Changing current->stack is necessary so that non-arch thread_info writers
> + * don't use the new thread_info->task->stack to find the old thread_info when
> + * setting flags like TIF_NEED_RESCHED...
> + */
> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp)
> +{
> +	struct thread_info *src = get_thread_info(src_sp);
> +	struct thread_info *dst = get_thread_info(dst_sp);
> +
> +	if (dst == src)
> +		return 0;
> +
> +	*dst = *src;
> +	current->stack = dst;
> +
> +	return 1;
> +}
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 50fb4696654e..5283dc5629e4 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -89,6 +89,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
> {
> 	int ret;
> 
> +	ret = alloc_irq_stack(cpu);
> +	if (ret)
> +		return ret;
> +
> 	/*
> 	 * We need to tell the secondary core where to find its stack and the
> 	 * page tables.
> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> index 407991bf79f5..3d6d5b45aa4b 100644
> --- a/arch/arm64/kernel/stacktrace.c
> +++ b/arch/arm64/kernel/stacktrace.c
> @@ -20,6 +20,7 @@
> #include <linux/sched.h>
> #include <linux/stacktrace.h>
> 
> +#include <asm/irq.h>
> #include <asm/stacktrace.h>
> 
> /*
> @@ -43,7 +44,8 @@ int notrace unwind_frame(struct stackframe *frame)
> 	low  = frame->sp;
> 	high = ALIGN(low, THREAD_SIZE);
> 
> -	if (fp < low || fp > high - 0x18 || fp & 0xf)
> +	if ((fp < low || fp > high - 0x18 || fp & 0xf) &&
> +	    !is_irq_stack(frame->sp))
> 		return -EINVAL;
> 
> 	frame->sp = fp + 0x10;
> -- 
> 2.1.4
>
James Morse Sept. 7, 2015, 4:06 p.m. UTC | #2
On 07/09/15 16:48, Jungseok Lee wrote:
> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
> 
> Hi James,
> 
>> Having to handle interrupts on top of an existing kernel stack means the
>> kernel stack must be large enough to accommodate both the maximum kernel
>> usage, and the maximum irq handler usage. Switching to a different stack
>> when processing irqs allows us to make the stack size smaller.
>>
>> Maximum kernel stack usage (running ltp and generating usb+ethernet
>> interrupts) was 7256 bytes. With this patch, the same workload gives
>> a maximum stack usage of 5816 bytes.
> 
> I'd like to know how to measure the max stack depth.
> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack
> region and find or track down an untouched region? 

I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' ->
'Tracers', then looked in debugfs:/tracing/stack_max_size.

What problems did you encounter?
(I may be missing something...)


Thanks,

James
Jungseok Lee Sept. 7, 2015, 4:34 p.m. UTC | #3
On Sep 8, 2015, at 1:06 AM, James Morse wrote:
> On 07/09/15 16:48, Jungseok Lee wrote:
>> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
>> 
>> Hi James,
>> 
>>> Having to handle interrupts on top of an existing kernel stack means the
>>> kernel stack must be large enough to accommodate both the maximum kernel
>>> usage, and the maximum irq handler usage. Switching to a different stack
>>> when processing irqs allows us to make the stack size smaller.
>>> 
>>> Maximum kernel stack usage (running ltp and generating usb+ethernet
>>> interrupts) was 7256 bytes. With this patch, the same workload gives
>>> a maximum stack usage of 5816 bytes.
>> 
>> I'd like to know how to measure the max stack depth.
>> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack
>> region and find or track down an untouched region? 
> 
> I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' ->
> 'Tracers', then looked in debugfs:/tracing/stack_max_size.
> 
> What problems did you encounter?
> (I may be missing something…)

When I enabled the feature, all entries had *0* size except the last entry.
It can be reproduced easily by looking in debugfs:/tracing/stack_trace.

You can track down my report and Akashi's changes with the following links:
- http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/354126.html
- https://lkml.org/lkml/2015/7/13/29

Although it is impossible to measure an exact depth at this moment, the feature
could be utilized to check improvement.

Cc'ing Akashi for additional comments if needed.

Best Regards
Jungseok Lee
AKASHI Takahiro Sept. 8, 2015, 1:45 a.m. UTC | #4
Jungseok,

On 09/08/2015 01:34 AM, Jungseok Lee wrote:
> On Sep 8, 2015, at 1:06 AM, James Morse wrote:
>> On 07/09/15 16:48, Jungseok Lee wrote:
>>> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
>>>
>>> Hi James,
>>>
>>>> Having to handle interrupts on top of an existing kernel stack means the
>>>> kernel stack must be large enough to accommodate both the maximum kernel
>>>> usage, and the maximum irq handler usage. Switching to a different stack
>>>> when processing irqs allows us to make the stack size smaller.
>>>>
>>>> Maximum kernel stack usage (running ltp and generating usb+ethernet
>>>> interrupts) was 7256 bytes. With this patch, the same workload gives
>>>> a maximum stack usage of 5816 bytes.
>>>
>>> I'd like to know how to measure the max stack depth.
>>> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack
>>> region and find or track down an untouched region?
>>
>> I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' ->
>> 'Tracers', then looked in debugfs:/tracing/stack_max_size.
>>
>> What problems did you encounter?
>> (I may be missing something…)
>
> When I enabled the feature, all entries had *0* size except the last entry.
>> It can be reproduced easily by looking in debugfs:/tracing/stack_trace.

I'm afraid that you have not applied one of the patches in my RFC:
http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/355919.html

I have not looked into James' patch in detail, but I hope that it will help
fix one of the issues that have been annoying me: the stack tracer (actually
save_stack_trace()) will miss a function (and, in some cases, its parent
function) that is being executed when an interrupt is taken.


-Takahiro AKASHI

> You can track down my report and Akashi's changes with the following links:
> - http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/354126.html
> - https://lkml.org/lkml/2015/7/13/29
>
> Although it is impossible to measure an exact depth at this moment, the feature
> could be utilized to check improvement.
>
> Cc'ing Akashi for additional comments if needed.
>
> Best Regards
> Jungseok Lee
>
AKASHI Takahiro Sept. 8, 2015, 6:44 a.m. UTC | #5
On 09/08/2015 10:45 AM, AKASHI Takahiro wrote:
> Jungseok,
>
> On 09/08/2015 01:34 AM, Jungseok Lee wrote:
>> On Sep 8, 2015, at 1:06 AM, James Morse wrote:
>>> On 07/09/15 16:48, Jungseok Lee wrote:
>>>> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
>>>>
>>>> Hi James,
>>>>
>>>>> Having to handle interrupts on top of an existing kernel stack means the
>>>>> kernel stack must be large enough to accommodate both the maximum kernel
>>>>> usage, and the maximum irq handler usage. Switching to a different stack
>>>>> when processing irqs allows us to make the stack size smaller.
>>>>>
>>>>> Maximum kernel stack usage (running ltp and generating usb+ethernet
>>>>> interrupts) was 7256 bytes. With this patch, the same workload gives
>>>>> a maximum stack usage of 5816 bytes.
>>>>
>>>> I'd like to know how to measure the max stack depth.
>>>> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack
>>>> region and find or track down an untouched region?
>>>
>>> I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' ->
>>> 'Tracers', then looked in debugfs:/tracing/stack_max_size.
>>>
>>> What problems did you encounter?
>>> (I may be missing something…)
>>
>> When I enabled the feature, all entries had *0* size except the last entry.
>> It can be reproduced easily by looking in debugfs:/tracing/stack_trace.
>
> I'm afraid that you have not applied one of patches in my RFC:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/355919.html
>
> I have not looked into James' patch in details, but hope that it will help
> fix one of issues that are annoying me: Stack tracer (actually save_stack_trace())
> will miss a function (and its parent function in some case) that is being executed
> when an interrupt is taken.

Well, it didn't fix the issue:
# cat /sys/kernel/debug/tracing/stack_trace
         Depth    Size   Location    (54 entries)
         -----    ----   --------
   0)     5096      16   irq_copy_thread_info+0x18/0x70
   1)     5080     336   el1_irq+0x78/0x11c

                                  <<<= _raw_spin_unlock_irqrestore

   2)     4744      48   __skb_recv_datagram+0x148/0x49c
   3)     4696     208   skb_recv_datagram+0x50/0x60
   4)     4488      64   xs_udp_data_ready+0x48/0x170
   5)     4424      96   sock_queue_rcv_skb+0x1fc/0x270
  ...
  53)      344     344   el0_svc_naked+0x20/0x28

__skb_recv_datagram+0x148 is "bl _raw_spin_unlock_irqrestore."

And the frames, #0 and #1, appear here because this patch replaces stack pointers
*after* kernel_entry.

-Takahiro AKASHI

>
> -Takahiro AKASHI
>
>> You can track down my report and Akashi's changes with the following links:
>> - http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/354126.html
>> - https://lkml.org/lkml/2015/7/13/29
>>
>> Although it is impossible to measure an exact depth at this moment, the feature
>> could be utilized to check improvement.
>>
>> Cc'ing Akashi for additional comments if needed.
>>
>> Best Regards
>> Jungseok Lee
>>
AKASHI Takahiro Sept. 8, 2015, 7:51 a.m. UTC | #6
On 09/07/2015 11:36 PM, James Morse wrote:
> Having to handle interrupts on top of an existing kernel stack means the
> kernel stack must be large enough to accommodate both the maximum kernel
> usage, and the maximum irq handler usage. Switching to a different stack
> when processing irqs allows us to make the stack size smaller.
>
> Maximum kernel stack usage (running ltp and generating usb+ethernet
> interrupts) was 7256 bytes. With this patch, the same workload gives
> a maximum stack usage of 5816 bytes.

With stack tracer on, the kernel with this patch ran into BUG() in check_stack():
     if (task_stack_end_corrupted(current))
         BUG();

This is because the "object_is_on_stack(stack)" check passes for the interrupt
stack, while your irq stack doesn't have STACK_END_MAGIC written at its end.
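
For reference, a minimal sketch (not part of this patch, helper name is hypothetical)
of writing the same end-of-stack canary that set_task_stack_end_magic() writes for
task stacks; the open-coded end_of_stack() assumes the layout of this era, where
thread_info sits at the base of the stack:

    #include <linux/magic.h>	/* STACK_END_MAGIC */
    #include <linux/sched.h>	/* union thread_union */

    /*
     * Hypothetical helper: mark the lowest usable word of the irq stack so
     * that task_stack_end_corrupted(current) does not fire while
     * current->stack points at the irq stack.
     */
    static void set_irq_stack_end_magic(union thread_union *irq_stack)
    {
    	unsigned long *end = (unsigned long *)(&irq_stack->thread_info + 1);

    	*end = STACK_END_MAGIC;
    }

alloc_irq_stack() would call this right after page_address().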

>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
>   arch/arm64/include/asm/irq.h         | 12 +++++++++
>   arch/arm64/include/asm/thread_info.h |  8 ++++--
>   arch/arm64/kernel/entry.S            | 33 ++++++++++++++++++++---
>   arch/arm64/kernel/irq.c              | 52 ++++++++++++++++++++++++++++++++++++
>   arch/arm64/kernel/smp.c              |  4 +++
>   arch/arm64/kernel/stacktrace.c       |  4 ++-
>   6 files changed, 107 insertions(+), 6 deletions(-)
>
> diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
> index bbb251b14746..050d4196c736 100644
> --- a/arch/arm64/include/asm/irq.h
> +++ b/arch/arm64/include/asm/irq.h
> @@ -2,14 +2,20 @@
>   #define __ASM_IRQ_H
>
>   #include <linux/irqchip/arm-gic-acpi.h>
> +#include <linux/percpu.h>
>
>   #include <asm-generic/irq.h>
> +#include <asm/thread_info.h>
> +
> +DECLARE_PER_CPU(unsigned long, irq_sp);
>
>   struct pt_regs;
>
>   extern void migrate_irqs(void);
>   extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
>
> +extern int alloc_irq_stack(unsigned int cpu);
> +
>   static inline void acpi_irq_init(void)
>   {
>   	/*
> @@ -21,4 +27,10 @@ static inline void acpi_irq_init(void)
>   }
>   #define acpi_irq_init acpi_irq_init
>
> +static inline bool is_irq_stack(unsigned long sp)
> +{
> +	struct thread_info *ti = get_thread_info(sp);
> +	return (get_thread_info(per_cpu(irq_sp, ti->cpu)) == ti);
> +}
> +
>   #endif
> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> index dcd06d18a42a..b906254fc400 100644
> --- a/arch/arm64/include/asm/thread_info.h
> +++ b/arch/arm64/include/asm/thread_info.h
> @@ -69,12 +69,16 @@ register unsigned long current_stack_pointer asm ("sp");
>   /*
>    * how to get the thread information struct from C
>    */
> +static inline struct thread_info *get_thread_info(unsigned long sp)
> +{
> +	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
> +}
> +
>   static inline struct thread_info *current_thread_info(void) __attribute_const__;
>
>   static inline struct thread_info *current_thread_info(void)
>   {
> -	return (struct thread_info *)
> -		(current_stack_pointer & ~(THREAD_SIZE - 1));
> +	return get_thread_info(current_stack_pointer);
>   }
>
>   #define thread_saved_pc(tsk)	\
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index e16351819fed..d42371f3f5a1 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -190,10 +190,37 @@ tsk	.req	x28		// current thread_info
>    * Interrupt handling.
>    */
>   	.macro	irq_handler
> -	adrp	x1, handle_arch_irq
> -	ldr	x1, [x1, #:lo12:handle_arch_irq]
> -	mov	x0, sp
> +	mrs	x21, tpidr_el1
> +	adr_l	x20, irq_sp
> +	add	x20, x20, x21
> +
> +	ldr	x21, [x20]
> +	mov	x20, sp
> +
> +	mov	x0, x21
> +	mov	x1, x20
> +	bl	irq_copy_thread_info
> +
> +	/* test for recursive use of irq_sp */
> +	cbz	w0, 1f
> +	mrs	x30, elr_el1
> +	mov	sp, x21
> +
> +	/*
> +	 * Create a fake stack frame to bump unwind_frame() onto the original
> +	 * stack. This relies on x29 not being clobbered by kernel_entry().
> +	 */
> +	push	x29, x30

I don't think that it works. Since unwind_frame() doesn't care about sp, but
only follows fp, a frame record that is merely pushed will never be dereferenced.
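
For reference, the walk in arch/arm64/kernel/stacktrace.c at this point only
ever loads the next frame through the current fp, roughly:

    	frame->sp = fp + 0x10;
    	frame->fp = *(unsigned long *)(fp);
    	frame->pc = *(unsigned long *)(fp + 8);

A record pushed at sp, with x29 never updated to point at it, is therefore
unreachable by the unwinder.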

-Takahiro AKASHI


> +
> +1:	ldr_l	x1, handle_arch_irq
> +	mov     x0, x20
>   	blr	x1
> +
> +	mov	x0, x20
> +	mov	x1, x21
> +	bl	irq_copy_thread_info
> +	mov	sp, x20
> +
>   	.endm
>
>   	.text
> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
> index 463fa2e7e34c..10b57a006da8 100644
> --- a/arch/arm64/kernel/irq.c
> +++ b/arch/arm64/kernel/irq.c
> @@ -26,11 +26,14 @@
>   #include <linux/smp.h>
>   #include <linux/init.h>
>   #include <linux/irqchip.h>
> +#include <linux/percpu.h>
>   #include <linux/seq_file.h>
>   #include <linux/ratelimit.h>
>
>   unsigned long irq_err_count;
>
> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0;
> +
>   int arch_show_interrupts(struct seq_file *p, int prec)
>   {
>   #ifdef CONFIG_SMP
> @@ -55,6 +58,10 @@ void __init init_IRQ(void)
>   	irqchip_init();
>   	if (!handle_arch_irq)
>   		panic("No interrupt controller found.");
> +
> +	/* Allocate an irq stack for the boot cpu */
> +	if (alloc_irq_stack(smp_processor_id()))
> +		panic("Failed to allocate irq stack for boot cpu.");
>   }
>
>   #ifdef CONFIG_HOTPLUG_CPU
> @@ -117,3 +124,48 @@ void migrate_irqs(void)
>   	local_irq_restore(flags);
>   }
>   #endif /* CONFIG_HOTPLUG_CPU */
> +
> +/* Allocate an irq_stack for a cpu that is about to be brought up. */
> +int alloc_irq_stack(unsigned int cpu)
> +{
> +	struct page *irq_stack_page;
> +	union thread_union *irq_stack;
> +
> +	/* reuse stack allocated previously */
> +	if (per_cpu(irq_sp, cpu))
> +		return 0;
> +
> +	irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
> +	if (!irq_stack_page) {
> +		pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n",
> +			smp_processor_id(), cpu);
> +		return -ENOMEM;
> +	}
> +	irq_stack = page_address(irq_stack_page);
> +
> +	per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack
> +			       + THREAD_START_SP;
> +
> +	return 0;
> +}
> +
> +/*
> + * Copy struct thread_info between two stacks, and update current->stack.
> + * This is used when moving to the irq exception stack.
> + * Changing current->stack is necessary so that non-arch thread_info writers
> + * don't use the new thread_info->task->stack to find the old thread_info when
> + * setting flags like TIF_NEED_RESCHED...
> + */
> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp)
> +{
> +	struct thread_info *src = get_thread_info(src_sp);
> +	struct thread_info *dst = get_thread_info(dst_sp);
> +
> +	if (dst == src)
> +		return 0;
> +
> +	*dst = *src;
> +	current->stack = dst;
> +
> +	return 1;
> +}
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 50fb4696654e..5283dc5629e4 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -89,6 +89,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
>   {
>   	int ret;
>
> +	ret = alloc_irq_stack(cpu);
> +	if (ret)
> +		return ret;
> +
>   	/*
>   	 * We need to tell the secondary core where to find its stack and the
>   	 * page tables.
> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> index 407991bf79f5..3d6d5b45aa4b 100644
> --- a/arch/arm64/kernel/stacktrace.c
> +++ b/arch/arm64/kernel/stacktrace.c
> @@ -20,6 +20,7 @@
>   #include <linux/sched.h>
>   #include <linux/stacktrace.h>
>
> +#include <asm/irq.h>
>   #include <asm/stacktrace.h>
>
>   /*
> @@ -43,7 +44,8 @@ int notrace unwind_frame(struct stackframe *frame)
>   	low  = frame->sp;
>   	high = ALIGN(low, THREAD_SIZE);
>
> -	if (fp < low || fp > high - 0x18 || fp & 0xf)
> +	if ((fp < low || fp > high - 0x18 || fp & 0xf) &&
> +	    !is_irq_stack(frame->sp))
>   		return -EINVAL;
>
>   	frame->sp = fp + 0x10;
>
Jungseok Lee Sept. 8, 2015, 2:54 p.m. UTC | #7
On Sep 7, 2015, at 11:36 PM, James Morse wrote:

Hi James,

> Having to handle interrupts on top of an existing kernel stack means the
> kernel stack must be large enough to accommodate both the maximum kernel
> usage, and the maximum irq handler usage. Switching to a different stack
> when processing irqs allows us to make the stack size smaller.
> 
> Maximum kernel stack usage (running ltp and generating usb+ethernet
> interrupts) was 7256 bytes. With this patch, the same workload gives
> a maximum stack usage of 5816 bytes.
> 
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> arch/arm64/include/asm/irq.h         | 12 +++++++++
> arch/arm64/include/asm/thread_info.h |  8 ++++--
> arch/arm64/kernel/entry.S            | 33 ++++++++++++++++++++---
> arch/arm64/kernel/irq.c              | 52 ++++++++++++++++++++++++++++++++++++
> arch/arm64/kernel/smp.c              |  4 +++
> arch/arm64/kernel/stacktrace.c       |  4 ++-
> 6 files changed, 107 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
> index bbb251b14746..050d4196c736 100644
> --- a/arch/arm64/include/asm/irq.h
> +++ b/arch/arm64/include/asm/irq.h
> @@ -2,14 +2,20 @@
> #define __ASM_IRQ_H
> 
> #include <linux/irqchip/arm-gic-acpi.h>
> +#include <linux/percpu.h>
> 
> #include <asm-generic/irq.h>
> +#include <asm/thread_info.h>
> +
> +DECLARE_PER_CPU(unsigned long, irq_sp);
> 
> struct pt_regs;
> 
> extern void migrate_irqs(void);
> extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
> 
> +extern int alloc_irq_stack(unsigned int cpu);
> +
> static inline void acpi_irq_init(void)
> {
> 	/*
> @@ -21,4 +27,10 @@ static inline void acpi_irq_init(void)
> }
> #define acpi_irq_init acpi_irq_init
> 
> +static inline bool is_irq_stack(unsigned long sp)
> +{
> +	struct thread_info *ti = get_thread_info(sp);
> +	return (get_thread_info(per_cpu(irq_sp, ti->cpu)) == ti);
> +}
> +
> #endif
> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> index dcd06d18a42a..b906254fc400 100644
> --- a/arch/arm64/include/asm/thread_info.h
> +++ b/arch/arm64/include/asm/thread_info.h
> @@ -69,12 +69,16 @@ register unsigned long current_stack_pointer asm ("sp");
> /*
>  * how to get the thread information struct from C
>  */
> +static inline struct thread_info *get_thread_info(unsigned long sp)
> +{
> +	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
> +}
> +
> static inline struct thread_info *current_thread_info(void) __attribute_const__;
> 
> static inline struct thread_info *current_thread_info(void)
> {
> -	return (struct thread_info *)
> -		(current_stack_pointer & ~(THREAD_SIZE - 1));
> +	return get_thread_info(current_stack_pointer);
> }
> 
> #define thread_saved_pc(tsk)	\
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index e16351819fed..d42371f3f5a1 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -190,10 +190,37 @@ tsk	.req	x28		// current thread_info
>  * Interrupt handling.
>  */
> 	.macro	irq_handler
> -	adrp	x1, handle_arch_irq
> -	ldr	x1, [x1, #:lo12:handle_arch_irq]
> -	mov	x0, sp
> +	mrs	x21, tpidr_el1
> +	adr_l	x20, irq_sp
> +	add	x20, x20, x21
> +
> +	ldr	x21, [x20]
> +	mov	x20, sp
> +
> +	mov	x0, x21
> +	mov	x1, x20
> +	bl	irq_copy_thread_info
> +
> +	/* test for recursive use of irq_sp */
> +	cbz	w0, 1f
> +	mrs	x30, elr_el1
> +	mov	sp, x21
> +
> +	/*
> +	 * Create a fake stack frame to bump unwind_frame() onto the original
> +	 * stack. This relies on x29 not being clobbered by kernel_entry().
> +	 */
> +	push	x29, x30

Handling the frame pointer in this context is the most challenging item.

As I mentioned previously, a stack tracer on ftrace is not supported yet.
How about decoupling "IRQ Stack feature" from "Stack Tracer Fix"?

> +
> +1:	ldr_l	x1, handle_arch_irq
> +	mov     x0, x20
> 	blr	x1
> +
> +	mov	x0, x20
> +	mov	x1, x21
> +	bl	irq_copy_thread_info
> +	mov	sp, x20
> +
> 	.endm
> 
> 	.text
> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
> index 463fa2e7e34c..10b57a006da8 100644
> --- a/arch/arm64/kernel/irq.c
> +++ b/arch/arm64/kernel/irq.c
> @@ -26,11 +26,14 @@
> #include <linux/smp.h>
> #include <linux/init.h>
> #include <linux/irqchip.h>
> +#include <linux/percpu.h>
> #include <linux/seq_file.h>
> #include <linux/ratelimit.h>
> 
> unsigned long irq_err_count;
> 
> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0;
> +
> int arch_show_interrupts(struct seq_file *p, int prec)
> {
> #ifdef CONFIG_SMP
> @@ -55,6 +58,10 @@ void __init init_IRQ(void)
> 	irqchip_init();
> 	if (!handle_arch_irq)
> 		panic("No interrupt controller found.");
> +
> +	/* Allocate an irq stack for the boot cpu */
> +	if (alloc_irq_stack(smp_processor_id()))
> +		panic("Failed to allocate irq stack for boot cpu.");
> }
> 
> #ifdef CONFIG_HOTPLUG_CPU
> @@ -117,3 +124,48 @@ void migrate_irqs(void)
> 	local_irq_restore(flags);
> }
> #endif /* CONFIG_HOTPLUG_CPU */
> +
> +/* Allocate an irq_stack for a cpu that is about to be brought up. */
> +int alloc_irq_stack(unsigned int cpu)
> +{
> +	struct page *irq_stack_page;
> +	union thread_union *irq_stack;
> +
> +	/* reuse stack allocated previously */
> +	if (per_cpu(irq_sp, cpu))
> +		return 0;

I'd like to avoid even this simple check since CPU hotplug could be heavily
used for power management.

> +
> +	irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
> +	if (!irq_stack_page) {
> +		pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n",
> +			smp_processor_id(), cpu);
> +		return -ENOMEM;
> +	}
> +	irq_stack = page_address(irq_stack_page);
> +
> +	per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack
> +			       + THREAD_START_SP;
> +
> +	return 0;
> +}
> +
> +/*
> + * Copy struct thread_info between two stacks, and update current->stack.
> + * This is used when moving to the irq exception stack.
> + * Changing current->stack is necessary so that non-arch thread_info writers
> + * don't use the new thread_info->task->stack to find the old thread_info when
> + * setting flags like TIF_NEED_RESCHED...
> + */
> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp)
> +{
> +	struct thread_info *src = get_thread_info(src_sp);
> +	struct thread_info *dst = get_thread_info(dst_sp);
> +
> +	if (dst == src)
> +		return 0;
> +
> +	*dst = *src;
> +	current->stack = dst;
> +
> +	return 1;
> +}

Although this is only a 32-byte memcpy (sizeof(struct thread_info)), this function
is called twice to handle a single interrupt. Isn't it too expensive?

This is a major difference between my approach and this patch. I think we should get
some feedback on struct thread_info management in preparation for v2.

Best Regards
Jungseok Lee
Jungseok Lee Sept. 8, 2015, 2:59 p.m. UTC | #8
On Sep 8, 2015, at 10:45 AM, AKASHI Takahiro wrote:
> Jungseok,

Hi Akashi,

> On 09/08/2015 01:34 AM, Jungseok Lee wrote:
>> On Sep 8, 2015, at 1:06 AM, James Morse wrote:
>>> On 07/09/15 16:48, Jungseok Lee wrote:
>>>> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
>>>> 
>>>> Hi James,
>>>> 
>>>>> Having to handle interrupts on top of an existing kernel stack means the
>>>>> kernel stack must be large enough to accommodate both the maximum kernel
>>>>> usage, and the maximum irq handler usage. Switching to a different stack
>>>>> when processing irqs allows us to make the stack size smaller.
>>>>> 
>>>>> Maximum kernel stack usage (running ltp and generating usb+ethernet
>>>>> interrupts) was 7256 bytes. With this patch, the same workload gives
>>>>> a maximum stack usage of 5816 bytes.
>>>> 
>>>> I'd like to know how to measure the max stack depth.
>>>> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack
>>>> region and find or track down an untouched region?
>>> 
>>> I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' ->
>>> 'Tracers', then looked in debugfs:/tracing/stack_max_size.
>>> 
>>> What problems did you encounter?
>>> (I may be missing something…)
>> 
>> When I enabled the feature, all entries had *0* size except the last entry.
>> It can be reproduced easily by looking in debugfs:/tracing/stack_trace.
> 
> I'm afraid that you have not applied one of patches in my RFC:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/355919.html

My description is not clear. Whenever I use a stack tracer, I always put your
RFC and Steve's stack tracer fix on top of my tree.

Best Regards
Jungseok Lee
James Morse Sept. 8, 2015, 4:47 p.m. UTC | #9
On 08/09/15 15:54, Jungseok Lee wrote:
> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
> 
> Hi James,
> 
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index e16351819fed..d42371f3f5a1 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -190,10 +190,37 @@ tsk	.req	x28		// current thread_info
>>  * Interrupt handling.
>>  */
>> 	.macro	irq_handler
>> -	adrp	x1, handle_arch_irq
>> -	ldr	x1, [x1, #:lo12:handle_arch_irq]
>> -	mov	x0, sp
>> +	mrs	x21, tpidr_el1
>> +	adr_l	x20, irq_sp
>> +	add	x20, x20, x21
>> +
>> +	ldr	x21, [x20]
>> +	mov	x20, sp
>> +
>> +	mov	x0, x21
>> +	mov	x1, x20
>> +	bl	irq_copy_thread_info
>> +
>> +	/* test for recursive use of irq_sp */
>> +	cbz	w0, 1f
>> +	mrs	x30, elr_el1
>> +	mov	sp, x21
>> +
>> +	/*
>> +	 * Create a fake stack frame to bump unwind_frame() onto the original
>> +	 * stack. This relies on x29 not being clobbered by kernel_entry().
>> +	 */
>> +	push	x29, x30
> 
> It is the most challenging item to handle a frame pointer in this context.
> 
> As I mentioned previously, a stack tracer on ftrace is not supported yet.
> How about decoupling "IRQ Stack feature" from "Stack Tracer Fix"?

Yes - I discovered today this was more complicated than I thought. I will
need to do some more reading.


>> +
>> +1:	ldr_l	x1, handle_arch_irq
>> +	mov     x0, x20
>> 	blr	x1
>> +
>> +	mov	x0, x20
>> +	mov	x1, x21
>> +	bl	irq_copy_thread_info
>> +	mov	sp, x20
>> +
>> 	.endm
>>
>> 	.text
>> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
>> index 463fa2e7e34c..10b57a006da8 100644
>> --- a/arch/arm64/kernel/irq.c
>> +++ b/arch/arm64/kernel/irq.c
>> @@ -26,11 +26,14 @@
>> #include <linux/smp.h>
>> #include <linux/init.h>
>> #include <linux/irqchip.h>
>> +#include <linux/percpu.h>
>> #include <linux/seq_file.h>
>> #include <linux/ratelimit.h>
>>
>> unsigned long irq_err_count;
>>
>> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0;
>> +
>> int arch_show_interrupts(struct seq_file *p, int prec)
>> {
>> #ifdef CONFIG_SMP
>> @@ -55,6 +58,10 @@ void __init init_IRQ(void)
>> 	irqchip_init();
>> 	if (!handle_arch_irq)
>> 		panic("No interrupt controller found.");
>> +
>> +	/* Allocate an irq stack for the boot cpu */
>> +	if (alloc_irq_stack(smp_processor_id()))
>> +		panic("Failed to allocate irq stack for boot cpu.");
>> }
>>
>> #ifdef CONFIG_HOTPLUG_CPU
>> @@ -117,3 +124,48 @@ void migrate_irqs(void)
>> 	local_irq_restore(flags);
>> }
>> #endif /* CONFIG_HOTPLUG_CPU */
>> +
>> +/* Allocate an irq_stack for a cpu that is about to be brought up. */
>> +int alloc_irq_stack(unsigned int cpu)
>> +{
>> +	struct page *irq_stack_page;
>> +	union thread_union *irq_stack;
>> +
>> +	/* reuse stack allocated previously */
>> +	if (per_cpu(irq_sp, cpu))
>> +		return 0;
> 
> I'd like to avoid even this simple check since CPU hotplug could be heavily
> used for power management.

I don't think it's a problem:
__cpu_up() contains a call to wait_for_completion_timeout() (which could
eventually end up in the scheduler), so I don't think it could ever be on a
'really hot' path.

For really frequent hotplug-like power management, cpu_suspend() makes use
of firmware support to power-off cores - from what I can see it doesn't use
__cpu_up().


>> +
>> +	irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
>> +	if (!irq_stack_page) {
>> +		pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n",
>> +			smp_processor_id(), cpu);
>> +		return -ENOMEM;
>> +	}
>> +	irq_stack = page_address(irq_stack_page);
>> +
>> +	per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack
>> +			       + THREAD_START_SP;
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Copy struct thread_info between two stacks, and update current->stack.
>> + * This is used when moving to the irq exception stack.
>> + * Changing current->stack is necessary so that non-arch thread_info writers
>> + * don't use the new thread_info->task->stack to find the old thread_info when
>> + * setting flags like TIF_NEED_RESCHED...
>> + */
>> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp)
>> +{
>> +	struct thread_info *src = get_thread_info(src_sp);
>> +	struct thread_info *dst = get_thread_info(dst_sp);
>> +
>> +	if (dst == src)
>> +		return 0;
>> +
>> +	*dst = *src;
>> +	current->stack = dst;
>> +
>> +	return 1;
>> +}
> 
> Although this is 32-byte memcpy, sizeof(struct thread_info), this function is called
> twice to handle a single interrupt. Isn't it too expensive?
> 
> This is a major difference between my approach and this patch. I think we should get
> some feedbacks on struct thread_info information management for v2 preparation.

Agreed.

The other difference, as Akashi Takahiro pointed out, is the behaviour of
object_is_on_stack(). What should this return for an object on an irq stack?
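
For reference, object_is_on_stack() in include/linux/sched.h is at this point just
a bounds check against current's stack page, so its answer for an irq-stack object
depends on whether current->stack has been switched over:

    static inline int object_is_on_stack(void *obj)
    {
    	void *stack = task_stack_page(current);

    	return (obj >= stack) && (obj < (stack + THREAD_SIZE));
    }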


Thanks,

James
Jungseok Lee Sept. 9, 2015, 1:22 p.m. UTC | #10
On Sep 9, 2015, at 1:47 AM, James Morse wrote:
> On 08/09/15 15:54, Jungseok Lee wrote:
>> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
>> 
>> Hi James,
>> 
>>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>>> index e16351819fed..d42371f3f5a1 100644
>>> --- a/arch/arm64/kernel/entry.S
>>> +++ b/arch/arm64/kernel/entry.S
>>> @@ -190,10 +190,37 @@ tsk	.req	x28		// current thread_info
>>> * Interrupt handling.
>>> */
>>> 	.macro	irq_handler
>>> -	adrp	x1, handle_arch_irq
>>> -	ldr	x1, [x1, #:lo12:handle_arch_irq]
>>> -	mov	x0, sp
>>> +	mrs	x21, tpidr_el1
>>> +	adr_l	x20, irq_sp
>>> +	add	x20, x20, x21
>>> +
>>> +	ldr	x21, [x20]
>>> +	mov	x20, sp
>>> +
>>> +	mov	x0, x21
>>> +	mov	x1, x20
>>> +	bl	irq_copy_thread_info
>>> +
>>> +	/* test for recursive use of irq_sp */
>>> +	cbz	w0, 1f
>>> +	mrs	x30, elr_el1
>>> +	mov	sp, x21
>>> +
>>> +	/*
>>> +	 * Create a fake stack frame to bump unwind_frame() onto the original
>>> +	 * stack. This relies on x29 not being clobbered by kernel_entry().
>>> +	 */
>>> +	push	x29, x30
>> 
>> It is the most challenging item to handle a frame pointer in this context.
>> 
>> As I mentioned previously, a stack tracer on ftrace is not supported yet.
>> How about decoupling "IRQ Stack feature" from "Stack Tracer Fix"?
> 
> Yes - I discovered today this was more complicated than I thought. I will
> need to do some more reading.
> 
> 
>>> +
>>> +1:	ldr_l	x1, handle_arch_irq
>>> +	mov     x0, x20
>>> 	blr	x1
>>> +
>>> +	mov	x0, x20
>>> +	mov	x1, x21
>>> +	bl	irq_copy_thread_info
>>> +	mov	sp, x20
>>> +
>>> 	.endm
>>> 
>>> 	.text
>>> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
>>> index 463fa2e7e34c..10b57a006da8 100644
>>> --- a/arch/arm64/kernel/irq.c
>>> +++ b/arch/arm64/kernel/irq.c
>>> @@ -26,11 +26,14 @@
>>> #include <linux/smp.h>
>>> #include <linux/init.h>
>>> #include <linux/irqchip.h>
>>> +#include <linux/percpu.h>
>>> #include <linux/seq_file.h>
>>> #include <linux/ratelimit.h>
>>> 
>>> unsigned long irq_err_count;
>>> 
>>> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0;
>>> +
>>> int arch_show_interrupts(struct seq_file *p, int prec)
>>> {
>>> #ifdef CONFIG_SMP
>>> @@ -55,6 +58,10 @@ void __init init_IRQ(void)
>>> 	irqchip_init();
>>> 	if (!handle_arch_irq)
>>> 		panic("No interrupt controller found.");
>>> +
>>> +	/* Allocate an irq stack for the boot cpu */
>>> +	if (alloc_irq_stack(smp_processor_id()))
>>> +		panic("Failed to allocate irq stack for boot cpu.");
>>> }
>>> 
>>> #ifdef CONFIG_HOTPLUG_CPU
>>> @@ -117,3 +124,48 @@ void migrate_irqs(void)
>>> 	local_irq_restore(flags);
>>> }
>>> #endif /* CONFIG_HOTPLUG_CPU */
>>> +
>>> +/* Allocate an irq_stack for a cpu that is about to be brought up. */
>>> +int alloc_irq_stack(unsigned int cpu)
>>> +{
>>> +	struct page *irq_stack_page;
>>> +	union thread_union *irq_stack;
>>> +
>>> +	/* reuse stack allocated previously */
>>> +	if (per_cpu(irq_sp, cpu))
>>> +		return 0;
>> 
>> I'd like to avoid even this simple check since CPU hotplug could be heavily
>> used for power management.
> 
> I don't think it's a problem:
> __cpu_up() contains a call to wait_for_completion_timeout() (which could
> eventually end up in the scheduler), so I don't think it could ever be on a
> 'really hot' path.
> 
> For really frequent hotplug-like power management, cpu_suspend() makes use
> of firmware support to power-off cores - from what I can see it doesn't use
> __cpu_up().

On some platforms, CPU hotplug is triggered via sysfs for power management
based on user data. What is the advantage of putting the stack allocation into this path?

IRQ stack allocation is a critical one-shot operation, so there would be no issue
in giving this work to the booting core.

>>> +
>>> +	irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
>>> +	if (!irq_stack_page) {
>>> +		pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n",
>>> +			smp_processor_id(), cpu);
>>> +		return -ENOMEM;
>>> +	}
>>> +	irq_stack = page_address(irq_stack_page);

I forgot to leave a comment here. How about using __get_free_pages()? It returns an
address instead of a page.

>>> +
>>> +	per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack
>>> +			       + THREAD_START_SP;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +/*
>>> + * Copy struct thread_info between two stacks, and update current->stack.
>>> + * This is used when moving to the irq exception stack.
>>> + * Changing current->stack is necessary so that non-arch thread_info writers
>>> + * don't use the new thread_info->task->stack to find the old thread_info when
>>> + * setting flags like TIF_NEED_RESCHED...
>>> + */
>>> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp)
>>> +{
>>> +	struct thread_info *src = get_thread_info(src_sp);
>>> +	struct thread_info *dst = get_thread_info(dst_sp);
>>> +
>>> +	if (dst == src)
>>> +		return 0;
>>> +
>>> +	*dst = *src;
>>> +	current->stack = dst;
>>> +
>>> +	return 1;
>>> +}
>> 
>> Although this is 32-byte memcpy, sizeof(struct thread_info), this function is called
>> twice to handle a single interrupt. Isn's it too expensive?
>> 
>> This is a major difference between my approach and this patch. I think we should get
>> some feedbacks on struct thread_info information management for v2 preparation.
> 
> Agreed.
> 
> The other difference, as Akashi Takahiro pointed out, is the behaviour of
> object_is_on_stack(). What should this return for an object on an irq stack?

0 should be returned in that case to align with x86 behavior according to check_stack()
context and the commit, 81520a1b0649d0.

Best Regards
Jungseok Lee
James Morse Sept. 9, 2015, 6:13 p.m. UTC | #11
On 09/09/15 14:22, Jungseok Lee wrote:
> On Sep 9, 2015, at 1:47 AM, James Morse wrote:
>> On 08/09/15 15:54, Jungseok Lee wrote:
>>> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
>>>> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
>>>> index 463fa2e7e34c..10b57a006da8 100644
>>>> --- a/arch/arm64/kernel/irq.c
>>>> +++ b/arch/arm64/kernel/irq.c
>>>> @@ -26,11 +26,14 @@
>>>> #include <linux/smp.h>
>>>> #include <linux/init.h>
>>>> #include <linux/irqchip.h>
>>>> +#include <linux/percpu.h>
>>>> #include <linux/seq_file.h>
>>>> #include <linux/ratelimit.h>
>>>>
>>>> unsigned long irq_err_count;
>>>>
>>>> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0;
>>>> +
>>>> int arch_show_interrupts(struct seq_file *p, int prec)
>>>> {
>>>> #ifdef CONFIG_SMP
>>>> @@ -55,6 +58,10 @@ void __init init_IRQ(void)
>>>> 	irqchip_init();
>>>> 	if (!handle_arch_irq)
>>>> 		panic("No interrupt controller found.");
>>>> +
>>>> +	/* Allocate an irq stack for the boot cpu */
>>>> +	if (alloc_irq_stack(smp_processor_id()))
>>>> +		panic("Failed to allocate irq stack for boot cpu.");
>>>> }
>>>>
>>>> #ifdef CONFIG_HOTPLUG_CPU
>>>> @@ -117,3 +124,48 @@ void migrate_irqs(void)
>>>> 	local_irq_restore(flags);
>>>> }
>>>> #endif /* CONFIG_HOTPLUG_CPU */
>>>> +
>>>> +/* Allocate an irq_stack for a cpu that is about to be brought up. */
>>>> +int alloc_irq_stack(unsigned int cpu)
>>>> +{
>>>> +	struct page *irq_stack_page;
>>>> +	union thread_union *irq_stack;
>>>> +
>>>> +	/* reuse stack allocated previously */
>>>> +	if (per_cpu(irq_sp, cpu))
>>>> +		return 0;
>>>
>>> I'd like to avoid even this simple check since CPU hotplug could be heavily
>>> used for power management.
>>
>> I don't think its a problem:
>> __cpu_up() contains a call to wait_for_completion_timeout() (which could
>> eventually end up in the scheduler), so I don't think it could ever be on a
>> 'really hot' path.
>>
>> For really frequent hotplug-like power management, cpu_suspend() makes use
>> of firmware support to power-off cores - from what I can see it doesn't use
>> __cpu_up().
> 
> In case of some platforms, CPU hotplug is triggered via sysfs for power management
> based on user data. What is advantage of putting stack allocation into this path?

It will only happen for CPUs that are brought up.


> IRQ stack allocation is an critical one-shot operation. So, there would be no issue
> to give this work to a booting core. 

I agree, but:

From include/linux/cpumask.h:
> *  If HOTPLUG is enabled, then cpu_possible_mask is forced to have
> *  all NR_CPUS bits set, otherwise it is just the set of CPUs that
> *  ACPI reports present at boot.

(This doesn't seem to happen with DT - but might with ACPI.)

NR_CPUS could be much bigger than the number of cpus the system ever has.
Allocating a stack for every possible cpu would waste memory. It is better
to do it just-in-time, when we know the memory will be used.
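
For contrast, a purely boot-time scheme (hypothetical, not what this patch does)
would have to walk every possible cpu in init_IRQ(), pinning THREAD_SIZE of memory
(16K on arm64 at the time) per possible cpu whether or not it ever comes online:

    	unsigned int cpu;

    	/* Hypothetical boot-time allocation for every possible cpu. */
    	for_each_possible_cpu(cpu)
    		if (alloc_irq_stack(cpu))
    			panic("Failed to allocate irq stack for cpu %u.\n", cpu);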

This already happens for the per-cpu idle task, (please check I traced
these through correctly!)
_cpu_up()
  idle_thread_get()
    init_idle()
      fork_idle()
        copy_process()
          dup_task_struct()
            alloc_task_struct_node()
            alloc_thread_info_node()
            arch_dup_task_struct()

So plenty of memory allocation already occurs during _cpu_up(); idle_init() checks
whether the idle task has already been created.


>>>> +
>>>> +	irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
>>>> +	if (!irq_stack_page) {
>>>> +		pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n",
>>>> +			smp_processor_id(), cpu);
>>>> +		return -ENOMEM;
>>>> +	}
>>>> +	irq_stack = page_address(irq_stack_page);
> 
> I forgot leaving a comment here. How about using __get_free_pages? It returns an
> address instead of page. 

Good idea; I was paranoid about the stack not being correctly aligned
(current_thread_info() relies on this). I lifted that alloc_kmem_pages() line
from kernel/fork.c.
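
A rough sketch of what alloc_irq_stack() could look like with __get_free_pages();
an order-THREAD_SIZE_ORDER allocation comes back THREAD_SIZE-aligned, which is the
alignment current_thread_info() depends on:

    int alloc_irq_stack(unsigned int cpu)
    {
    	unsigned long stack;

    	/* reuse stack allocated previously */
    	if (per_cpu(irq_sp, cpu))
    		return 0;

    	/* order-aligned, so the stack base is THREAD_SIZE-aligned */
    	stack = __get_free_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
    	if (!stack)
    		return -ENOMEM;

    	per_cpu(irq_sp, cpu) = stack + THREAD_START_SP;
    	return 0;
    }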


>>>> +
>>>> +	per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack
>>>> +			       + THREAD_START_SP;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Copy struct thread_info between two stacks, and update current->stack.
>>>> + * This is used when moving to the irq exception stack.
>>>> + * Changing current->stack is necessary so that non-arch thread_info writers
>>>> + * don't use the new thread_info->task->stack to find the old thread_info when
>>>> + * setting flags like TIF_NEED_RESCHED...
>>>> + */
>>>> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp)
>>>> +{
>>>> +	struct thread_info *src = get_thread_info(src_sp);
>>>> +	struct thread_info *dst = get_thread_info(dst_sp);
>>>> +
>>>> +	if (dst == src)
>>>> +		return 0;
>>>> +
>>>> +	*dst = *src;
>>>> +	current->stack = dst;
>>>> +
>>>> +	return 1;
>>>> +}
>>>
>>> Although this is 32-byte memcpy, sizeof(struct thread_info), this function is called
>>> twice to handle a single interrupt. Isn't it too expensive?
>>>
>>> This is a major difference between my approach and this patch. I think we should get
>>> some feedbacks on struct thread_info information management for v2 preparation.
>>
>> Agreed.
>>
>> The other difference, as Akashi Takahiro pointed out, is the behaviour of
>> object_is_on_stack(). What should this return for an object on an irq stack?
> 
> 0 should be returned in that case to align with x86 behavior according to check_stack()
> context and the commit, 81520a1b0649d0.

Unless we can update ftrace on x86 too! :)


Thanks for your comments,


James
Jungseok Lee Sept. 10, 2015, 11:30 p.m. UTC | #12
On Sep 10, 2015, at 3:13 AM, James Morse wrote:
> On 09/09/15 14:22, Jungseok Lee wrote:
>> On Sep 9, 2015, at 1:47 AM, James Morse wrote:
>>> On 08/09/15 15:54, Jungseok Lee wrote:
>>>> On Sep 7, 2015, at 11:36 PM, James Morse wrote:
>>>>> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
>>>>> index 463fa2e7e34c..10b57a006da8 100644
>>>>> --- a/arch/arm64/kernel/irq.c
>>>>> +++ b/arch/arm64/kernel/irq.c
>>>>> @@ -26,11 +26,14 @@
>>>>> #include <linux/smp.h>
>>>>> #include <linux/init.h>
>>>>> #include <linux/irqchip.h>
>>>>> +#include <linux/percpu.h>
>>>>> #include <linux/seq_file.h>
>>>>> #include <linux/ratelimit.h>
>>>>> 
>>>>> unsigned long irq_err_count;
>>>>> 
>>>>> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0;
>>>>> +
>>>>> int arch_show_interrupts(struct seq_file *p, int prec)
>>>>> {
>>>>> #ifdef CONFIG_SMP
>>>>> @@ -55,6 +58,10 @@ void __init init_IRQ(void)
>>>>> 	irqchip_init();
>>>>> 	if (!handle_arch_irq)
>>>>> 		panic("No interrupt controller found.");
>>>>> +
>>>>> +	/* Allocate an irq stack for the boot cpu */
>>>>> +	if (alloc_irq_stack(smp_processor_id()))
>>>>> +		panic("Failed to allocate irq stack for boot cpu.");
>>>>> }
>>>>> 
>>>>> #ifdef CONFIG_HOTPLUG_CPU
>>>>> @@ -117,3 +124,48 @@ void migrate_irqs(void)
>>>>> 	local_irq_restore(flags);
>>>>> }
>>>>> #endif /* CONFIG_HOTPLUG_CPU */
>>>>> +
>>>>> +/* Allocate an irq_stack for a cpu that is about to be brought up. */
>>>>> +int alloc_irq_stack(unsigned int cpu)
>>>>> +{
>>>>> +	struct page *irq_stack_page;
>>>>> +	union thread_union *irq_stack;
>>>>> +
>>>>> +	/* reuse stack allocated previously */
>>>>> +	if (per_cpu(irq_sp, cpu))
>>>>> +		return 0;
>>>> 
>>>> I'd like to avoid even this simple check since CPU hotplug could be heavily
>>>> used for power management.
>>> 
>>> I don't think it's a problem:
>>> __cpu_up() contains a call to wait_for_completion_timeout() (which could
>>> eventually end up in the scheduler), so I don't think it could ever be on a
>>> 'really hot' path.
>>> 
>>> For really frequent hotplug-like power management, cpu_suspend() makes use
>>> of firmware support to power-off cores - from what I can see it doesn't use
>>> __cpu_up().
>> 
>> In case of some platforms, CPU hotplug is triggered via sysfs for power management
>> based on user data. What is advantage of putting stack allocation into this path?
> 
> It will only happen for CPUs that are brought up.
> 
> 
>> IRQ stack allocation is an critical one-shot operation. So, there would be no issue
>> to give this work to a booting core. 
> 
> I agree, but:
> 
> From include/linux/cpumask.h:
>> *  If HOTPLUG is enabled, then cpu_possible_mask is forced to have
>> *  all NR_CPUS bits set, otherwise it is just the set of CPUs that
>> *  ACPI reports present at boot.
> 
> (This doesn't seem to happen with DT - but might with ACPI.)
> 
> NR_CPUs could be much bigger than the number of cpus the system ever has.
> Allocating a stack for every possible cpu would waste memory. It is better
> to do it just-in-time, when we know the memory will be used.

Frankly, I had not considered that kind of system, but this feature should handle
it smoothly as well. I will move the allocation logic in v2.

> This already happens for the per-cpu idle task, (please check I traced
> these through correctly!)
> _cpu_up()
>  idle_thread_get()
>    init_idle()
>      fork_idle()
>        copy_process()
>          dup_task_struct()
>            alloc_task_struct_node()
>            alloc_thread_info_node()
>            arch_dup_task_struct()
> 
> So plenty of memory-allocation occurs during _cpu_up(), idle_init() checks
> whether the idle task has already been created.

Got it.

Thanks for the feedbacks.

Best Regards
Jungseok Lee

Patch

diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
index bbb251b14746..050d4196c736 100644
--- a/arch/arm64/include/asm/irq.h
+++ b/arch/arm64/include/asm/irq.h
@@ -2,14 +2,20 @@ 
 #define __ASM_IRQ_H
 
 #include <linux/irqchip/arm-gic-acpi.h>
+#include <linux/percpu.h>
 
 #include <asm-generic/irq.h>
+#include <asm/thread_info.h>
+
+DECLARE_PER_CPU(unsigned long, irq_sp);
 
 struct pt_regs;
 
 extern void migrate_irqs(void);
 extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
 
+extern int alloc_irq_stack(unsigned int cpu);
+
 static inline void acpi_irq_init(void)
 {
 	/*
@@ -21,4 +27,10 @@  static inline void acpi_irq_init(void)
 }
 #define acpi_irq_init acpi_irq_init
 
+static inline bool is_irq_stack(unsigned long sp)
+{
+	struct thread_info *ti = get_thread_info(sp);
+	return (get_thread_info(per_cpu(irq_sp, ti->cpu)) == ti);
+}
+
 #endif
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index dcd06d18a42a..b906254fc400 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -69,12 +69,16 @@  register unsigned long current_stack_pointer asm ("sp");
 /*
  * how to get the thread information struct from C
  */
+static inline struct thread_info *get_thread_info(unsigned long sp)
+{
+	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
+}
+
 static inline struct thread_info *current_thread_info(void) __attribute_const__;
 
 static inline struct thread_info *current_thread_info(void)
 {
-	return (struct thread_info *)
-		(current_stack_pointer & ~(THREAD_SIZE - 1));
+	return get_thread_info(current_stack_pointer);
 }
 
 #define thread_saved_pc(tsk)	\
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index e16351819fed..d42371f3f5a1 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -190,10 +190,37 @@  tsk	.req	x28		// current thread_info
  * Interrupt handling.
  */
 	.macro	irq_handler
-	adrp	x1, handle_arch_irq
-	ldr	x1, [x1, #:lo12:handle_arch_irq]
-	mov	x0, sp
+	mrs	x21, tpidr_el1
+	adr_l	x20, irq_sp
+	add	x20, x20, x21
+
+	ldr	x21, [x20]
+	mov	x20, sp
+
+	mov	x0, x21
+	mov	x1, x20
+	bl	irq_copy_thread_info
+
+	/* test for recursive use of irq_sp */
+	cbz	w0, 1f
+	mrs	x30, elr_el1
+	mov	sp, x21
+
+	/*
+	 * Create a fake stack frame to bump unwind_frame() onto the original
+	 * stack. This relies on x29 not being clobbered by kernel_entry().
+	 */
+	push	x29, x30
+
+1:	ldr_l	x1, handle_arch_irq
+	mov     x0, x20
 	blr	x1
+
+	mov	x0, x20
+	mov	x1, x21
+	bl	irq_copy_thread_info
+	mov	sp, x20
+
 	.endm
 
 	.text
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 463fa2e7e34c..10b57a006da8 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -26,11 +26,14 @@ 
 #include <linux/smp.h>
 #include <linux/init.h>
 #include <linux/irqchip.h>
+#include <linux/percpu.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
 
 unsigned long irq_err_count;
 
+DEFINE_PER_CPU(unsigned long, irq_sp) = 0;
+
 int arch_show_interrupts(struct seq_file *p, int prec)
 {
 #ifdef CONFIG_SMP
@@ -55,6 +58,10 @@  void __init init_IRQ(void)
 	irqchip_init();
 	if (!handle_arch_irq)
 		panic("No interrupt controller found.");
+
+	/* Allocate an irq stack for the boot cpu */
+	if (alloc_irq_stack(smp_processor_id()))
+		panic("Failed to allocate irq stack for boot cpu.");
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -117,3 +124,48 @@  void migrate_irqs(void)
 	local_irq_restore(flags);
 }
 #endif /* CONFIG_HOTPLUG_CPU */
+
+/* Allocate an irq_stack for a cpu that is about to be brought up. */
+int alloc_irq_stack(unsigned int cpu)
+{
+	struct page *irq_stack_page;
+	union thread_union *irq_stack;
+
+	/* reuse stack allocated previously */
+	if (per_cpu(irq_sp, cpu))
+		return 0;
+
+	irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
+	if (!irq_stack_page) {
+		pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n",
+			smp_processor_id(), cpu);
+		return -ENOMEM;
+	}
+	irq_stack = page_address(irq_stack_page);
+
+	per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack
+			       + THREAD_START_SP;
+
+	return 0;
+}
+
+/*
+ * Copy struct thread_info between two stacks, and update current->stack.
+ * This is used when moving to the irq exception stack.
+ * Changing current->stack is necessary so that non-arch thread_info writers
+ * don't use the new thread_info->task->stack to find the old thread_info when
+ * setting flags like TIF_NEED_RESCHED...
+ */
+asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp)
+{
+	struct thread_info *src = get_thread_info(src_sp);
+	struct thread_info *dst = get_thread_info(dst_sp);
+
+	if (dst == src)
+		return 0;
+
+	*dst = *src;
+	current->stack = dst;
+
+	return 1;
+}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 50fb4696654e..5283dc5629e4 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -89,6 +89,10 @@  int __cpu_up(unsigned int cpu, struct task_struct *idle)
 {
 	int ret;
 
+	ret = alloc_irq_stack(cpu);
+	if (ret)
+		return ret;
+
 	/*
 	 * We need to tell the secondary core where to find its stack and the
 	 * page tables.
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 407991bf79f5..3d6d5b45aa4b 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -20,6 +20,7 @@ 
 #include <linux/sched.h>
 #include <linux/stacktrace.h>
 
+#include <asm/irq.h>
 #include <asm/stacktrace.h>
 
 /*
@@ -43,7 +44,8 @@  int notrace unwind_frame(struct stackframe *frame)
 	low  = frame->sp;
 	high = ALIGN(low, THREAD_SIZE);
 
-	if (fp < low || fp > high - 0x18 || fp & 0xf)
+	if ((fp < low || fp > high - 0x18 || fp & 0xf) &&
+	    !is_irq_stack(frame->sp))
 		return -EINVAL;
 
 	frame->sp = fp + 0x10;