Message ID | 1441636584-23174-1-git-send-email-james.morse@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Sep 7, 2015, at 11:36 PM, James Morse wrote: Hi James, > Having to handle interrupts on top of an existing kernel stack means the > kernel stack must be large enough to accomodate both the maximum kernel > usage, and the maximum irq handler usage. Switching to a different stack > when processing irqs allows us to make the stack size smaller. > > Maximum kernel stack usage (running ltp and generating usb+ethernet > interrupts) was 7256 bytes. With this patch, the same workload gives > a maximum stack usage of 5816 bytes. I'd like to know how to measure the max stack depth. AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack region and find or track down an untouched region? I will leave comments after reading and playing with this change carefully. Best Regards Jungseok Lee > Signed-off-by: James Morse <james.morse@arm.com> > --- > arch/arm64/include/asm/irq.h | 12 +++++++++ > arch/arm64/include/asm/thread_info.h | 8 ++++-- > arch/arm64/kernel/entry.S | 33 ++++++++++++++++++++--- > arch/arm64/kernel/irq.c | 52 ++++++++++++++++++++++++++++++++++++ > arch/arm64/kernel/smp.c | 4 +++ > arch/arm64/kernel/stacktrace.c | 4 ++- > 6 files changed, 107 insertions(+), 6 deletions(-) > > diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h > index bbb251b14746..050d4196c736 100644 > --- a/arch/arm64/include/asm/irq.h > +++ b/arch/arm64/include/asm/irq.h > @@ -2,14 +2,20 @@ > #define __ASM_IRQ_H > > #include <linux/irqchip/arm-gic-acpi.h> > +#include <linux/percpu.h> > > #include <asm-generic/irq.h> > +#include <asm/thread_info.h> > + > +DECLARE_PER_CPU(unsigned long, irq_sp); > > struct pt_regs; > > extern void migrate_irqs(void); > extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); > > +extern int alloc_irq_stack(unsigned int cpu); > + > static inline void acpi_irq_init(void) > { > /* > @@ -21,4 +27,10 @@ static inline void acpi_irq_init(void) > } > #define acpi_irq_init acpi_irq_init > > +static inline bool is_irq_stack(unsigned long sp) > +{ > + struct thread_info *ti = get_thread_info(sp); > + return (get_thread_info(per_cpu(irq_sp, ti->cpu)) == ti); > +} > + > #endif > diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h > index dcd06d18a42a..b906254fc400 100644 > --- a/arch/arm64/include/asm/thread_info.h > +++ b/arch/arm64/include/asm/thread_info.h > @@ -69,12 +69,16 @@ register unsigned long current_stack_pointer asm ("sp"); > /* > * how to get the thread information struct from C > */ > +static inline struct thread_info *get_thread_info(unsigned long sp) > +{ > + return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); > +} > + > static inline struct thread_info *current_thread_info(void) __attribute_const__; > > static inline struct thread_info *current_thread_info(void) > { > - return (struct thread_info *) > - (current_stack_pointer & ~(THREAD_SIZE - 1)); > + return get_thread_info(current_stack_pointer); > } > > #define thread_saved_pc(tsk) \ > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S > index e16351819fed..d42371f3f5a1 100644 > --- a/arch/arm64/kernel/entry.S > +++ b/arch/arm64/kernel/entry.S > @@ -190,10 +190,37 @@ tsk .req x28 // current thread_info > * Interrupt handling. > */ > .macro irq_handler > - adrp x1, handle_arch_irq > - ldr x1, [x1, #:lo12:handle_arch_irq] > - mov x0, sp > + mrs x21, tpidr_el1 > + adr_l x20, irq_sp > + add x20, x20, x21 > + > + ldr x21, [x20] > + mov x20, sp > + > + mov x0, x21 > + mov x1, x20 > + bl irq_copy_thread_info > + > + /* test for recursive use of irq_sp */ > + cbz w0, 1f > + mrs x30, elr_el1 > + mov sp, x21 > + > + /* > + * Create a fake stack frame to bump unwind_frame() onto the original > + * stack. This relies on x29 not being clobbered by kernel_entry(). > + */ > + push x29, x30 > + > +1: ldr_l x1, handle_arch_irq > + mov x0, x20 > blr x1 > + > + mov x0, x20 > + mov x1, x21 > + bl irq_copy_thread_info > + mov sp, x20 > + > .endm > > .text > diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c > index 463fa2e7e34c..10b57a006da8 100644 > --- a/arch/arm64/kernel/irq.c > +++ b/arch/arm64/kernel/irq.c > @@ -26,11 +26,14 @@ > #include <linux/smp.h> > #include <linux/init.h> > #include <linux/irqchip.h> > +#include <linux/percpu.h> > #include <linux/seq_file.h> > #include <linux/ratelimit.h> > > unsigned long irq_err_count; > > +DEFINE_PER_CPU(unsigned long, irq_sp) = 0; > + > int arch_show_interrupts(struct seq_file *p, int prec) > { > #ifdef CONFIG_SMP > @@ -55,6 +58,10 @@ void __init init_IRQ(void) > irqchip_init(); > if (!handle_arch_irq) > panic("No interrupt controller found."); > + > + /* Allocate an irq stack for the boot cpu */ > + if (alloc_irq_stack(smp_processor_id())) > + panic("Failed to allocate irq stack for boot cpu."); > } > > #ifdef CONFIG_HOTPLUG_CPU > @@ -117,3 +124,48 @@ void migrate_irqs(void) > local_irq_restore(flags); > } > #endif /* CONFIG_HOTPLUG_CPU */ > + > +/* Allocate an irq_stack for a cpu that is about to be brought up. */ > +int alloc_irq_stack(unsigned int cpu) > +{ > + struct page *irq_stack_page; > + union thread_union *irq_stack; > + > + /* reuse stack allocated previously */ > + if (per_cpu(irq_sp, cpu)) > + return 0; > + > + irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER); > + if (!irq_stack_page) { > + pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n", > + smp_processor_id(), cpu); > + return -ENOMEM; > + } > + irq_stack = page_address(irq_stack_page); > + > + per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack > + + THREAD_START_SP; > + > + return 0; > +} > + > +/* > + * Copy struct thread_info between two stacks, and update current->stack. > + * This is used when moving to the irq exception stack. > + * Changing current->stack is necessary so that non-arch thread_info writers > + * don't use the new thread_info->task->stack to find the old thread_info when > + * setting flags like TIF_NEED_RESCHED... > + */ > +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp) > +{ > + struct thread_info *src = get_thread_info(src_sp); > + struct thread_info *dst = get_thread_info(dst_sp); > + > + if (dst == src) > + return 0; > + > + *dst = *src; > + current->stack = dst; > + > + return 1; > +} > diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c > index 50fb4696654e..5283dc5629e4 100644 > --- a/arch/arm64/kernel/smp.c > +++ b/arch/arm64/kernel/smp.c > @@ -89,6 +89,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) > { > int ret; > > + ret = alloc_irq_stack(cpu); > + if (ret) > + return ret; > + > /* > * We need to tell the secondary core where to find its stack and the > * page tables. > diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c > index 407991bf79f5..3d6d5b45aa4b 100644 > --- a/arch/arm64/kernel/stacktrace.c > +++ b/arch/arm64/kernel/stacktrace.c > @@ -20,6 +20,7 @@ > #include <linux/sched.h> > #include <linux/stacktrace.h> > > +#include <asm/irq.h> > #include <asm/stacktrace.h> > > /* > @@ -43,7 +44,8 @@ int notrace unwind_frame(struct stackframe *frame) > low = frame->sp; > high = ALIGN(low, THREAD_SIZE); > > - if (fp < low || fp > high - 0x18 || fp & 0xf) > + if ((fp < low || fp > high - 0x18 || fp & 0xf) && > + !is_irq_stack(frame->sp)) > return -EINVAL; > > frame->sp = fp + 0x10; > -- > 2.1.4 >
On 07/09/15 16:48, Jungseok Lee wrote: > On Sep 7, 2015, at 11:36 PM, James Morse wrote: > > Hi James, > >> Having to handle interrupts on top of an existing kernel stack means the >> kernel stack must be large enough to accomodate both the maximum kernel >> usage, and the maximum irq handler usage. Switching to a different stack >> when processing irqs allows us to make the stack size smaller. >> >> Maximum kernel stack usage (running ltp and generating usb+ethernet >> interrupts) was 7256 bytes. With this patch, the same workload gives >> a maximum stack usage of 5816 bytes. > > I'd like to know how to measure the max stack depth. > AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack > region and find or track down an untouched region? I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' -> 'Tracers', then looked in debugfs:/tracing/stack_max_size. What problems did you encounter? (I may be missing something...) Thanks, James
On Sep 8, 2015, at 1:06 AM, James Morse wrote: > On 07/09/15 16:48, Jungseok Lee wrote: >> On Sep 7, 2015, at 11:36 PM, James Morse wrote: >> >> Hi James, >> >>> Having to handle interrupts on top of an existing kernel stack means the >>> kernel stack must be large enough to accomodate both the maximum kernel >>> usage, and the maximum irq handler usage. Switching to a different stack >>> when processing irqs allows us to make the stack size smaller. >>> >>> Maximum kernel stack usage (running ltp and generating usb+ethernet >>> interrupts) was 7256 bytes. With this patch, the same workload gives >>> a maximum stack usage of 5816 bytes. >> >> I'd like to know how to measure the max stack depth. >> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack >> region and find or track down an untouched region? > > I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' -> > 'Tracers', then looked in debugfs:/tracing/stack_max_size. > > What problems did you encounter? > (I may be missing something…) When I enabled the feature, all entries had *0* size except the last entry. It can be reproduced easily as looking in debugs:/tracing/stack_trace. You can track down my report and Akashi's changes with the following links: - http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/354126.html - https://lkml.org/lkml/2015/7/13/29 Although it is impossible to measure an exact depth at this moment, the feature could be utilized to check improvement. Cc'ing Akashi for additional comments if needed. Best Regards Jungseok Lee
Jungseok, On 09/08/2015 01:34 AM, Jungseok Lee wrote: > On Sep 8, 2015, at 1:06 AM, James Morse wrote: >> On 07/09/15 16:48, Jungseok Lee wrote: >>> On Sep 7, 2015, at 11:36 PM, James Morse wrote: >>> >>> Hi James, >>> >>>> Having to handle interrupts on top of an existing kernel stack means the >>>> kernel stack must be large enough to accomodate both the maximum kernel >>>> usage, and the maximum irq handler usage. Switching to a different stack >>>> when processing irqs allows us to make the stack size smaller. >>>> >>>> Maximum kernel stack usage (running ltp and generating usb+ethernet >>>> interrupts) was 7256 bytes. With this patch, the same workload gives >>>> a maximum stack usage of 5816 bytes. >>> >>> I'd like to know how to measure the max stack depth. >>> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack >>> region and find or track down an untouched region? >> >> I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' -> >> 'Tracers', then looked in debugfs:/tracing/stack_max_size. >> >> What problems did you encounter? >> (I may be missing something…) > > When I enabled the feature, all entries had *0* size except the last entry. > It can be reproduced easily as looking in debugs:/tracing/stack_trace. I'm afraid that you have not applied one of patches in my RFC: http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/355919.html I have not looked into James' patch in details, but hope that it will help fix one of issues that are annoying me: Stack tracer (actually save_stack_trace()) will miss a function (and its parent function in some case) that is being executed when an interrupt is taken. -Takahiro AKASHI > You can track down my report and Akashi's changes with the following links: > - http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/354126.html > - https://lkml.org/lkml/2015/7/13/29 > > Although it is impossible to measure an exact depth at this moment, the feature > could be utilized to check improvement. > > Cc'ing Akashi for additional comments if needed. > > Best Regards > Jungseok Lee >
On 09/08/2015 10:45 AM, AKASHI Takahiro wrote: > Jungseok, > > On 09/08/2015 01:34 AM, Jungseok Lee wrote: >> On Sep 8, 2015, at 1:06 AM, James Morse wrote: >>> On 07/09/15 16:48, Jungseok Lee wrote: >>>> On Sep 7, 2015, at 11:36 PM, James Morse wrote: >>>> >>>> Hi James, >>>> >>>>> Having to handle interrupts on top of an existing kernel stack means the >>>>> kernel stack must be large enough to accomodate both the maximum kernel >>>>> usage, and the maximum irq handler usage. Switching to a different stack >>>>> when processing irqs allows us to make the stack size smaller. >>>>> >>>>> Maximum kernel stack usage (running ltp and generating usb+ethernet >>>>> interrupts) was 7256 bytes. With this patch, the same workload gives >>>>> a maximum stack usage of 5816 bytes. >>>> >>>> I'd like to know how to measure the max stack depth. >>>> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack >>>> region and find or track down an untouched region? >>> >>> I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' -> >>> 'Tracers', then looked in debugfs:/tracing/stack_max_size. >>> >>> What problems did you encounter? >>> (I may be missing something…) >> >> When I enabled the feature, all entries had *0* size except the last entry. >> It can be reproduced easily as looking in debugs:/tracing/stack_trace. > > I'm afraid that you have not applied one of patches in my RFC: > http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/355919.html > > I have not looked into James' patch in details, but hope that it will help > fix one of issues that are annoying me: Stack tracer (actually save_stack_trace()) > will miss a function (and its parent function in some case) that is being executed > when an interrupt is taken. Well, it didn't fix the issue: # cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (54 entries) ----- ---- -------- 0) 5096 16 irq_copy_thread_info+0x18/0x70 1) 5080 336 el1_irq+0x78/0x11c <<<= _raw_spin_unlock_irqrestore 2) 4744 48 __skb_recv_datagram+0x148/0x49c 3) 4696 208 skb_recv_datagram+0x50/0x60 4) 4488 64 xs_udp_data_ready+0x48/0x170 5) 4424 96 sock_queue_rcv_skb+0x1fc/0x270 ... 53) 344 344 el0_svc_naked+0x20/0x28 __skb_recv_datagram+0x148 is "bl _raw_spin_unlock_irqrestore." And the frames, #0 and #1, appear here because this patch replaces stack pointers *after* kernel_entry. -Takahiro AKASHI > > -Takahiro AKASHI > >> You can track down my report and Akashi's changes with the following links: >> - http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/354126.html >> - https://lkml.org/lkml/2015/7/13/29 >> >> Although it is impossible to measure an exact depth at this moment, the feature >> could be utilized to check improvement. >> >> Cc'ing Akashi for additional comments if needed. >> >> Best Regards >> Jungseok Lee >>
On 09/07/2015 11:36 PM, James Morse wrote: > Having to handle interrupts on top of an existing kernel stack means the > kernel stack must be large enough to accomodate both the maximum kernel > usage, and the maximum irq handler usage. Switching to a different stack > when processing irqs allows us to make the stack size smaller. > > Maximum kernel stack usage (running ltp and generating usb+ethernet > interrupts) was 7256 bytes. With this patch, the same workload gives > a maximum stack usage of 5816 bytes. With stack tracer on, the kernel with this patch ran into BUG() in check_stack(): if (task_stack_end_corrupted(current)) BUG(); This is because a check for an interrupt stack with "object_is_on_stack(stack)" has passed, while your irq stack doesn't have a STACK_END_MAGIC. > +-+ > Signed-off-by: James Morse <james.morse@arm.com> > --- > arch/arm64/include/asm/irq.h | 12 +++++++++ > arch/arm64/include/asm/thread_info.h | 8 ++++-- > arch/arm64/kernel/entry.S | 33 ++++++++++++++++++++--- > arch/arm64/kernel/irq.c | 52 ++++++++++++++++++++++++++++++++++++ > arch/arm64/kernel/smp.c | 4 +++ > arch/arm64/kernel/stacktrace.c | 4 ++- > 6 files changed, 107 insertions(+), 6 deletions(-) > > diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h > index bbb251b14746..050d4196c736 100644 > --- a/arch/arm64/include/asm/irq.h > +++ b/arch/arm64/include/asm/irq.h > @@ -2,14 +2,20 @@ > #define __ASM_IRQ_H > > #include <linux/irqchip/arm-gic-acpi.h> > +#include <linux/percpu.h> > > #include <asm-generic/irq.h> > +#include <asm/thread_info.h> > + > +DECLARE_PER_CPU(unsigned long, irq_sp); > > struct pt_regs; > > extern void migrate_irqs(void); > extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); > > +extern int alloc_irq_stack(unsigned int cpu); > + > static inline void acpi_irq_init(void) > { > /* > @@ -21,4 +27,10 @@ static inline void acpi_irq_init(void) > } > #define acpi_irq_init acpi_irq_init > > +static inline bool is_irq_stack(unsigned long sp) > +{ > + struct thread_info *ti = get_thread_info(sp); > + return (get_thread_info(per_cpu(irq_sp, ti->cpu)) == ti); > +} > + > #endif > diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h > index dcd06d18a42a..b906254fc400 100644 > --- a/arch/arm64/include/asm/thread_info.h > +++ b/arch/arm64/include/asm/thread_info.h > @@ -69,12 +69,16 @@ register unsigned long current_stack_pointer asm ("sp"); > /* > * how to get the thread information struct from C > */ > +static inline struct thread_info *get_thread_info(unsigned long sp) > +{ > + return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); > +} > + > static inline struct thread_info *current_thread_info(void) __attribute_const__; > > static inline struct thread_info *current_thread_info(void) > { > - return (struct thread_info *) > - (current_stack_pointer & ~(THREAD_SIZE - 1)); > + return get_thread_info(current_stack_pointer); > } > > #define thread_saved_pc(tsk) \ > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S > index e16351819fed..d42371f3f5a1 100644 > --- a/arch/arm64/kernel/entry.S > +++ b/arch/arm64/kernel/entry.S > @@ -190,10 +190,37 @@ tsk .req x28 // current thread_info > * Interrupt handling. > */ > .macro irq_handler > - adrp x1, handle_arch_irq > - ldr x1, [x1, #:lo12:handle_arch_irq] > - mov x0, sp > + mrs x21, tpidr_el1 > + adr_l x20, irq_sp > + add x20, x20, x21 > + > + ldr x21, [x20] > + mov x20, sp > + > + mov x0, x21 > + mov x1, x20 > + bl irq_copy_thread_info > + > + /* test for recursive use of irq_sp */ > + cbz w0, 1f > + mrs x30, elr_el1 > + mov sp, x21 > + > + /* > + * Create a fake stack frame to bump unwind_frame() onto the original > + * stack. This relies on x29 not being clobbered by kernel_entry(). > + */ > + push x29, x30 I don't think that it works. Since unwind_frame() doesn't care about sp, but only does fp, a pushed stack frame will never be dereferenced. -Takahiro AKASHI > + > +1: ldr_l x1, handle_arch_irq > + mov x0, x20 > blr x1 > + > + mov x0, x20 > + mov x1, x21 > + bl irq_copy_thread_info > + mov sp, x20 > + > .endm > > .text > diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c > index 463fa2e7e34c..10b57a006da8 100644 > --- a/arch/arm64/kernel/irq.c > +++ b/arch/arm64/kernel/irq.c > @@ -26,11 +26,14 @@ > #include <linux/smp.h> > #include <linux/init.h> > #include <linux/irqchip.h> > +#include <linux/percpu.h> > #include <linux/seq_file.h> > #include <linux/ratelimit.h> > > unsigned long irq_err_count; > > +DEFINE_PER_CPU(unsigned long, irq_sp) = 0; > + > int arch_show_interrupts(struct seq_file *p, int prec) > { > #ifdef CONFIG_SMP > @@ -55,6 +58,10 @@ void __init init_IRQ(void) > irqchip_init(); > if (!handle_arch_irq) > panic("No interrupt controller found."); > + > + /* Allocate an irq stack for the boot cpu */ > + if (alloc_irq_stack(smp_processor_id())) > + panic("Failed to allocate irq stack for boot cpu."); > } > > #ifdef CONFIG_HOTPLUG_CPU > @@ -117,3 +124,48 @@ void migrate_irqs(void) > local_irq_restore(flags); > } > #endif /* CONFIG_HOTPLUG_CPU */ > + > +/* Allocate an irq_stack for a cpu that is about to be brought up. */ > +int alloc_irq_stack(unsigned int cpu) > +{ > + struct page *irq_stack_page; > + union thread_union *irq_stack; > + > + /* reuse stack allocated previously */ > + if (per_cpu(irq_sp, cpu)) > + return 0; > + > + irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER); > + if (!irq_stack_page) { > + pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n", > + smp_processor_id(), cpu); > + return -ENOMEM; > + } > + irq_stack = page_address(irq_stack_page); > + > + per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack > + + THREAD_START_SP; > + > + return 0; > +} > + > +/* > + * Copy struct thread_info between two stacks, and update current->stack. > + * This is used when moving to the irq exception stack. > + * Changing current->stack is necessary so that non-arch thread_info writers > + * don't use the new thread_info->task->stack to find the old thread_info when > + * setting flags like TIF_NEED_RESCHED... > + */ > +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp) > +{ > + struct thread_info *src = get_thread_info(src_sp); > + struct thread_info *dst = get_thread_info(dst_sp); > + > + if (dst == src) > + return 0; > + > + *dst = *src; > + current->stack = dst; > + > + return 1; > +} > diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c > index 50fb4696654e..5283dc5629e4 100644 > --- a/arch/arm64/kernel/smp.c > +++ b/arch/arm64/kernel/smp.c > @@ -89,6 +89,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) > { > int ret; > > + ret = alloc_irq_stack(cpu); > + if (ret) > + return ret; > + > /* > * We need to tell the secondary core where to find its stack and the > * page tables. > diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c > index 407991bf79f5..3d6d5b45aa4b 100644 > --- a/arch/arm64/kernel/stacktrace.c > +++ b/arch/arm64/kernel/stacktrace.c > @@ -20,6 +20,7 @@ > #include <linux/sched.h> > #include <linux/stacktrace.h> > > +#include <asm/irq.h> > #include <asm/stacktrace.h> > > /* > @@ -43,7 +44,8 @@ int notrace unwind_frame(struct stackframe *frame) > low = frame->sp; > high = ALIGN(low, THREAD_SIZE); > > - if (fp < low || fp > high - 0x18 || fp & 0xf) > + if ((fp < low || fp > high - 0x18 || fp & 0xf) && > + !is_irq_stack(frame->sp)) > return -EINVAL; > > frame->sp = fp + 0x10; >
On Sep 7, 2015, at 11:36 PM, James Morse wrote: Hi James, > Having to handle interrupts on top of an existing kernel stack means the > kernel stack must be large enough to accomodate both the maximum kernel > usage, and the maximum irq handler usage. Switching to a different stack > when processing irqs allows us to make the stack size smaller. > > Maximum kernel stack usage (running ltp and generating usb+ethernet > interrupts) was 7256 bytes. With this patch, the same workload gives > a maximum stack usage of 5816 bytes. > > Signed-off-by: James Morse <james.morse@arm.com> > --- > arch/arm64/include/asm/irq.h | 12 +++++++++ > arch/arm64/include/asm/thread_info.h | 8 ++++-- > arch/arm64/kernel/entry.S | 33 ++++++++++++++++++++--- > arch/arm64/kernel/irq.c | 52 ++++++++++++++++++++++++++++++++++++ > arch/arm64/kernel/smp.c | 4 +++ > arch/arm64/kernel/stacktrace.c | 4 ++- > 6 files changed, 107 insertions(+), 6 deletions(-) > > diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h > index bbb251b14746..050d4196c736 100644 > --- a/arch/arm64/include/asm/irq.h > +++ b/arch/arm64/include/asm/irq.h > @@ -2,14 +2,20 @@ > #define __ASM_IRQ_H > > #include <linux/irqchip/arm-gic-acpi.h> > +#include <linux/percpu.h> > > #include <asm-generic/irq.h> > +#include <asm/thread_info.h> > + > +DECLARE_PER_CPU(unsigned long, irq_sp); > > struct pt_regs; > > extern void migrate_irqs(void); > extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); > > +extern int alloc_irq_stack(unsigned int cpu); > + > static inline void acpi_irq_init(void) > { > /* > @@ -21,4 +27,10 @@ static inline void acpi_irq_init(void) > } > #define acpi_irq_init acpi_irq_init > > +static inline bool is_irq_stack(unsigned long sp) > +{ > + struct thread_info *ti = get_thread_info(sp); > + return (get_thread_info(per_cpu(irq_sp, ti->cpu)) == ti); > +} > + > #endif > diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h > index dcd06d18a42a..b906254fc400 100644 > --- a/arch/arm64/include/asm/thread_info.h > +++ b/arch/arm64/include/asm/thread_info.h > @@ -69,12 +69,16 @@ register unsigned long current_stack_pointer asm ("sp"); > /* > * how to get the thread information struct from C > */ > +static inline struct thread_info *get_thread_info(unsigned long sp) > +{ > + return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); > +} > + > static inline struct thread_info *current_thread_info(void) __attribute_const__; > > static inline struct thread_info *current_thread_info(void) > { > - return (struct thread_info *) > - (current_stack_pointer & ~(THREAD_SIZE - 1)); > + return get_thread_info(current_stack_pointer); > } > > #define thread_saved_pc(tsk) \ > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S > index e16351819fed..d42371f3f5a1 100644 > --- a/arch/arm64/kernel/entry.S > +++ b/arch/arm64/kernel/entry.S > @@ -190,10 +190,37 @@ tsk .req x28 // current thread_info > * Interrupt handling. > */ > .macro irq_handler > - adrp x1, handle_arch_irq > - ldr x1, [x1, #:lo12:handle_arch_irq] > - mov x0, sp > + mrs x21, tpidr_el1 > + adr_l x20, irq_sp > + add x20, x20, x21 > + > + ldr x21, [x20] > + mov x20, sp > + > + mov x0, x21 > + mov x1, x20 > + bl irq_copy_thread_info > + > + /* test for recursive use of irq_sp */ > + cbz w0, 1f > + mrs x30, elr_el1 > + mov sp, x21 > + > + /* > + * Create a fake stack frame to bump unwind_frame() onto the original > + * stack. This relies on x29 not being clobbered by kernel_entry(). > + */ > + push x29, x30 It is the most challenging item to handle a frame pointer in this context. As I mentioned previously, a stack tracer on ftrace is not supported yet. How about decoupling "IRQ Stack feature" from "Stack Tracer Fix"? > + > +1: ldr_l x1, handle_arch_irq > + mov x0, x20 > blr x1 > + > + mov x0, x20 > + mov x1, x21 > + bl irq_copy_thread_info > + mov sp, x20 > + > .endm > > .text > diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c > index 463fa2e7e34c..10b57a006da8 100644 > --- a/arch/arm64/kernel/irq.c > +++ b/arch/arm64/kernel/irq.c > @@ -26,11 +26,14 @@ > #include <linux/smp.h> > #include <linux/init.h> > #include <linux/irqchip.h> > +#include <linux/percpu.h> > #include <linux/seq_file.h> > #include <linux/ratelimit.h> > > unsigned long irq_err_count; > > +DEFINE_PER_CPU(unsigned long, irq_sp) = 0; > + > int arch_show_interrupts(struct seq_file *p, int prec) > { > #ifdef CONFIG_SMP > @@ -55,6 +58,10 @@ void __init init_IRQ(void) > irqchip_init(); > if (!handle_arch_irq) > panic("No interrupt controller found."); > + > + /* Allocate an irq stack for the boot cpu */ > + if (alloc_irq_stack(smp_processor_id())) > + panic("Failed to allocate irq stack for boot cpu."); > } > > #ifdef CONFIG_HOTPLUG_CPU > @@ -117,3 +124,48 @@ void migrate_irqs(void) > local_irq_restore(flags); > } > #endif /* CONFIG_HOTPLUG_CPU */ > + > +/* Allocate an irq_stack for a cpu that is about to be brought up. */ > +int alloc_irq_stack(unsigned int cpu) > +{ > + struct page *irq_stack_page; > + union thread_union *irq_stack; > + > + /* reuse stack allocated previously */ > + if (per_cpu(irq_sp, cpu)) > + return 0; I'd like to avoid even this simple check since CPU hotplug could be heavily used for power management. > + > + irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER); > + if (!irq_stack_page) { > + pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n", > + smp_processor_id(), cpu); > + return -ENOMEM; > + } > + irq_stack = page_address(irq_stack_page); > + > + per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack > + + THREAD_START_SP; > + > + return 0; > +} > + > +/* > + * Copy struct thread_info between two stacks, and update current->stack. > + * This is used when moving to the irq exception stack. > + * Changing current->stack is necessary so that non-arch thread_info writers > + * don't use the new thread_info->task->stack to find the old thread_info when > + * setting flags like TIF_NEED_RESCHED... > + */ > +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp) > +{ > + struct thread_info *src = get_thread_info(src_sp); > + struct thread_info *dst = get_thread_info(dst_sp); > + > + if (dst == src) > + return 0; > + > + *dst = *src; > + current->stack = dst; > + > + return 1; > +} Although this is 32-byte memcpy, sizeof(struct thread_info), this function is called twice to handle a single interrupt. Isn's it too expensive? This is a major difference between my approach and this patch. I think we should get some feedbacks on struct thread_info information management for v2 preparation. Best Regards Jungseok Lee
On Sep 8, 2015, at 10:45 AM, AKASHI Takahiro wrote: > Jungseok, Hi Akashi, > On 09/08/2015 01:34 AM, Jungseok Lee wrote: >> On Sep 8, 2015, at 1:06 AM, James Morse wrote: >>> On 07/09/15 16:48, Jungseok Lee wrote: >>>> On Sep 7, 2015, at 11:36 PM, James Morse wrote: >>>> >>>> Hi James, >>>> >>>>> Having to handle interrupts on top of an existing kernel stack means the >>>>> kernel stack must be large enough to accomodate both the maximum kernel >>>>> usage, and the maximum irq handler usage. Switching to a different stack >>>>> when processing irqs allows us to make the stack size smaller. >>>>> >>>>> Maximum kernel stack usage (running ltp and generating usb+ethernet >>>>> interrupts) was 7256 bytes. With this patch, the same workload gives >>>>> a maximum stack usage of 5816 bytes. >>>> >>>> I'd like to know how to measure the max stack depth. >>>> AFAIK, a stack tracer on ftrace does not work well. Did you dump a stack >>>> region and find or track down an untouched region? >>> >>> I enabled the 'Trace max stack' option under menuconfig 'Kernel Hacking' -> >>> 'Tracers', then looked in debugfs:/tracing/stack_max_size. >>> >>> What problems did you encounter? >>> (I may be missing something…) >> >> When I enabled the feature, all entries had *0* size except the last entry. >> It can be reproduced easily as looking in debugs:/tracing/stack_trace. > > I'm afraid that you have not applied one of patches in my RFC: > http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/355919.html My description is not clear. Whenever I use a stack tracer, I always put your RFC and Steve's stack tracer fix on top of my tree. Best Regards Jungseok Lee
On 08/09/15 15:54, Jungseok Lee wrote: > On Sep 7, 2015, at 11:36 PM, James Morse wrote: > > Hi James, > >> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S >> index e16351819fed..d42371f3f5a1 100644 >> --- a/arch/arm64/kernel/entry.S >> +++ b/arch/arm64/kernel/entry.S >> @@ -190,10 +190,37 @@ tsk .req x28 // current thread_info >> * Interrupt handling. >> */ >> .macro irq_handler >> - adrp x1, handle_arch_irq >> - ldr x1, [x1, #:lo12:handle_arch_irq] >> - mov x0, sp >> + mrs x21, tpidr_el1 >> + adr_l x20, irq_sp >> + add x20, x20, x21 >> + >> + ldr x21, [x20] >> + mov x20, sp >> + >> + mov x0, x21 >> + mov x1, x20 >> + bl irq_copy_thread_info >> + >> + /* test for recursive use of irq_sp */ >> + cbz w0, 1f >> + mrs x30, elr_el1 >> + mov sp, x21 >> + >> + /* >> + * Create a fake stack frame to bump unwind_frame() onto the original >> + * stack. This relies on x29 not being clobbered by kernel_entry(). >> + */ >> + push x29, x30 > > It is the most challenging item to handle a frame pointer in this context. > > As I mentioned previously, a stack tracer on ftrace is not supported yet. > How about decoupling "IRQ Stack feature" from "Stack Tracer Fix"? Yes - I discovered today this was more complicated than I thought. I will need to do some more reading. >> + >> +1: ldr_l x1, handle_arch_irq >> + mov x0, x20 >> blr x1 >> + >> + mov x0, x20 >> + mov x1, x21 >> + bl irq_copy_thread_info >> + mov sp, x20 >> + >> .endm >> >> .text >> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c >> index 463fa2e7e34c..10b57a006da8 100644 >> --- a/arch/arm64/kernel/irq.c >> +++ b/arch/arm64/kernel/irq.c >> @@ -26,11 +26,14 @@ >> #include <linux/smp.h> >> #include <linux/init.h> >> #include <linux/irqchip.h> >> +#include <linux/percpu.h> >> #include <linux/seq_file.h> >> #include <linux/ratelimit.h> >> >> unsigned long irq_err_count; >> >> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0; >> + >> int arch_show_interrupts(struct seq_file *p, int prec) >> { >> #ifdef CONFIG_SMP >> @@ -55,6 +58,10 @@ void __init init_IRQ(void) >> irqchip_init(); >> if (!handle_arch_irq) >> panic("No interrupt controller found."); >> + >> + /* Allocate an irq stack for the boot cpu */ >> + if (alloc_irq_stack(smp_processor_id())) >> + panic("Failed to allocate irq stack for boot cpu."); >> } >> >> #ifdef CONFIG_HOTPLUG_CPU >> @@ -117,3 +124,48 @@ void migrate_irqs(void) >> local_irq_restore(flags); >> } >> #endif /* CONFIG_HOTPLUG_CPU */ >> + >> +/* Allocate an irq_stack for a cpu that is about to be brought up. */ >> +int alloc_irq_stack(unsigned int cpu) >> +{ >> + struct page *irq_stack_page; >> + union thread_union *irq_stack; >> + >> + /* reuse stack allocated previously */ >> + if (per_cpu(irq_sp, cpu)) >> + return 0; > > I'd like to avoid even this simple check since CPU hotplug could be heavily > used for power management. I don't think its a problem: __cpu_up() contains a call to wait_for_completion_timeout() (which could eventually end up in the scheduler), so I don't think it could ever be on a 'really hot' path. For really frequent hotplug-like power management, cpu_suspend() makes use of firmware support to power-off cores - from what I can see it doesn't use __cpu_up(). >> + >> + irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER); >> + if (!irq_stack_page) { >> + pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n", >> + smp_processor_id(), cpu); >> + return -ENOMEM; >> + } >> + irq_stack = page_address(irq_stack_page); >> + >> + per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack >> + + THREAD_START_SP; >> + >> + return 0; >> +} >> + >> +/* >> + * Copy struct thread_info between two stacks, and update current->stack. >> + * This is used when moving to the irq exception stack. >> + * Changing current->stack is necessary so that non-arch thread_info writers >> + * don't use the new thread_info->task->stack to find the old thread_info when >> + * setting flags like TIF_NEED_RESCHED... >> + */ >> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp) >> +{ >> + struct thread_info *src = get_thread_info(src_sp); >> + struct thread_info *dst = get_thread_info(dst_sp); >> + >> + if (dst == src) >> + return 0; >> + >> + *dst = *src; >> + current->stack = dst; >> + >> + return 1; >> +} > > Although this is 32-byte memcpy, sizeof(struct thread_info), this function is called > twice to handle a single interrupt. Isn's it too expensive? > > This is a major difference between my approach and this patch. I think we should get > some feedbacks on struct thread_info information management for v2 preparation. Agreed. The other difference, as Akashi Takahiro pointed out, is the behaviour of object_is_on_stack(). What should this return for an object on an irq stack? Thanks, James
On Sep 9, 2015, at 1:47 AM, James Morse wrote: > On 08/09/15 15:54, Jungseok Lee wrote: >> On Sep 7, 2015, at 11:36 PM, James Morse wrote: >> >> Hi James, >> >>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S >>> index e16351819fed..d42371f3f5a1 100644 >>> --- a/arch/arm64/kernel/entry.S >>> +++ b/arch/arm64/kernel/entry.S >>> @@ -190,10 +190,37 @@ tsk .req x28 // current thread_info >>> * Interrupt handling. >>> */ >>> .macro irq_handler >>> - adrp x1, handle_arch_irq >>> - ldr x1, [x1, #:lo12:handle_arch_irq] >>> - mov x0, sp >>> + mrs x21, tpidr_el1 >>> + adr_l x20, irq_sp >>> + add x20, x20, x21 >>> + >>> + ldr x21, [x20] >>> + mov x20, sp >>> + >>> + mov x0, x21 >>> + mov x1, x20 >>> + bl irq_copy_thread_info >>> + >>> + /* test for recursive use of irq_sp */ >>> + cbz w0, 1f >>> + mrs x30, elr_el1 >>> + mov sp, x21 >>> + >>> + /* >>> + * Create a fake stack frame to bump unwind_frame() onto the original >>> + * stack. This relies on x29 not being clobbered by kernel_entry(). >>> + */ >>> + push x29, x30 >> >> It is the most challenging item to handle a frame pointer in this context. >> >> As I mentioned previously, a stack tracer on ftrace is not supported yet. >> How about decoupling "IRQ Stack feature" from "Stack Tracer Fix"? > > Yes - I discovered today this was more complicated than I thought. I will > need to do some more reading. > > >>> + >>> +1: ldr_l x1, handle_arch_irq >>> + mov x0, x20 >>> blr x1 >>> + >>> + mov x0, x20 >>> + mov x1, x21 >>> + bl irq_copy_thread_info >>> + mov sp, x20 >>> + >>> .endm >>> >>> .text >>> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c >>> index 463fa2e7e34c..10b57a006da8 100644 >>> --- a/arch/arm64/kernel/irq.c >>> +++ b/arch/arm64/kernel/irq.c >>> @@ -26,11 +26,14 @@ >>> #include <linux/smp.h> >>> #include <linux/init.h> >>> #include <linux/irqchip.h> >>> +#include <linux/percpu.h> >>> #include <linux/seq_file.h> >>> #include <linux/ratelimit.h> >>> >>> unsigned long irq_err_count; >>> >>> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0; >>> + >>> int arch_show_interrupts(struct seq_file *p, int prec) >>> { >>> #ifdef CONFIG_SMP >>> @@ -55,6 +58,10 @@ void __init init_IRQ(void) >>> irqchip_init(); >>> if (!handle_arch_irq) >>> panic("No interrupt controller found."); >>> + >>> + /* Allocate an irq stack for the boot cpu */ >>> + if (alloc_irq_stack(smp_processor_id())) >>> + panic("Failed to allocate irq stack for boot cpu."); >>> } >>> >>> #ifdef CONFIG_HOTPLUG_CPU >>> @@ -117,3 +124,48 @@ void migrate_irqs(void) >>> local_irq_restore(flags); >>> } >>> #endif /* CONFIG_HOTPLUG_CPU */ >>> + >>> +/* Allocate an irq_stack for a cpu that is about to be brought up. */ >>> +int alloc_irq_stack(unsigned int cpu) >>> +{ >>> + struct page *irq_stack_page; >>> + union thread_union *irq_stack; >>> + >>> + /* reuse stack allocated previously */ >>> + if (per_cpu(irq_sp, cpu)) >>> + return 0; >> >> I'd like to avoid even this simple check since CPU hotplug could be heavily >> used for power management. > > I don't think its a problem: > __cpu_up() contains a call to wait_for_completion_timeout() (which could > eventually end up in the scheduler), so I don't think it could ever be on a > 'really hot' path. > > For really frequent hotplug-like power management, cpu_suspend() makes use > of firmware support to power-off cores - from what I can see it doesn't use > __cpu_up(). In case of some platforms, CPU hotplug is triggered via sysfs for power management based on user data. What is advantage of putting stack allocation into this path? IRQ stack allocation is an critical one-shot operation. So, there would be no issue to give this work to a booting core. >>> + >>> + irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER); >>> + if (!irq_stack_page) { >>> + pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n", >>> + smp_processor_id(), cpu); >>> + return -ENOMEM; >>> + } >>> + irq_stack = page_address(irq_stack_page); I forgot leaving a comment here. How about using __get_free_pages? It returns an address instead of page. >>> + >>> + per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack >>> + + THREAD_START_SP; >>> + >>> + return 0; >>> +} >>> + >>> +/* >>> + * Copy struct thread_info between two stacks, and update current->stack. >>> + * This is used when moving to the irq exception stack. >>> + * Changing current->stack is necessary so that non-arch thread_info writers >>> + * don't use the new thread_info->task->stack to find the old thread_info when >>> + * setting flags like TIF_NEED_RESCHED... >>> + */ >>> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp) >>> +{ >>> + struct thread_info *src = get_thread_info(src_sp); >>> + struct thread_info *dst = get_thread_info(dst_sp); >>> + >>> + if (dst == src) >>> + return 0; >>> + >>> + *dst = *src; >>> + current->stack = dst; >>> + >>> + return 1; >>> +} >> >> Although this is 32-byte memcpy, sizeof(struct thread_info), this function is called >> twice to handle a single interrupt. Isn's it too expensive? >> >> This is a major difference between my approach and this patch. I think we should get >> some feedbacks on struct thread_info information management for v2 preparation. > > Agreed. > > The other difference, as Akashi Takahiro pointed out, is the behaviour of > object_is_on_stack(). What should this return for an object on an irq stack? 0 should be returned in that case to align with x86 behavior according to check_stack() context and the commit, 81520a1b0649d0. Best Regards Jungseok Lee
On 09/09/15 14:22, Jungseok Lee wrote: > On Sep 9, 2015, at 1:47 AM, James Morse wrote: >> On 08/09/15 15:54, Jungseok Lee wrote: >>> On Sep 7, 2015, at 11:36 PM, James Morse wrote: >>>> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c >>>> index 463fa2e7e34c..10b57a006da8 100644 >>>> --- a/arch/arm64/kernel/irq.c >>>> +++ b/arch/arm64/kernel/irq.c >>>> @@ -26,11 +26,14 @@ >>>> #include <linux/smp.h> >>>> #include <linux/init.h> >>>> #include <linux/irqchip.h> >>>> +#include <linux/percpu.h> >>>> #include <linux/seq_file.h> >>>> #include <linux/ratelimit.h> >>>> >>>> unsigned long irq_err_count; >>>> >>>> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0; >>>> + >>>> int arch_show_interrupts(struct seq_file *p, int prec) >>>> { >>>> #ifdef CONFIG_SMP >>>> @@ -55,6 +58,10 @@ void __init init_IRQ(void) >>>> irqchip_init(); >>>> if (!handle_arch_irq) >>>> panic("No interrupt controller found."); >>>> + >>>> + /* Allocate an irq stack for the boot cpu */ >>>> + if (alloc_irq_stack(smp_processor_id())) >>>> + panic("Failed to allocate irq stack for boot cpu."); >>>> } >>>> >>>> #ifdef CONFIG_HOTPLUG_CPU >>>> @@ -117,3 +124,48 @@ void migrate_irqs(void) >>>> local_irq_restore(flags); >>>> } >>>> #endif /* CONFIG_HOTPLUG_CPU */ >>>> + >>>> +/* Allocate an irq_stack for a cpu that is about to be brought up. */ >>>> +int alloc_irq_stack(unsigned int cpu) >>>> +{ >>>> + struct page *irq_stack_page; >>>> + union thread_union *irq_stack; >>>> + >>>> + /* reuse stack allocated previously */ >>>> + if (per_cpu(irq_sp, cpu)) >>>> + return 0; >>> >>> I'd like to avoid even this simple check since CPU hotplug could be heavily >>> used for power management. >> >> I don't think its a problem: >> __cpu_up() contains a call to wait_for_completion_timeout() (which could >> eventually end up in the scheduler), so I don't think it could ever be on a >> 'really hot' path. >> >> For really frequent hotplug-like power management, cpu_suspend() makes use >> of firmware support to power-off cores - from what I can see it doesn't use >> __cpu_up(). > > In case of some platforms, CPU hotplug is triggered via sysfs for power management > based on user data. What is advantage of putting stack allocation into this path? It will only happen for CPUs that are brought up. > IRQ stack allocation is an critical one-shot operation. So, there would be no issue > to give this work to a booting core. I agree, but: From include/linux/cpumask.h: > * If HOTPLUG is enabled, then cpu_possible_mask is forced to have > * all NR_CPUS bits set, otherwise it is just the set of CPUs that > * ACPI reports present at boot. (This doesn't seem to happen with DT - but might with ACPI.) NR_CPUs could be much bigger than the number of cpus the system ever has. Allocating a stack for every possible cpu would waste memory. It is better to do it just-in-time, when we know the memory will be used. This already happens for the per-cpu idle task, (please check I traced these through correctly!) _cpu_up() idle_thread_get() init_idle() fork_idle() copy_process() dup_task_struct() alloc_task_struct_node() alloc_thread_info_node() arch_dup_task_struct() So plenty of memory-allocation occurs during _cpu_up(), idle_init() checks whether the idle task has already been created. >>>> + >>>> + irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER); >>>> + if (!irq_stack_page) { >>>> + pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n", >>>> + smp_processor_id(), cpu); >>>> + return -ENOMEM; >>>> + } >>>> + irq_stack = page_address(irq_stack_page); > > I forgot leaving a comment here. How about using __get_free_pages? It returns an > address instead of page. Good idea, I was paranoid about the stack not being correctly aligned (as current_thread_info() relies on this). I lifted that alloc_kmem_pages line from kernel/fork.c. >>>> + >>>> + per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack >>>> + + THREAD_START_SP; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +/* >>>> + * Copy struct thread_info between two stacks, and update current->stack. >>>> + * This is used when moving to the irq exception stack. >>>> + * Changing current->stack is necessary so that non-arch thread_info writers >>>> + * don't use the new thread_info->task->stack to find the old thread_info when >>>> + * setting flags like TIF_NEED_RESCHED... >>>> + */ >>>> +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp) >>>> +{ >>>> + struct thread_info *src = get_thread_info(src_sp); >>>> + struct thread_info *dst = get_thread_info(dst_sp); >>>> + >>>> + if (dst == src) >>>> + return 0; >>>> + >>>> + *dst = *src; >>>> + current->stack = dst; >>>> + >>>> + return 1; >>>> +} >>> >>> Although this is 32-byte memcpy, sizeof(struct thread_info), this function is called >>> twice to handle a single interrupt. Isn's it too expensive? >>> >>> This is a major difference between my approach and this patch. I think we should get >>> some feedbacks on struct thread_info information management for v2 preparation. >> >> Agreed. >> >> The other difference, as Akashi Takahiro pointed out, is the behaviour of >> object_is_on_stack(). What should this return for an object on an irq stack? > > 0 should be returned in that case to align with x86 behavior according to check_stack() > context and the commit, 81520a1b0649d0. Unless we can update ftrace on x86 too! :) Thanks for your comments, James
On Sep 10, 2015, at 3:13 AM, James Morse wrote: > On 09/09/15 14:22, Jungseok Lee wrote: >> On Sep 9, 2015, at 1:47 AM, James Morse wrote: >>> On 08/09/15 15:54, Jungseok Lee wrote: >>>> On Sep 7, 2015, at 11:36 PM, James Morse wrote: >>>>> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c >>>>> index 463fa2e7e34c..10b57a006da8 100644 >>>>> --- a/arch/arm64/kernel/irq.c >>>>> +++ b/arch/arm64/kernel/irq.c >>>>> @@ -26,11 +26,14 @@ >>>>> #include <linux/smp.h> >>>>> #include <linux/init.h> >>>>> #include <linux/irqchip.h> >>>>> +#include <linux/percpu.h> >>>>> #include <linux/seq_file.h> >>>>> #include <linux/ratelimit.h> >>>>> >>>>> unsigned long irq_err_count; >>>>> >>>>> +DEFINE_PER_CPU(unsigned long, irq_sp) = 0; >>>>> + >>>>> int arch_show_interrupts(struct seq_file *p, int prec) >>>>> { >>>>> #ifdef CONFIG_SMP >>>>> @@ -55,6 +58,10 @@ void __init init_IRQ(void) >>>>> irqchip_init(); >>>>> if (!handle_arch_irq) >>>>> panic("No interrupt controller found."); >>>>> + >>>>> + /* Allocate an irq stack for the boot cpu */ >>>>> + if (alloc_irq_stack(smp_processor_id())) >>>>> + panic("Failed to allocate irq stack for boot cpu."); >>>>> } >>>>> >>>>> #ifdef CONFIG_HOTPLUG_CPU >>>>> @@ -117,3 +124,48 @@ void migrate_irqs(void) >>>>> local_irq_restore(flags); >>>>> } >>>>> #endif /* CONFIG_HOTPLUG_CPU */ >>>>> + >>>>> +/* Allocate an irq_stack for a cpu that is about to be brought up. */ >>>>> +int alloc_irq_stack(unsigned int cpu) >>>>> +{ >>>>> + struct page *irq_stack_page; >>>>> + union thread_union *irq_stack; >>>>> + >>>>> + /* reuse stack allocated previously */ >>>>> + if (per_cpu(irq_sp, cpu)) >>>>> + return 0; >>>> >>>> I'd like to avoid even this simple check since CPU hotplug could be heavily >>>> used for power management. >>> >>> I don't think its a problem: >>> __cpu_up() contains a call to wait_for_completion_timeout() (which could >>> eventually end up in the scheduler), so I don't think it could ever be on a >>> 'really hot' path. >>> >>> For really frequent hotplug-like power management, cpu_suspend() makes use >>> of firmware support to power-off cores - from what I can see it doesn't use >>> __cpu_up(). >> >> In case of some platforms, CPU hotplug is triggered via sysfs for power management >> based on user data. What is advantage of putting stack allocation into this path? > > It will only happen for CPUs that are brought up. > > >> IRQ stack allocation is an critical one-shot operation. So, there would be no issue >> to give this work to a booting core. > > I agree, but: > > From include/linux/cpumask.h: >> * If HOTPLUG is enabled, then cpu_possible_mask is forced to have >> * all NR_CPUS bits set, otherwise it is just the set of CPUs that >> * ACPI reports present at boot. > > (This doesn't seem to happen with DT - but might with ACPI.) > > NR_CPUs could be much bigger than the number of cpus the system ever has. > Allocating a stack for every possible cpu would waste memory. It is better > to do it just-in-time, when we know the memory will be used. Frankly I've not considered that kind of system, but this feature should be supported smoothly for that system. I will move the allocation logic in v2. > This already happens for the per-cpu idle task, (please check I traced > these through correctly!) > _cpu_up() > idle_thread_get() > init_idle() > fork_idle() > copy_process() > dup_task_struct() > alloc_task_struct_node() > alloc_thread_info_node() > arch_dup_task_struct() > > So plenty of memory-allocation occurs during _cpu_up(), idle_init() checks > whether the idle task has already been created. Got it. Thanks for the feedbacks. Best Regards Jungseok Lee
diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index bbb251b14746..050d4196c736 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -2,14 +2,20 @@ #define __ASM_IRQ_H #include <linux/irqchip/arm-gic-acpi.h> +#include <linux/percpu.h> #include <asm-generic/irq.h> +#include <asm/thread_info.h> + +DECLARE_PER_CPU(unsigned long, irq_sp); struct pt_regs; extern void migrate_irqs(void); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); +extern int alloc_irq_stack(unsigned int cpu); + static inline void acpi_irq_init(void) { /* @@ -21,4 +27,10 @@ static inline void acpi_irq_init(void) } #define acpi_irq_init acpi_irq_init +static inline bool is_irq_stack(unsigned long sp) +{ + struct thread_info *ti = get_thread_info(sp); + return (get_thread_info(per_cpu(irq_sp, ti->cpu)) == ti); +} + #endif diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index dcd06d18a42a..b906254fc400 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -69,12 +69,16 @@ register unsigned long current_stack_pointer asm ("sp"); /* * how to get the thread information struct from C */ +static inline struct thread_info *get_thread_info(unsigned long sp) +{ + return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); +} + static inline struct thread_info *current_thread_info(void) __attribute_const__; static inline struct thread_info *current_thread_info(void) { - return (struct thread_info *) - (current_stack_pointer & ~(THREAD_SIZE - 1)); + return get_thread_info(current_stack_pointer); } #define thread_saved_pc(tsk) \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e16351819fed..d42371f3f5a1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -190,10 +190,37 @@ tsk .req x28 // current thread_info * Interrupt handling. */ .macro irq_handler - adrp x1, handle_arch_irq - ldr x1, [x1, #:lo12:handle_arch_irq] - mov x0, sp + mrs x21, tpidr_el1 + adr_l x20, irq_sp + add x20, x20, x21 + + ldr x21, [x20] + mov x20, sp + + mov x0, x21 + mov x1, x20 + bl irq_copy_thread_info + + /* test for recursive use of irq_sp */ + cbz w0, 1f + mrs x30, elr_el1 + mov sp, x21 + + /* + * Create a fake stack frame to bump unwind_frame() onto the original + * stack. This relies on x29 not being clobbered by kernel_entry(). + */ + push x29, x30 + +1: ldr_l x1, handle_arch_irq + mov x0, x20 blr x1 + + mov x0, x20 + mov x1, x21 + bl irq_copy_thread_info + mov sp, x20 + .endm .text diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 463fa2e7e34c..10b57a006da8 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -26,11 +26,14 @@ #include <linux/smp.h> #include <linux/init.h> #include <linux/irqchip.h> +#include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> unsigned long irq_err_count; +DEFINE_PER_CPU(unsigned long, irq_sp) = 0; + int arch_show_interrupts(struct seq_file *p, int prec) { #ifdef CONFIG_SMP @@ -55,6 +58,10 @@ void __init init_IRQ(void) irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); + + /* Allocate an irq stack for the boot cpu */ + if (alloc_irq_stack(smp_processor_id())) + panic("Failed to allocate irq stack for boot cpu."); } #ifdef CONFIG_HOTPLUG_CPU @@ -117,3 +124,48 @@ void migrate_irqs(void) local_irq_restore(flags); } #endif /* CONFIG_HOTPLUG_CPU */ + +/* Allocate an irq_stack for a cpu that is about to be brought up. */ +int alloc_irq_stack(unsigned int cpu) +{ + struct page *irq_stack_page; + union thread_union *irq_stack; + + /* reuse stack allocated previously */ + if (per_cpu(irq_sp, cpu)) + return 0; + + irq_stack_page = alloc_kmem_pages(THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!irq_stack_page) { + pr_crit("CPU%u: failed to allocate irq stack for cpu %u\n", + smp_processor_id(), cpu); + return -ENOMEM; + } + irq_stack = page_address(irq_stack_page); + + per_cpu(irq_sp, cpu) = (unsigned long)irq_stack->stack + + THREAD_START_SP; + + return 0; +} + +/* + * Copy struct thread_info between two stacks, and update current->stack. + * This is used when moving to the irq exception stack. + * Changing current->stack is necessary so that non-arch thread_info writers + * don't use the new thread_info->task->stack to find the old thread_info when + * setting flags like TIF_NEED_RESCHED... + */ +asmlinkage int irq_copy_thread_info(unsigned long dst_sp, unsigned long src_sp) +{ + struct thread_info *src = get_thread_info(src_sp); + struct thread_info *dst = get_thread_info(dst_sp); + + if (dst == src) + return 0; + + *dst = *src; + current->stack = dst; + + return 1; +} diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 50fb4696654e..5283dc5629e4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -89,6 +89,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) { int ret; + ret = alloc_irq_stack(cpu); + if (ret) + return ret; + /* * We need to tell the secondary core where to find its stack and the * page tables. diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 407991bf79f5..3d6d5b45aa4b 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,7 @@ #include <linux/sched.h> #include <linux/stacktrace.h> +#include <asm/irq.h> #include <asm/stacktrace.h> /* @@ -43,7 +44,8 @@ int notrace unwind_frame(struct stackframe *frame) low = frame->sp; high = ALIGN(low, THREAD_SIZE); - if (fp < low || fp > high - 0x18 || fp & 0xf) + if ((fp < low || fp > high - 0x18 || fp & 0xf) && + !is_irq_stack(frame->sp)) return -EINVAL; frame->sp = fp + 0x10;
Having to handle interrupts on top of an existing kernel stack means the kernel stack must be large enough to accomodate both the maximum kernel usage, and the maximum irq handler usage. Switching to a different stack when processing irqs allows us to make the stack size smaller. Maximum kernel stack usage (running ltp and generating usb+ethernet interrupts) was 7256 bytes. With this patch, the same workload gives a maximum stack usage of 5816 bytes. Signed-off-by: James Morse <james.morse@arm.com> --- arch/arm64/include/asm/irq.h | 12 +++++++++ arch/arm64/include/asm/thread_info.h | 8 ++++-- arch/arm64/kernel/entry.S | 33 ++++++++++++++++++++--- arch/arm64/kernel/irq.c | 52 ++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/smp.c | 4 +++ arch/arm64/kernel/stacktrace.c | 4 ++- 6 files changed, 107 insertions(+), 6 deletions(-)