[v2] arm64: Introduce IRQ stack

Message ID 1442155337-7020-1-git-send-email-jungseoklee85@gmail.com (mailing list archive)
State New, archived

Commit Message

Jungseok Lee Sept. 13, 2015, 2:42 p.m. UTC
Currently, kernel context and interrupts are handled using a single
kernel stack navigated by sp_el1. This forces many systems to use a
16KB stack rather than an 8KB one. Low-memory platforms naturally
suffer from memory pressure accompanied by performance degradation.

This patch addresses the issue by introducing a separate percpu IRQ
stack to handle both hard and soft interrupts, with two ground rules:

  - Utilize sp_el0 in EL1 context, which is not used currently
  - Do not complicate current_thread_info calculation

The core concept is to track struct thread_info using sp_el0 instead
of sp_el1. This approach helps arm64 align with other architectures
regarding object_is_on_stack() without additional complexity.

Cc: James Morse <james.morse@arm.com>
Signed-off-by: Jungseok Lee <jungseoklee85@gmail.com>
---
Changes since v1:
- Rebased on top of v4.3-rc1
- Removed Kconfig about IRQ stack, per James
- Used PERCPU for IRQ stack, per James
- Tried to allocate IRQ stack when CPU is about to start up, per James
- Moved sp_el0 update into kernel_entry macro, per James
- Dropped S_SP removal patch, per Mark and James

 arch/arm64/include/asm/irq.h         |  8 +++
 arch/arm64/include/asm/thread_info.h |  7 +-
 arch/arm64/kernel/asm-offsets.c      |  5 ++
 arch/arm64/kernel/entry.S            | 54 ++++++++++++++--
 arch/arm64/kernel/head.S             |  3 +
 arch/arm64/kernel/irq.c              | 21 ++++++
 arch/arm64/kernel/smp.c              |  6 ++
 7 files changed, 95 insertions(+), 9 deletions(-)

Comments

Will Deacon Sept. 16, 2015, 11:25 a.m. UTC | #1
On Sun, Sep 13, 2015 at 03:42:17PM +0100, Jungseok Lee wrote:
> Currently, kernel context and interrupts are handled using a single
> kernel stack navigated by sp_el1. This forces many systems to use
> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
> memory pressure accompanied by performance degradation.
> 
> This patch addresses the issue as introducing a separate percpu IRQ
> stack to handle both hard and soft interrupts with two ground rules:
> 
>   - Utilize sp_el0 in EL1 context, which is not used currently
>   - Do not complicate current_thread_info calculation
> 
> It is a core concept to trace struct thread_info using sp_el0 instead
> of sp_el1. This approach helps arm64 align with other architectures
> regarding object_is_on_stack() without additional complexity.
> 
> Cc: James Morse <james.morse@arm.com>
> Signed-off-by: Jungseok Lee <jungseoklee85@gmail.com>
> ---
> Changes since v1:
> - Rebased on top of v4.3-rc1
> - Removed Kconfig about IRQ stack, per James
> - Used PERCPU for IRQ stack, per James
> - Tried to allocate IRQ stack when CPU is about to start up, per James
> - Moved sp_el0 update into kernel_entry macro, per James
> - Dropped S_SP removal patch, per Mark and James
> 
>  arch/arm64/include/asm/irq.h         |  8 +++
>  arch/arm64/include/asm/thread_info.h |  7 +-
>  arch/arm64/kernel/asm-offsets.c      |  5 ++
>  arch/arm64/kernel/entry.S            | 54 ++++++++++++++--
>  arch/arm64/kernel/head.S             |  3 +
>  arch/arm64/kernel/irq.c              | 21 ++++++
>  arch/arm64/kernel/smp.c              |  6 ++
>  7 files changed, 95 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
> index bbb251b..67975a2 100644
> --- a/arch/arm64/include/asm/irq.h
> +++ b/arch/arm64/include/asm/irq.h
> @@ -5,11 +5,19 @@
>  
>  #include <asm-generic/irq.h>
>  
> +struct irq_stack {
> +	void *stack;
> +	unsigned long thread_sp;
> +	unsigned int count;
> +};
> +
>  struct pt_regs;
>  
>  extern void migrate_irqs(void);
>  extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
>  
> +extern int alloc_irq_stack(unsigned int cpu);
> +
>  static inline void acpi_irq_init(void)
>  {
>  	/*
> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> index dcd06d1..44839c0 100644
> --- a/arch/arm64/include/asm/thread_info.h
> +++ b/arch/arm64/include/asm/thread_info.h
> @@ -73,8 +73,11 @@ static inline struct thread_info *current_thread_info(void) __attribute_const__;
>  
>  static inline struct thread_info *current_thread_info(void)
>  {
> -	return (struct thread_info *)
> -		(current_stack_pointer & ~(THREAD_SIZE - 1));
> +	unsigned long sp_el0;
> +
> +	asm volatile("mrs %0, sp_el0" : "=r" (sp_el0));
> +
> +	return (struct thread_info *)(sp_el0 & ~(THREAD_SIZE - 1));

This looks like it will generate worse code than our current implementation,
thanks to the asm volatile. Maybe just add something like a global
current_stack_pointer_el0?

Will
James Morse Sept. 17, 2015, 10:33 a.m. UTC | #2
Hi Will,

On 16/09/15 12:25, Will Deacon wrote:
> On Sun, Sep 13, 2015 at 03:42:17PM +0100, Jungseok Lee wrote:
>> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
>> index dcd06d1..44839c0 100644
>> --- a/arch/arm64/include/asm/thread_info.h
>> +++ b/arch/arm64/include/asm/thread_info.h
>> @@ -73,8 +73,11 @@ static inline struct thread_info *current_thread_info(void) __attribute_const__;
>>  
>>  static inline struct thread_info *current_thread_info(void)
>>  {
>> -	return (struct thread_info *)
>> -		(current_stack_pointer & ~(THREAD_SIZE - 1));
>> +	unsigned long sp_el0;
>> +
>> +	asm volatile("mrs %0, sp_el0" : "=r" (sp_el0));
>> +
>> +	return (struct thread_info *)(sp_el0 & ~(THREAD_SIZE - 1));
> 
> This looks like it will generate worse code than our current implementation,
> thanks to the asm volatile. Maybe just add something like a global
> current_stack_pointer_el0?

Like current_stack_pointer does?:
> register unsigned long current_stack_pointer_el0 asm ("sp_el0");

Unfortunately the compiler won't accept this, as it doesn't like the
register name; it also won't accept instructions in this asm string.

Dropping the 'volatile' has the desired effect[0]. This would only cause a
problem over a call to cpu_switch_to(), which writes to sp_el0, but that also
saves/restores the callee-saved registers, so they will always be consistent.


James




[0] A fictitious example printk:
> printk("%p%p%u%p", get_fs(), current_thread_info(),
>        smp_processor_id(), current);

With this patch compiles to:
 5f8:   d5384101        mrs     x1, sp_el0
 5fc:   d5384100        mrs     x0, sp_el0
 600:   d5384103        mrs     x3, sp_el0
 604:   d5384104        mrs     x4, sp_el0
 608:   9272c484        and     x4, x4, #0xffffffffffffc000
 60c:   9272c463        and     x3, x3, #0xffffffffffffc000
 610:   9272c421        and     x1, x1, #0xffffffffffffc000
 614:   aa0403e2        mov     x2, x4
 618:   90000000        adrp    x0, 0 <do_bad>
 61c:   f9400884        ldr     x4, [x4,#16]
 620:   91000000        add     x0, x0, #0x0
 624:   b9401c63        ldr     w3, [x3,#28]
 628:   f9400421        ldr     x1, [x1,#8]
 62c:   94000000        bl      0 <printk>

Removing the volatile:
 5e4:   d5384102        mrs     x2, sp_el0
 5e8:   f9400844        ldr     x4, [x2,#16]
 5ec:   91000000        add     x0, x0, #0x0
 5f0:   b9401c43        ldr     w3, [x2,#28]
 5f4:   f9400441        ldr     x1, [x2,#8]
 5f8:   94000000        bl      0 <printk>
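
For reference, the variant without 'volatile' that produces the second
listing is just the patch's current_thread_info() with the qualifier
dropped - a minimal sketch:

static inline struct thread_info *current_thread_info(void)
{
	unsigned long sp_el0;

	/* no 'volatile': the compiler may merge repeated reads of sp_el0 */
	asm ("mrs %0, sp_el0" : "=r" (sp_el0));

	return (struct thread_info *)(sp_el0 & ~(THREAD_SIZE - 1));
}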
Catalin Marinas Sept. 17, 2015, 10:48 a.m. UTC | #3
On Thu, Sep 17, 2015 at 11:33:44AM +0100, James Morse wrote:
> Hi Will,
> 
> On 16/09/15 12:25, Will Deacon wrote:
> > On Sun, Sep 13, 2015 at 03:42:17PM +0100, Jungseok Lee wrote:
> >> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> >> index dcd06d1..44839c0 100644
> >> --- a/arch/arm64/include/asm/thread_info.h
> >> +++ b/arch/arm64/include/asm/thread_info.h
> >> @@ -73,8 +73,11 @@ static inline struct thread_info *current_thread_info(void) __attribute_const__;
> >>  
> >>  static inline struct thread_info *current_thread_info(void)
> >>  {
> >> -	return (struct thread_info *)
> >> -		(current_stack_pointer & ~(THREAD_SIZE - 1));
> >> +	unsigned long sp_el0;
> >> +
> >> +	asm volatile("mrs %0, sp_el0" : "=r" (sp_el0));
> >> +
> >> +	return (struct thread_info *)(sp_el0 & ~(THREAD_SIZE - 1));
> > 
> > This looks like it will generate worse code than our current implementation,
> > thanks to the asm volatile. Maybe just add something like a global
> > current_stack_pointer_el0?
> 
> Like current_stack_pointer does?:
> > register unsigned long current_stack_pointer_el0 asm ("sp_el0");
> 
> Unfortunately the compiler won't accept this, as it doesn't like the
> register name, it also won't accept instructions in this asm string.

But once we do SPSel = 0, can we not just use the SP register here?

(I haven't read the rest of the patch yet)
Catalin Marinas Sept. 17, 2015, 11:17 a.m. UTC | #4
On Sun, Sep 13, 2015 at 02:42:17PM +0000, Jungseok Lee wrote:
> Currently, kernel context and interrupts are handled using a single
> kernel stack navigated by sp_el1. This forces many systems to use
> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
> memory pressure accompanied by performance degradation.
> 
> This patch addresses the issue as introducing a separate percpu IRQ
> stack to handle both hard and soft interrupts with two ground rules:
> 
>   - Utilize sp_el0 in EL1 context, which is not used currently
>   - Do not complicate current_thread_info calculation
> 
> It is a core concept to trace struct thread_info using sp_el0 instead
> of sp_el1. This approach helps arm64 align with other architectures
> regarding object_is_on_stack() without additional complexity.

I'm still trying to understand how this patch works. I initially thought
that we would set SPSel = 0 while in kernel thread mode to make use of
SP_EL0 but I can't find any such code. Do you still use SP_EL1 all the
time and SP_EL0 just for temporarily saving the thread stack?
Jungseok Lee Sept. 17, 2015, 1:17 p.m. UTC | #5
On Sep 17, 2015, at 8:17 PM, Catalin Marinas wrote:

Hi Catalin,

> On Sun, Sep 13, 2015 at 02:42:17PM +0000, Jungseok Lee wrote:
>> Currently, kernel context and interrupts are handled using a single
>> kernel stack navigated by sp_el1. This forces many systems to use
>> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
>> memory pressure accompanied by performance degradation.
>> 
>> This patch addresses the issue as introducing a separate percpu IRQ
>> stack to handle both hard and soft interrupts with two ground rules:
>> 
>>  - Utilize sp_el0 in EL1 context, which is not used currently
>>  - Do not complicate current_thread_info calculation
>> 
>> It is a core concept to trace struct thread_info using sp_el0 instead
>> of sp_el1. This approach helps arm64 align with other architectures
>> regarding object_is_on_stack() without additional complexity.
> 
> I'm still trying to understand how this patch works. I initially thought
> that we would set SPSel = 0 while in kernel thread mode to make use of
> SP_EL0 but I can't find any such code. Do you still use SP_EL1 all the
> time and SP_EL0 just for temporary saving the thread stack?

Exactly.

My first approach was to set SPSel = 0 and implement EL1t Sync and IRQ.
This idea originally comes from your comment [1]. A kernel thread could
be handled easily and neatly, but it complicated current_thread_info
calculation due to a user process.

Let's assume that a kernel thread uses SP_EL0 by default. When an interrupt
comes in, a core jumps to the EL1t IRQ vector. In the case of a user process,
a CPU goes into the EL1h IRQ vector when an interrupt is raised. To handle
this scenario correctly, SPSel or spsr_el1 would have to be referenced, which
adds quite a big overhead to the current_thread_info function.

I always keep the simplicity of that function in mind. Thus, I've decided to
give up the approach.

[1] https://lkml.org/lkml/2015/5/25/454

Best Regards
Jungseok Lee
Jungseok Lee Sept. 17, 2015, 1:22 p.m. UTC | #6
On Sep 17, 2015, at 10:17 PM, Jungseok Lee wrote:
> On Sep 17, 2015, at 8:17 PM, Catalin Marinas wrote:
> 
> Hi Catalin,
> 
>> On Sun, Sep 13, 2015 at 02:42:17PM +0000, Jungseok Lee wrote:
>>> Currently, kernel context and interrupts are handled using a single
>>> kernel stack navigated by sp_el1. This forces many systems to use
>>> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
>>> memory pressure accompanied by performance degradation.
>>> 
>>> This patch addresses the issue as introducing a separate percpu IRQ
>>> stack to handle both hard and soft interrupts with two ground rules:
>>> 
>>> - Utilize sp_el0 in EL1 context, which is not used currently
>>> - Do not complicate current_thread_info calculation
>>> 
>>> It is a core concept to trace struct thread_info using sp_el0 instead
>>> of sp_el1. This approach helps arm64 align with other architectures
>>> regarding object_is_on_stack() without additional complexity.
>> 
>> I'm still trying to understand how this patch works. I initially thought
>> that we would set SPSel = 0 while in kernel thread mode to make use of
>> SP_EL0 but I can't find any such code. Do you still use SP_EL1 all the
>> time and SP_EL0 just for temporary saving the thread stack?
> 
> Exactly.
> 
> My first approach was to set SPSel = 0 and implement EL1t Sync and IRQ.
> This idea originally comes from your comment [1]. A kernel thread could
> be handled easily and neatly, but it complicated current_thread_info
> calculation due to a user process.
> 
> Let's assume that a kernel thread uses SP_EL0 by default. When an interrupt
> comes in, a core jumps to EL1t IRQ. In case of a user process, a CPU goes
> into EL1h IRQ when an interrupt raises. To handle this scenario correctly,
> SPSel or spsr_el1 should be referenced. This reaches to quite big overhead
> in current_thread_info function.

This statement is described incorrectly: in the case of a user process, a CPU
goes into the EL0 IRQ vector. In that context, another interrupt could come in,
at which point a core jumps to the EL1h IRQ vector.

My original intention was to describe this situation.

Sorry for the confusion.

Best Regards
Jungseok Lee
Catalin Marinas Sept. 17, 2015, 4:21 p.m. UTC | #7
On Thu, Sep 17, 2015 at 10:22:26PM +0900, Jungseok Lee wrote:
> On Sep 17, 2015, at 10:17 PM, Jungseok Lee wrote:
> > On Sep 17, 2015, at 8:17 PM, Catalin Marinas wrote:
> >> On Sun, Sep 13, 2015 at 02:42:17PM +0000, Jungseok Lee wrote:
> >>> Currently, kernel context and interrupts are handled using a single
> >>> kernel stack navigated by sp_el1. This forces many systems to use
> >>> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
> >>> memory pressure accompanied by performance degradation.
> >>> 
> >>> This patch addresses the issue as introducing a separate percpu IRQ
> >>> stack to handle both hard and soft interrupts with two ground rules:
> >>> 
> >>> - Utilize sp_el0 in EL1 context, which is not used currently
> >>> - Do not complicate current_thread_info calculation
> >>> 
> >>> It is a core concept to trace struct thread_info using sp_el0 instead
> >>> of sp_el1. This approach helps arm64 align with other architectures
> >>> regarding object_is_on_stack() without additional complexity.
> >> 
> >> I'm still trying to understand how this patch works. I initially thought
> >> that we would set SPSel = 0 while in kernel thread mode to make use of
> >> SP_EL0 but I can't find any such code. Do you still use SP_EL1 all the
> >> time and SP_EL0 just for temporary saving the thread stack?
> > 
> > Exactly.
> > 
> > My first approach was to set SPSel = 0 and implement EL1t Sync and IRQ.
> > This idea originally comes from your comment [1]. A kernel thread could
> > be handled easily and neatly, but it complicated current_thread_info
> > calculation due to a user process.
> > 
> > Let's assume that a kernel thread uses SP_EL0 by default. When an interrupt
> > comes in, a core jumps to EL1t IRQ. In case of a user process, a CPU goes
> > into EL1h IRQ when an interrupt raises. To handle this scenario correctly,
> > SPSel or spsr_el1 should be referenced. This reaches to quite big overhead
> > in current_thread_info function.
> 
> This statement is described incorrectly. In case of user process, a CPU goes
> into EL0 IRQ. Under this context, another interrupt could come in. At this
> time, a core jumps to EL1h IRQ.

I don't entirely follow you here.

First of all, we don't allow re-entrant IRQs; they are disabled during
handling (there are patches for NMI via IRQ priorities but these would
be a special case on a different code path; for the current code, let's
just assume that IRQs are not re-entrant).

Second, SPSel is automatically set to 1 when taking an exception. So we
are guaranteed that the kernel entry code always switches to SP_EL1
(EL1h mode).

My initial thought was to populate SP_EL1 per CPU as a handler stack and
never change it afterwards. The entry code may continue to use SP_EL1 if
in interrupt or switch to SP_EL0 and SPSel = 0 if in thread context.
What I didn't realise is that SP_EL0 cannot be accessed directly when
SPSel == 0, only as SP. This indeed complicates current_thread_info
slightly.

I did some tests with using SPSel in current_thread_info() to read SP or
SP_EL0 and it doesn't look good, it increased the .text section by 132KB
(I may have been able to optimise it a bit but it is still quite large).
With your approach to always use sp_el0, I get about 4KB larger .text
section.

So, without any better suggestion for current_thread_info(), I'm giving
up the idea of using SPSel == 0 in the kernel. I'll look at your patch
in more detail. BTW, I don't think we need any count for the irq
stack as we don't re-enter the same IRQ stack.
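
A hypothetical sketch (not the code Catalin benchmarked) of what a
SPSel-aware lookup would have to do at every call site, which is where
the extra .text comes from:

/* Illustration only: with SPSel == 0 used for thread context, the live
 * stack pointer has to be picked before masking it. */
static inline struct thread_info *current_thread_info(void)
{
	unsigned long sp, spsel;

	asm ("mrs %0, spsel" : "=r" (spsel));
	if (spsel)
		/* on the SP_EL1 handler stack; the thread SP is still in sp_el0 */
		asm ("mrs %0, sp_el0" : "=r" (sp));
	else
		/* SPSel == 0: sp_el0 is only reachable as SP itself */
		asm ("mov %0, sp" : "=r" (sp));

	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}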
Jungseok Lee Sept. 18, 2015, 12:57 p.m. UTC | #8
On Sep 18, 2015, at 1:21 AM, Catalin Marinas wrote:
> On Thu, Sep 17, 2015 at 10:22:26PM +0900, Jungseok Lee wrote:
>> On Sep 17, 2015, at 10:17 PM, Jungseok Lee wrote:
>>> On Sep 17, 2015, at 8:17 PM, Catalin Marinas wrote:
>>>> On Sun, Sep 13, 2015 at 02:42:17PM +0000, Jungseok Lee wrote:
>>>>> Currently, kernel context and interrupts are handled using a single
>>>>> kernel stack navigated by sp_el1. This forces many systems to use
>>>>> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
>>>>> memory pressure accompanied by performance degradation.
>>>>> 
>>>>> This patch addresses the issue as introducing a separate percpu IRQ
>>>>> stack to handle both hard and soft interrupts with two ground rules:
>>>>> 
>>>>> - Utilize sp_el0 in EL1 context, which is not used currently
>>>>> - Do not complicate current_thread_info calculation
>>>>> 
>>>>> It is a core concept to trace struct thread_info using sp_el0 instead
>>>>> of sp_el1. This approach helps arm64 align with other architectures
>>>>> regarding object_is_on_stack() without additional complexity.
>>>> 
>>>> I'm still trying to understand how this patch works. I initially thought
>>>> that we would set SPSel = 0 while in kernel thread mode to make use of
>>>> SP_EL0 but I can't find any such code. Do you still use SP_EL1 all the
>>>> time and SP_EL0 just for temporary saving the thread stack?
>>> 
>>> Exactly.
>>> 
>>> My first approach was to set SPSel = 0 and implement EL1t Sync and IRQ.
>>> This idea originally comes from your comment [1]. A kernel thread could
>>> be handled easily and neatly, but it complicated current_thread_info
>>> calculation due to a user process.
>>> 
>>> Let's assume that a kernel thread uses SP_EL0 by default. When an interrupt
>>> comes in, a core jumps to EL1t IRQ. In case of a user process, a CPU goes
>>> into EL1h IRQ when an interrupt raises. To handle this scenario correctly,
>>> SPSel or spsr_el1 should be referenced. This reaches to quite big overhead
>>> in current_thread_info function.
>> 
>> This statement is described incorrectly. In case of user process, a CPU goes
>> into EL0 IRQ. Under this context, another interrupt could come in. At this
>> time, a core jumps to EL1h IRQ.
> 
> I don't I entirely follow you here.

My description was not clear enough. It's my fault.

> First of all, we don't allow re-entrant IRQs, they are disabled during
> handling (there are patches for NMI via IRQ priorities but these would
> be a special case on a different code path; for the current code, let's
> just assume that IRQs are not re-entrant).
> 
> Second, SPSel is automatically set to 1 when taking an exception. So we
> are guaranteed that the kernel entry code always switches to SP_EL1
> (EL1h mode).
> 
> My initial thought was to populate SP_EL1 per CPU as a handler stack and
> never change it afterwards. The entry code may continue to use SP_EL1 if
> in interrupt or switch to SP_EL0 and SPSel = 0 if in thread context.
> What I didn't realise is that SP_EL0 cannot be accessed directly when
> SPSel == 0, only as SP. This indeed complicates current_thread_info
> slightly.
> 
> I did some tests with using SPSel in current_thread_info() to read SP or
> SP_EL0 and it doesn't look good, it increased the .text section by 132KB
> (I may have been able to optimise it a bit but it is still quite large).
> With your approach to always use sp_el0, I get about 4KB larger .text
> section.
> 
> So, without any better suggestion for current_thread_info(), I'm giving
> up the idea of using SPSel == 0 in the kernel. I'll look at your patch
> in more detail. BTW, I don't think we need the any count for the irq
> stack as we don't re-enter the same IRQ stack.

Another interrupt could come in, since IRQs are enabled when handling softirqs,
according to the following information, which is self-evident.

(Am I missing something?)

1) kernel/softirq.c

asmlinkage __visible void __do_softirq(void)
{
        unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
        unsigned long old_flags = current->flags;
        int max_restart = MAX_SOFTIRQ_RESTART;
        struct softirq_action *h;
        bool in_hardirq;
        __u32 pending;
        int softirq_bit;

        /*  
         * Mask out PF_MEMALLOC s current task context is borrowed for the
         * softirq. A softirq handled such as network RX might set PF_MEMALLOC
         * again if the socket is related to swap
         */
        current->flags &= ~PF_MEMALLOC;

        pending = local_softirq_pending();
        account_irq_enter_time(current);

        __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
        in_hardirq = lockdep_softirq_start();

restart:
        /* Reset the pending bitmask before enabling irqs */
        set_softirq_pending(0);

        local_irq_enable();

2) Stack tracer on ftrace

        Depth    Size   Location    (49 entries)
        -----    ----   --------
  0)     4456      16   arch_counter_read+0xc/0x24
  1)     4440      16   ktime_get+0x44/0xb4
  2)     4424      48   get_drm_timestamp+0x30/0x40
  3)     4376      16   drm_get_last_vbltimestamp+0x94/0xb4
  4)     4360      80   drm_handle_vblank+0x84/0x3c0
  5)     4280     144   mdp5_irq+0x118/0x130
  6)     4136      80   msm_irq+0x2c/0x68
  7)     4056      32   handle_irq_event_percpu+0x60/0x210
  8)     4024      96   handle_irq_event+0x50/0x80
  9)     3928      64   handle_fasteoi_irq+0xb0/0x178
 10)     3864      48   generic_handle_irq+0x38/0x54
 11)     3816      32   __handle_domain_irq+0x68/0xbc
 12)     3784      64   gic_handle_irq+0x38/0x88
 13)     3720     280   el1_irq+0x64/0xd8
 14)     3440     168   ftrace_ops_no_ops+0xb4/0x16c
 15)     3272      64   ftrace_call+0x0/0x4
 16)     3208      16   _raw_spin_lock_irqsave+0x14/0x70
 17)     3192      32   msm_gpio_set+0x44/0xb4
 18)     3160      48   _gpiod_set_raw_value+0x68/0x148
 19)     3112      64   gpiod_set_value+0x40/0x70
 20)     3048      32   gpio_led_set+0x3c/0x94
 21)     3016      48   led_set_brightness+0x50/0xa4
 22)     2968      32   led_trigger_event+0x4c/0x78
 23)     2936      48   mmc_request_done+0x38/0x84
 24)     2888      32   sdhci_tasklet_finish+0xcc/0x12c
 25)     2856      48   tasklet_action+0x64/0x120
 26)     2808      48   __do_softirq+0x114/0x2f0
 27)     2760     128   irq_exit+0x98/0xd8
 28)     2632      32   __handle_domain_irq+0x6c/0xbc
 29)     2600      64   gic_handle_irq+0x38/0x88
 30)     2536     280   el1_irq+0x64/0xd8
 31)     2256     168   ftrace_ops_no_ops+0xb4/0x16c
 32)     2088      64   ftrace_call+0x0/0x4
 33)     2024      16   __schedule+0x1c/0x748
 34)     2008      80   schedule+0x38/0x94
 35)     1928      32   schedule_timeout+0x1a8/0x200
 36)     1896     128   wait_for_common+0xa8/0x150
 37)     1768     128   wait_for_completion+0x24/0x34
 38)     1640      32   mmc_wait_for_req_done+0x3c/0x104
 39)     1608      64   mmc_wait_for_cmd+0x68/0x94
 40)     1544     128   get_card_status.isra.25+0x6c/0x88
 41)     1416     112   card_busy_detect.isra.31+0x7c/0x154
 42)     1304     128   mmc_blk_err_check+0xd0/0x4f8
 43)     1176     192   mmc_start_req+0xe4/0x3a8
 44)      984     160   mmc_blk_issue_rw_rq+0xc4/0x9c0
 45)      824     176   mmc_blk_issue_rq+0x19c/0x450
 46)      648     112   mmc_queue_thread+0x134/0x17c
 47)      536      80   kthread+0xe0/0xf8
 48)      456     456   ret_from_fork+0xc/0x50

In my first approach using SPSel = 0, the current_thread_info function was
inefficient in order to handle this case correctly.

BTW, in this context, it is only meaningful to decide whether the current
interrupt is re-entrant or not. Its actual value is not important, but I could
not figure out a better implementation than this one yet. Any suggestions are
welcome!

Best Regards
Jungseok Lee
James Morse Sept. 18, 2015, 1:44 p.m. UTC | #9
On 18/09/15 13:57, Jungseok Lee wrote:
> On Sep 18, 2015, at 1:21 AM, Catalin Marinas wrote:
>> in more detail. BTW, I don't think we need the any count for the irq
>> stack as we don't re-enter the same IRQ stack.
> 
> Another interrupt could come in since IRQ is enabled when handling softirq
> according to the following information which are self-evident.
> 
> (Am I missing something?)
> 
> 1) kernel/softirq.c
> 
> asmlinkage __visible void __do_softirq(void)
> {
>         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
>         unsigned long old_flags = current->flags;
>         int max_restart = MAX_SOFTIRQ_RESTART;
>         struct softirq_action *h;
>         bool in_hardirq;
>         __u32 pending;
>         int softirq_bit;
> 
>         /*  
>          * Mask out PF_MEMALLOC s current task context is borrowed for the
>          * softirq. A softirq handled such as network RX might set PF_MEMALLOC
>          * again if the socket is related to swap
>          */
>         current->flags &= ~PF_MEMALLOC;
> 
>         pending = local_softirq_pending();
>         account_irq_enter_time(current);
> 
>         __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
>         in_hardirq = lockdep_softirq_start();
> 
> restart:
>         /* Reset the pending bitmask before enabling irqs */
>         set_softirq_pending(0);
> 
>         local_irq_enable();

This call into __do_softirq() should be prevented by
kernel/softirq.c:irq_exit():
> preempt_count_sub(HARDIRQ_OFFSET);
> if (!in_interrupt() && local_softirq_pending())
>      invoke_softirq();

in_interrupt() pulls preempt_count out of thread_info and masks it with
(SOFTIRQ_MASK | HARDIRQ_MASK | NMI_MASK). This value is zero due to the
preempt_count_sub() immediately before.

preempt_count_add(HARDIRQ_OFFSET) is called in __irq_enter(), so it's not
unreasonable that irq_exit() decrements it here - but it falls to zero,
causing softirqs to be handled during interrupt handling.


Despite the '!in_interrupt()', it looks like this is entirely intentional,
from invoke_softirq():
/*
 * We can safely execute softirq on the current stack if
 * it is the irq stack, because it should be near empty
 * at this stage.
 */


x86 has an additional preempt_count_{add,sub}(HARDIRQ_OFFSET) in
ist_{enter,exit}(), which would prevent this. They call this for
double_faults and debug-exceptions.

It looks like we need a 'preempt_count_add(HARDIRQ_OFFSET)' in el1_irq()
to prevent the fall-to-zero, and with it the recursive use of the irq stack -
alternatively I have a smaller set of asm for irq_stack_entry() which keeps
the status quo.
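
As a C-level illustration of that first option (a hypothetical helper,
roughly what x86 does in ist_enter()/ist_exit(); el1_irq itself is
assembly):

/* Hypothetical sketch only: holding an extra HARDIRQ_OFFSET across the
 * handler keeps in_interrupt() non-zero when irq_exit() runs, so
 * invoke_softirq() is skipped and __do_softirq() cannot re-enter the
 * IRQ stack from here. */
static void handle_irq_on_irq_stack(struct pt_regs *regs)
{
	preempt_count_add(HARDIRQ_OFFSET);
	handle_arch_irq(regs);
	preempt_count_sub(HARDIRQ_OFFSET);
}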



James
James Morse Sept. 18, 2015, 1:46 p.m. UTC | #10
Hi Jungseok Lee,

I gave this a go on a Juno board, while generating usb/network interrupts:

Tested-by: James Morse <james.morse@arm.com>


On 13/09/15 15:42, Jungseok Lee wrote:
> Currently, kernel context and interrupts are handled using a single
> kernel stack navigated by sp_el1. This forces many systems to use
> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
> memory pressure accompanied by performance degradation.
> 
> This patch addresses the issue as introducing a separate percpu IRQ
> stack to handle both hard and soft interrupts with two ground rules:
> 
>   - Utilize sp_el0 in EL1 context, which is not used currently
>   - Do not complicate current_thread_info calculation
> 
> It is a core concept to trace struct thread_info using sp_el0 instead
> of sp_el1. This approach helps arm64 align with other architectures
> regarding object_is_on_stack() without additional complexity.

I think you are missing a 'mov <reg>, sp; msr sp_el0, <reg>' in
kernel/sleep.S:cpu_resume():175. This code finds the saved stack pointer
from 'sleep_save_sp', and is called when the cpu wakes up from suspend.

It didn't show up in testing, because the wake-up is always under the idle
task, which evidently doesn't call current_thread_info() after wake-up.


> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 4306c93..c156540 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -88,7 +88,7 @@
>  
>  	.if	\el == 0
>  	mrs	x21, sp_el0
> -	get_thread_info tsk			// Ensure MDSCR_EL1.SS is clear,
> +	get_thread_info \el, tsk		// Ensure MDSCR_EL1.SS is clear,
>  	ldr	x19, [tsk, #TI_FLAGS]		// since we can unmask debug
>  	disable_step_tsk x19, x20		// exceptions when scheduling.
>  	.else
> @@ -105,6 +105,8 @@
>  	.if	\el == 0
>  	mvn	x21, xzr
>  	str	x21, [sp, #S_SYSCALLNO]
> +	mov	x25, sp
> +	msr	sp_el0, x25
>  	.endif
>  
>  	/*
> @@ -163,9 +165,45 @@ alternative_endif
>  	eret					// return to kernel
>  	.endm
>  
> -	.macro	get_thread_info, rd
> +	.macro	get_thread_info, el, rd
> +	.if	\el == 0

Why does \el matter here?
If \el==0, we interrupted an el0 thread, and set sp_el0 in kernel_entry()
to the el1 stack.
If \el==1, we interrupted an el1 thread, didn't overwrite its sp_el0, so
sp_el0 & ~(THREAD_SIZE-1) will give us the struct thread_info of the
interrupted task.

So either way, sp_el0 is correct...


>  	mov	\rd, sp
> -	and	\rd, \rd, #~(THREAD_SIZE - 1)	// top of stack
> +	.else
> +	mrs	\rd, sp_el0
> +	.endif
> +	and	\rd, \rd, #~(THREAD_SIZE - 1)	// bottom of thread stack
> +	.endm
> +
> +	.macro	get_irq_stack
> +	adr_l	x21, irq_stacks
> +	mrs	x22, tpidr_el1
> +	add	x21, x21, x22
> +	.endm
> +
> +	.macro	irq_stack_entry
> +	get_irq_stack
> +	ldr	w23, [x21, #IRQ_COUNT]
> +	cbnz	w23, 1f				// check irq recursion
> +	mov	x23, sp
> +	str	x23, [x21, #IRQ_THREAD_SP]
> +	ldr	x23, [x21, #IRQ_STACK]
> +	mov	sp, x23
> +	mov	x23, xzr
> +1:	add	w23, w23, #1
> +	str	w23, [x21, #IRQ_COUNT]

A (largely untested) example for the 'compare the high-order bits' way of
doing this:

	.macro	irq_stack_entry
	get_irq_stack
	ldr	x22, [x21, #IRQ_STACK]
	and	x23, x22, #~(THREAD_SIZE -1)
	mov	x24, sp
	and	x24, x24, #~(THREAD_SIZE -1)
	cmp	x23, x24	// irq_recursion?
	mov	x24, sp
	csel	x23, x24, x22, eq
	mov	sp, x23
	.endm

	/* preserve x24 between irq_stack_entry/irq_stack_exit */

	.macro	irq_stack_exit
	mov	sp, x24
	.endm

This would let you remove IRQ_COUNT and IRQ_THREAD_SP, and avoid the two
stores and a conditional-branch in irq_stack_entry/irq_stack_exit.

Thoughts?


> +	.endm
> +
> +	.macro	irq_stack_exit
> +	get_irq_stack
> +	ldr	w23, [x21, #IRQ_COUNT]
> +	sub	w23, w23, #1
> +	cbnz	w23, 1f				// check irq recursion
> +	mov	x23, sp
> +	str	x23, [x21, #IRQ_STACK]
> +	ldr	x23, [x21, #IRQ_THREAD_SP]
> +	mov	sp, x23
> +	mov	x23, xzr
> +1:	str	w23, [x21, #IRQ_COUNT]
>  	.endm
>  
>  /*
> @@ -183,10 +221,11 @@ tsk	.req	x28		// current thread_info
>   * Interrupt handling.
>   */
>  	.macro	irq_handler
> -	adrp	x1, handle_arch_irq
> -	ldr	x1, [x1, #:lo12:handle_arch_irq]
> +	ldr_l	x1, handle_arch_irq
>  	mov	x0, sp
> +	irq_stack_entry
>  	blr	x1
> +	irq_stack_exit
>  	.endm
>  
>  	.text
> @@ -361,7 +400,7 @@ el1_irq:
>  	irq_handler
>  
>  #ifdef CONFIG_PREEMPT
> -	get_thread_info tsk
> +	get_thread_info 1, tsk
>  	ldr	w24, [tsk, #TI_PREEMPT]		// get preempt count
>  	cbnz	w24, 1f				// preempt count != 0
>  	ldr	x0, [tsk, #TI_FLAGS]		// get flags
> @@ -597,6 +636,7 @@ ENTRY(cpu_switch_to)
>  	ldp	x29, x9, [x8], #16
>  	ldr	lr, [x8]
>  	mov	sp, x9
> +	msr	sp_el0, x9
>  	ret
>  ENDPROC(cpu_switch_to)
>  
> @@ -655,7 +695,7 @@ ENTRY(ret_from_fork)
>  	cbz	x19, 1f				// not a kernel thread
>  	mov	x0, x20
>  	blr	x19
> -1:	get_thread_info tsk
> +1:	get_thread_info 1, tsk
>  	b	ret_to_user
>  ENDPROC(ret_from_fork)
>  
> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
> index a055be6..cb13290 100644
> --- a/arch/arm64/kernel/head.S
> +++ b/arch/arm64/kernel/head.S
> @@ -441,6 +441,8 @@ __mmap_switched:
>  	b	1b
>  2:
>  	adr_l	sp, initial_sp, x4
> +	mov	x4, sp

There should probably be a comment explaining why sp_el0 is being set (for the
changes outside entry.S). Something like:

	msr	sp_el0, x4		// stash struct thread_info


> +	msr	sp_el0, x4
>  	str_l	x21, __fdt_pointer, x5		// Save FDT pointer
>  	str_l	x24, memstart_addr, x6		// Save PHYS_OFFSET
>  	mov	x29, #0
> @@ -613,6 +615,7 @@ ENDPROC(secondary_startup)
>  ENTRY(__secondary_switched)
>  	ldr	x0, [x21]			// get secondary_data.stack
>  	mov	sp, x0
> +	msr	sp_el0, x0
>  	mov	x29, #0
>  	b	secondary_start_kernel
>  ENDPROC(__secondary_switched)


Thanks,

James
Catalin Marinas Sept. 18, 2015, 3:03 p.m. UTC | #11
On Fri, Sep 18, 2015 at 09:57:56PM +0900, Jungseok Lee wrote:
> On Sep 18, 2015, at 1:21 AM, Catalin Marinas wrote:
> > So, without any better suggestion for current_thread_info(), I'm giving
> > up the idea of using SPSel == 0 in the kernel. I'll look at your patch
> > in more detail. BTW, I don't think we need the any count for the irq
> > stack as we don't re-enter the same IRQ stack.
> 
> Another interrupt could come in since IRQ is enabled when handling softirq
> according to the following information which are self-evident.
> 
> (Am I missing something?)

No. I had the wrong impression that we switch to the softirqd stack for
softirqs but you are right, if we run them on the same stack the IRQs
are enabled and they can be re-entered before we return from this
exception handler.

I've seen other architectures implementing a dedicated softirq stack but
for now we should just re-use the current IRQ stack.

> In my first approach using SPSel = 0, current_thread_info function was inefficient
> in order to handle this case correctly.

I agree. And we don't have any other scratch register left as we use
tpidr_el1 for per-cpu areas.

> BTW, in this context, it is only meaningful to decide whether a current interrupt
> is re-enterrant or not. Its actual value is not important, but I could not figure
> out a better implementation than this one yet. Any suggestions are welcome!

James' idea of checking the lower SP bits instead of a count may work.
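
In C terms, James' check amounts to something like the sketch below
(illustrative only; the real thing stays in the entry.S macros and the
on_irq_stack name is made up here):

/* Re-entry is detected by comparing the top bits of the current SP with
 * this CPU's IRQ stack, using the irq_stacks percpu variable from this
 * patch; no count needs to be stored. */
static bool on_irq_stack(unsigned long sp)
{
	unsigned long irq_sp = (unsigned long)__this_cpu_read(irq_stacks.stack);

	return (sp & ~(THREAD_SIZE - 1)) == (irq_sp & ~(THREAD_SIZE - 1));
}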
Catalin Marinas Sept. 18, 2015, 3:31 p.m. UTC | #12
On Fri, Sep 18, 2015 at 04:03:02PM +0100, Catalin Marinas wrote:
> On Fri, Sep 18, 2015 at 09:57:56PM +0900, Jungseok Lee wrote:
> > On Sep 18, 2015, at 1:21 AM, Catalin Marinas wrote:
> > > So, without any better suggestion for current_thread_info(), I'm giving
> > > up the idea of using SPSel == 0 in the kernel. I'll look at your patch
> > > in more detail. BTW, I don't think we need the any count for the irq
> > > stack as we don't re-enter the same IRQ stack.
> > 
> > Another interrupt could come in since IRQ is enabled when handling softirq
> > according to the following information which are self-evident.
> > 
> > (Am I missing something?)
> 
> No. I had the wrong impression that we switch to the softirqd stack for
> softirqs but you are right, if we run them on the same stack the IRQs
> are enabled and they can be re-entered before we return from this
> exception handler.
> 
> I've seen other architectures implementing a dedicated softirq stack but
> for now we should just re-use the current IRQ stack.
> 
> > In my first approach using SPSel = 0, current_thread_info function was inefficient
> > in order to handle this case correctly.
> 
> I agree. And we don't have any other scratch register left as we use
> tpidr_el1 for per-cpu areas.
> 
> > BTW, in this context, it is only meaningful to decide whether a current interrupt
> > is re-enterrant or not. Its actual value is not important, but I could not figure
> > out a better implementation than this one yet. Any suggestions are welcome!
> 
> James' idea of checking the lower SP bits instead of a count may work.

Another thought (it seems that x86 does something similar): we know the
IRQ stack is not re-entered until interrupts are enabled in
__do_softirq. If we enable __ARCH_HAS_DO_SOFTIRQ, we can implement an
arm64-specific do_softirq_own_stack() which increments a counter before
calling __do_softirq. The difference from your patch is that
irq_stack_entry only reads such counter, doesn't need to write it.

Yet another idea is to reserve some space in the lower address part of
the stack with "stack type" information. It still requires another
read, so I think the x86 approach is probably better.
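
A minimal sketch of that idea, assuming the counter stays in this
patch's struct irq_stack (the hook name comes from __ARCH_HAS_DO_SOFTIRQ;
this is not code from the thread):

/* Softirqs reach __do_softirq() through this hook, so the counter is
 * only written here; elX_irq just reads it to decide whether the IRQ
 * stack is already in use. */
void do_softirq_own_stack(void)
{
	__this_cpu_inc(irq_stacks.count);
	__do_softirq();
	__this_cpu_dec(irq_stacks.count);
}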
Jungseok Lee Sept. 19, 2015, 8:20 a.m. UTC | #13
On Sep 18, 2015, at 10:46 PM, James Morse wrote:
> Hi Jungseok Lee,

Hi James Morse,

> I gave this a go on a Juno board, while generating usb/network interrupts:
> 
> Tested-by: James Morse <james.morse@arm.com>

Thanks a lot!

> On 13/09/15 15:42, Jungseok Lee wrote:
>> Currently, kernel context and interrupts are handled using a single
>> kernel stack navigated by sp_el1. This forces many systems to use
>> 16KB stack, not 8KB one. Low memory platforms naturally suffer from
>> memory pressure accompanied by performance degradation.
>> 
>> This patch addresses the issue as introducing a separate percpu IRQ
>> stack to handle both hard and soft interrupts with two ground rules:
>> 
>>  - Utilize sp_el0 in EL1 context, which is not used currently
>>  - Do not complicate current_thread_info calculation
>> 
>> It is a core concept to trace struct thread_info using sp_el0 instead
>> of sp_el1. This approach helps arm64 align with other architectures
>> regarding object_is_on_stack() without additional complexity.
> 
> I think you are missing a 'mov <reg>, sp; msr sp_el0, <reg>' in
> kernel/sleep.S:cpu_resume():175. This code finds the saved stack pointer
> from 'sleep_save_sp', and is called when the cpu wakes up from suspend.

Makes sense.

> It didn't show up in testing, because the wake-up is always under the idle
> task, which evidently doesn't call current_thread_info() after wake-up.

I've never seen any issues in suspend/resume scenarios yet, but it is
logically reasonable to update sp_el0 in the resume context.

> 
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index 4306c93..c156540 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -88,7 +88,7 @@
>> 
>> 	.if	\el == 0
>> 	mrs	x21, sp_el0
>> -	get_thread_info tsk			// Ensure MDSCR_EL1.SS is clear,
>> +	get_thread_info \el, tsk		// Ensure MDSCR_EL1.SS is clear,
>> 	ldr	x19, [tsk, #TI_FLAGS]		// since we can unmask debug
>> 	disable_step_tsk x19, x20		// exceptions when scheduling.
>> 	.else
>> @@ -105,6 +105,8 @@
>> 	.if	\el == 0
>> 	mvn	x21, xzr
>> 	str	x21, [sp, #S_SYSCALLNO]
>> +	mov	x25, sp
>> +	msr	sp_el0, x25
>> 	.endif
>> 
>> 	/*
>> @@ -163,9 +165,45 @@ alternative_endif
>> 	eret					// return to kernel
>> 	.endm
>> 
>> -	.macro	get_thread_info, rd
>> +	.macro	get_thread_info, el, rd
>> +	.if	\el == 0
> 
> Why does \el matter here?
> If \el==0, we interrupted an el0 thread, and set sp_el0 in kernel_entry()
> to the el1 stack.
> If \el==1, we interrupted an el1 thread, didn't overwrite its sp_el0, so
> sp_el0 & ~(THREAD_SIZE-1) will give us the struct thread_info of the
> interrupted task.
> 
> So either way, sp_el0 is correct…

You're right.

For the next version, I've rewritten this macro as a single line that
accesses thread_info directly via sp_el0.

> 
>> 	mov	\rd, sp
>> -	and	\rd, \rd, #~(THREAD_SIZE - 1)	// top of stack
>> +	.else
>> +	mrs	\rd, sp_el0
>> +	.endif
>> +	and	\rd, \rd, #~(THREAD_SIZE - 1)	// bottom of thread stack
>> +	.endm
>> +
>> +	.macro	get_irq_stack
>> +	adr_l	x21, irq_stacks
>> +	mrs	x22, tpidr_el1
>> +	add	x21, x21, x22
>> +	.endm
>> +
>> +	.macro	irq_stack_entry
>> +	get_irq_stack
>> +	ldr	w23, [x21, #IRQ_COUNT]
>> +	cbnz	w23, 1f				// check irq recursion
>> +	mov	x23, sp
>> +	str	x23, [x21, #IRQ_THREAD_SP]
>> +	ldr	x23, [x21, #IRQ_STACK]
>> +	mov	sp, x23
>> +	mov	x23, xzr
>> +1:	add	w23, w23, #1
>> +	str	w23, [x21, #IRQ_COUNT]
> 
> A (largely untested) example for the 'compare the high-order bits' way of
> doing this:
> 
> 	.macro	irq_stack_entry
> 	get_irq_stack
> 	ldr	x22, [x21, #IRQ_STACK]
> 	and	x23, x22, #~(THREAD_SIZE -1)
> 	mov	x24, sp
> 	and	x24, x24, #~(THREAD_SIZE -1)
> 	cmp	x23, x24	// irq_recursion?
> 	mov	x24, sp
> 	csel	x23, x24, x22, eq
> 	mov	sp, x23
> 	.endm
> 
> 	/* preserve x24 between irq_stack_entry/irq_stack_exit */
> 
> 	.macro	irq_stack_exit
> 	mov	sp, x24
> 	.endm
> 
> This would let you remove IRQ_COUNT and IRQ_THREAD_SP, and avoid the two
> stores and a conditional-branch in irq_stack_entry/irq_stack_exit.
> 
> Thoughts?

Great idea. Thanks to this change, about 20 lines can be removed.
It has worked well on my board so far.

> 
>> +	.endm
>> +
>> +	.macro	irq_stack_exit
>> +	get_irq_stack
>> +	ldr	w23, [x21, #IRQ_COUNT]
>> +	sub	w23, w23, #1
>> +	cbnz	w23, 1f				// check irq recursion
>> +	mov	x23, sp
>> +	str	x23, [x21, #IRQ_STACK]
>> +	ldr	x23, [x21, #IRQ_THREAD_SP]
>> +	mov	sp, x23
>> +	mov	x23, xzr
>> +1:	str	w23, [x21, #IRQ_COUNT]
>> 	.endm
>> 
>> /*
>> @@ -183,10 +221,11 @@ tsk	.req	x28		// current thread_info
>>  * Interrupt handling.
>>  */
>> 	.macro	irq_handler
>> -	adrp	x1, handle_arch_irq
>> -	ldr	x1, [x1, #:lo12:handle_arch_irq]
>> +	ldr_l	x1, handle_arch_irq
>> 	mov	x0, sp
>> +	irq_stack_entry
>> 	blr	x1
>> +	irq_stack_exit
>> 	.endm
>> 
>> 	.text
>> @@ -361,7 +400,7 @@ el1_irq:
>> 	irq_handler
>> 
>> #ifdef CONFIG_PREEMPT
>> -	get_thread_info tsk
>> +	get_thread_info 1, tsk
>> 	ldr	w24, [tsk, #TI_PREEMPT]		// get preempt count
>> 	cbnz	w24, 1f				// preempt count != 0
>> 	ldr	x0, [tsk, #TI_FLAGS]		// get flags
>> @@ -597,6 +636,7 @@ ENTRY(cpu_switch_to)
>> 	ldp	x29, x9, [x8], #16
>> 	ldr	lr, [x8]
>> 	mov	sp, x9
>> +	msr	sp_el0, x9
>> 	ret
>> ENDPROC(cpu_switch_to)
>> 
>> @@ -655,7 +695,7 @@ ENTRY(ret_from_fork)
>> 	cbz	x19, 1f				// not a kernel thread
>> 	mov	x0, x20
>> 	blr	x19
>> -1:	get_thread_info tsk
>> +1:	get_thread_info 1, tsk
>> 	b	ret_to_user
>> ENDPROC(ret_from_fork)
>> 
>> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
>> index a055be6..cb13290 100644
>> --- a/arch/arm64/kernel/head.S
>> +++ b/arch/arm64/kernel/head.S
>> @@ -441,6 +441,8 @@ __mmap_switched:
>> 	b	1b
>> 2:
>> 	adr_l	sp, initial_sp, x4
>> +	mov	x4, sp
> 
> There should probably a comment explaining why sp_el0 is being set (for the
> changes outside entry.S). Something like:
> 
> 	msr	sp_el0, x4		// stash struct thread_info

Agreed. I will add comments on sp_el0 across some relevant changes.

Thanks for comments!

Best Regards
Jungseok Lee
Jungseok Lee Sept. 19, 2015, 8:44 a.m. UTC | #14
On Sep 19, 2015, at 12:31 AM, Catalin Marinas wrote:
> On Fri, Sep 18, 2015 at 04:03:02PM +0100, Catalin Marinas wrote:
>> On Fri, Sep 18, 2015 at 09:57:56PM +0900, Jungseok Lee wrote:
>>> On Sep 18, 2015, at 1:21 AM, Catalin Marinas wrote:
>>>> So, without any better suggestion for current_thread_info(), I'm giving
>>>> up the idea of using SPSel == 0 in the kernel. I'll look at your patch
>>>> in more detail. BTW, I don't think we need the any count for the irq
>>>> stack as we don't re-enter the same IRQ stack.
>>> 
>>> Another interrupt could come in since IRQ is enabled when handling softirq
>>> according to the following information which are self-evident.
>>> 
>>> (Am I missing something?)
>> 
>> No. I had the wrong impression that we switch to the softirqd stack for
>> softirqs but you are right, if we run them on the same stack the IRQs
>> are enabled and they can be re-entered before we return from this
>> exception handler.
>> 
>> I've seen other architectures implementing a dedicated softirq stack but
>> for now we should just re-use the current IRQ stack.
>> 
>>> In my first approach using SPSel = 0, current_thread_info function was inefficient
>>> in order to handle this case correctly.
>> 
>> I agree. And we don't have any other scratch register left as we use
>> tpidr_el1 for per-cpu areas.
>> 
>>> BTW, in this context, it is only meaningful to decide whether a current interrupt
>>> is re-enterrant or not. Its actual value is not important, but I could not figure
>>> out a better implementation than this one yet. Any suggestions are welcome!
>> 
>> James' idea of checking the lower SP bits instead of a count may work.
> 
> Another thought (it seems that x86 does something similar): we know the
> IRQ stack is not re-entered until interrupts are enabled in
> __do_softirq. If we enable __ARCH_HAS_DO_SOFTIRQ, we can implement an
> arm64-specific do_softirq_own_stack() which increments a counter before
> calling __do_softirq. The difference from your patch is that
> irq_stack_entry only reads such counter, doesn't need to write it.
> 
> Yet another idea is to reserve some space in the lower address part of
> the stack with a "stack type" information. It still requires another
> read, so I think the x86 approach is probably better.

I've realized both hardirqs and softirqs should be handled on a separate stack
in order to reduce the kernel stack size, which is the principal objective of
this patch. (If I'm not missing something) It is not possible to get a big win
by implementing do_softirq_own_stack(), since hardirqs are handled using the
task stack. This prevents the kernel stack size from being decreased.

However, it would be meaningful to separate the hard IRQ stack and the soft
IRQ one as a next step.

Best Regards
Jungseok Lee
Catalin Marinas Sept. 21, 2015, 9:25 a.m. UTC | #15
On Sat, Sep 19, 2015 at 05:44:37PM +0900, Jungseok Lee wrote:
> On Sep 19, 2015, at 12:31 AM, Catalin Marinas wrote:
> > On Fri, Sep 18, 2015 at 04:03:02PM +0100, Catalin Marinas wrote:
> >> On Fri, Sep 18, 2015 at 09:57:56PM +0900, Jungseok Lee wrote:
> >>> On Sep 18, 2015, at 1:21 AM, Catalin Marinas wrote:
> >>>> So, without any better suggestion for current_thread_info(), I'm giving
> >>>> up the idea of using SPSel == 0 in the kernel. I'll look at your patch
> >>>> in more detail. BTW, I don't think we need the any count for the irq
> >>>> stack as we don't re-enter the same IRQ stack.
[...]
> >>> BTW, in this context, it is only meaningful to decide whether a current interrupt
> >>> is re-enterrant or not. Its actual value is not important, but I could not figure
> >>> out a better implementation than this one yet. Any suggestions are welcome!
[...]
> > Another thought (it seems that x86 does something similar): we know the
> > IRQ stack is not re-entered until interrupts are enabled in
> > __do_softirq. If we enable __ARCH_HAS_DO_SOFTIRQ, we can implement an
> > arm64-specific do_softirq_own_stack() which increments a counter before
> > calling __do_softirq. The difference from your patch is that
> > irq_stack_entry only reads such counter, doesn't need to write it.
> > 
> > Yet another idea is to reserve some space in the lower address part of
> > the stack with a "stack type" information. It still requires another
> > read, so I think the x86 approach is probably better.
> 
> I've realized both hardirq and softirq should be handled on a separate stack
> in order to reduce kernel stack size, which is a principal objective of this
> patch.

The objective is to reduce the kernel thread stack size (THREAD_SIZE).
This can get pretty deep on some syscalls and together with IRQs (hard
or soft), we run out of stack.

So, for now, just stick to reducing THREAD_SIZE by moving the IRQs off
this stack. If we later find that hardirqs + softirqs can't fit on the
same _IRQ_ stack, we could either increase it or allocate a separate
stack for softirqs. These are static anyway, allocated during boot. But I
wouldn't get distracted with separate hard and soft IRQ stacks for now;
I doubt we would see any issues (when a softirq runs, the IRQ stack is
pretty much empty, apart from the pt_regs).

> (If I'm not missing something) It is not possible to get a big win
> with implementing do_softirq_own_stack() since hardirq is handled using a task
> stack. This prevents a size of kernel stack from being decreased.

What I meant is that hard and soft IRQs both run on the IRQ stack (not
the thread stack). But instead of incrementing a counter every time you
take a hard IRQ, just increment it in do_softirq_own_stack() with a
simple read+check in elX_irq. The "own_stack" is not the most
appropriate name because we still have the same IRQ stack but I'm not
really bothered about this.

> However, it would be meaningful to separate hard IRQ stack and soft IRQ one
> as the next step.

Only if we see the IRQ stack overflowing; otherwise I don't think it's worth
the effort.
Jungseok Lee Sept. 21, 2015, 12:14 p.m. UTC | #16
On Sep 21, 2015, at 6:25 PM, Catalin Marinas wrote:
> On Sat, Sep 19, 2015 at 05:44:37PM +0900, Jungseok Lee wrote:
>> On Sep 19, 2015, at 12:31 AM, Catalin Marinas wrote:
>>> On Fri, Sep 18, 2015 at 04:03:02PM +0100, Catalin Marinas wrote:
>>>> On Fri, Sep 18, 2015 at 09:57:56PM +0900, Jungseok Lee wrote:
>>>>> On Sep 18, 2015, at 1:21 AM, Catalin Marinas wrote:
>>>>>> So, without any better suggestion for current_thread_info(), I'm giving
>>>>>> up the idea of using SPSel == 0 in the kernel. I'll look at your patch
>>>>>> in more detail. BTW, I don't think we need the any count for the irq
>>>>>> stack as we don't re-enter the same IRQ stack.
> [...]
>>>>> BTW, in this context, it is only meaningful to decide whether a current interrupt
>>>>> is re-enterrant or not. Its actual value is not important, but I could not figure
>>>>> out a better implementation than this one yet. Any suggestions are welcome!
> [...]
>>> Another thought (it seems that x86 does something similar): we know the
>>> IRQ stack is not re-entered until interrupts are enabled in
>>> __do_softirq. If we enable __ARCH_HAS_DO_SOFTIRQ, we can implement an
>>> arm64-specific do_softirq_own_stack() which increments a counter before
>>> calling __do_softirq. The difference from your patch is that
>>> irq_stack_entry only reads such counter, doesn't need to write it.
>>> 
>>> Yet another idea is to reserve some space in the lower address part of
>>> the stack with a "stack type" information. It still requires another
>>> read, so I think the x86 approach is probably better.
>> 
>> I've realized both hardirq and softirq should be handled on a separate stack
>> in order to reduce kernel stack size, which is a principal objective of this
>> patch.
> 
> The objective is to reduce the kernel thread stack size (THREAD_SIZE).
> This can get pretty deep on some syscalls and together with IRQs (hard
> or soft), we run out of stack.
> 
> So, for now, just stick to reducing THREAD_SIZE by moving the IRQs off
> this stack. If we later find that hardirqs + softirqs can't fit on the
> same _IRQ_ stack, we could either increase it or allocate separate stack
> for softirqs. These are static anyway, allocated during boot. But I
> wouldn't get distracted with separate hard and soft IRQ stacks for now,
> I doubt we would see any issues (when a softirq runs, the IRQ stack is
> pretty much empty, apart from the pt_regs).

Completely agreed.

> 
>> (If I'm not missing something) It is not possible to get a big win
>> with implementing do_softirq_own_stack() since hardirq is handled using a task
>> stack. This prevents a size of kernel stack from being decreased.
> 
> What I meant is that hard and soft IRQs both run on the IRQ stack (not
> the thread stack). But instead of incrementing a counter every time you
> take a hard IRQ, just increment it in do_softirq_own_stack() with a
> simple read+check in elX_irq. The "own_stack" is not the most
> appropriate name because we still have the same IRQ stack but I'm not
> really bothered about this.

Personally, I'm in favor of James's top-bit comparison idea since, by
utilizing one of the callee-saved registers, there is no {load|store}
operation in the irq_stack_exit macro.

I will do a re-spin soon with the changes based on the feedback in this
thread, for clarification. It would be better for tracing the change history
of this patch.

>> However, it would be meaningful to separate hard IRQ stack and soft IRQ one
>> as the next step.
> 
> Only if we see IRQ stack overflowing, otherwise I don't think it's worth
> the effort.

Okay.

Thanks for comments!

Best Regards
Jungseok Lee

Patch

diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
index bbb251b..67975a2 100644
--- a/arch/arm64/include/asm/irq.h
+++ b/arch/arm64/include/asm/irq.h
@@ -5,11 +5,19 @@ 
 
 #include <asm-generic/irq.h>
 
+struct irq_stack {
+	void *stack;
+	unsigned long thread_sp;
+	unsigned int count;
+};
+
 struct pt_regs;
 
 extern void migrate_irqs(void);
 extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
 
+extern int alloc_irq_stack(unsigned int cpu);
+
 static inline void acpi_irq_init(void)
 {
 	/*
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index dcd06d1..44839c0 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -73,8 +73,11 @@  static inline struct thread_info *current_thread_info(void) __attribute_const__;
 
 static inline struct thread_info *current_thread_info(void)
 {
-	return (struct thread_info *)
-		(current_stack_pointer & ~(THREAD_SIZE - 1));
+	unsigned long sp_el0;
+
+	asm volatile("mrs %0, sp_el0" : "=r" (sp_el0));
+
+	return (struct thread_info *)(sp_el0 & ~(THREAD_SIZE - 1));
 }
 
 #define thread_saved_pc(tsk)	\
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 8d89cf8..3bb5ce0 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -22,6 +22,7 @@ 
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
 #include <linux/kvm_host.h>
+#include <asm/irq.h>
 #include <asm/thread_info.h>
 #include <asm/memory.h>
 #include <asm/smp_plat.h>
@@ -41,6 +42,10 @@  int main(void)
   BLANK();
   DEFINE(THREAD_CPU_CONTEXT,	offsetof(struct task_struct, thread.cpu_context));
   BLANK();
+  DEFINE(IRQ_STACK,		offsetof(struct irq_stack, stack));
+  DEFINE(IRQ_THREAD_SP,		offsetof(struct irq_stack, thread_sp));
+  DEFINE(IRQ_COUNT,		offsetof(struct irq_stack, count));
+  BLANK();
   DEFINE(S_X0,			offsetof(struct pt_regs, regs[0]));
   DEFINE(S_X1,			offsetof(struct pt_regs, regs[1]));
   DEFINE(S_X2,			offsetof(struct pt_regs, regs[2]));
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 4306c93..c156540 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -88,7 +88,7 @@ 
 
 	.if	\el == 0
 	mrs	x21, sp_el0
-	get_thread_info tsk			// Ensure MDSCR_EL1.SS is clear,
+	get_thread_info \el, tsk		// Ensure MDSCR_EL1.SS is clear,
 	ldr	x19, [tsk, #TI_FLAGS]		// since we can unmask debug
 	disable_step_tsk x19, x20		// exceptions when scheduling.
 	.else
@@ -105,6 +105,8 @@ 
 	.if	\el == 0
 	mvn	x21, xzr
 	str	x21, [sp, #S_SYSCALLNO]
+	mov	x25, sp
+	msr	sp_el0, x25
 	.endif
 
 	/*
@@ -163,9 +165,45 @@  alternative_endif
 	eret					// return to kernel
 	.endm
 
-	.macro	get_thread_info, rd
+	.macro	get_thread_info, el, rd
+	.if	\el == 0
 	mov	\rd, sp
-	and	\rd, \rd, #~(THREAD_SIZE - 1)	// top of stack
+	.else
+	mrs	\rd, sp_el0
+	.endif
+	and	\rd, \rd, #~(THREAD_SIZE - 1)	// bottom of thread stack
+	.endm
+
+	.macro	get_irq_stack
+	adr_l	x21, irq_stacks
+	mrs	x22, tpidr_el1
+	add	x21, x21, x22
+	.endm
+
+	.macro	irq_stack_entry
+	get_irq_stack
+	ldr	w23, [x21, #IRQ_COUNT]
+	cbnz	w23, 1f				// check irq recursion
+	mov	x23, sp
+	str	x23, [x21, #IRQ_THREAD_SP]
+	ldr	x23, [x21, #IRQ_STACK]
+	mov	sp, x23
+	mov	x23, xzr
+1:	add	w23, w23, #1
+	str	w23, [x21, #IRQ_COUNT]
+	.endm
+
+	.macro	irq_stack_exit
+	get_irq_stack
+	ldr	w23, [x21, #IRQ_COUNT]
+	sub	w23, w23, #1
+	cbnz	w23, 1f				// check irq recursion
+	mov	x23, sp
+	str	x23, [x21, #IRQ_STACK]
+	ldr	x23, [x21, #IRQ_THREAD_SP]
+	mov	sp, x23
+	mov	x23, xzr
+1:	str	w23, [x21, #IRQ_COUNT]
 	.endm
 
 /*
@@ -183,10 +221,11 @@  tsk	.req	x28		// current thread_info
  * Interrupt handling.
  */
 	.macro	irq_handler
-	adrp	x1, handle_arch_irq
-	ldr	x1, [x1, #:lo12:handle_arch_irq]
+	ldr_l	x1, handle_arch_irq
 	mov	x0, sp
+	irq_stack_entry
 	blr	x1
+	irq_stack_exit
 	.endm
 
 	.text
@@ -361,7 +400,7 @@  el1_irq:
 	irq_handler
 
 #ifdef CONFIG_PREEMPT
-	get_thread_info tsk
+	get_thread_info 1, tsk
 	ldr	w24, [tsk, #TI_PREEMPT]		// get preempt count
 	cbnz	w24, 1f				// preempt count != 0
 	ldr	x0, [tsk, #TI_FLAGS]		// get flags
@@ -597,6 +636,7 @@  ENTRY(cpu_switch_to)
 	ldp	x29, x9, [x8], #16
 	ldr	lr, [x8]
 	mov	sp, x9
+	msr	sp_el0, x9
 	ret
 ENDPROC(cpu_switch_to)
 
@@ -655,7 +695,7 @@  ENTRY(ret_from_fork)
 	cbz	x19, 1f				// not a kernel thread
 	mov	x0, x20
 	blr	x19
-1:	get_thread_info tsk
+1:	get_thread_info 1, tsk
 	b	ret_to_user
 ENDPROC(ret_from_fork)
 
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index a055be6..cb13290 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -441,6 +441,8 @@  __mmap_switched:
 	b	1b
 2:
 	adr_l	sp, initial_sp, x4
+	mov	x4, sp
+	msr	sp_el0, x4
 	str_l	x21, __fdt_pointer, x5		// Save FDT pointer
 	str_l	x24, memstart_addr, x6		// Save PHYS_OFFSET
 	mov	x29, #0
@@ -613,6 +615,7 @@  ENDPROC(secondary_startup)
 ENTRY(__secondary_switched)
 	ldr	x0, [x21]			// get secondary_data.stack
 	mov	sp, x0
+	msr	sp_el0, x0
 	mov	x29, #0
 	b	secondary_start_kernel
 ENDPROC(__secondary_switched)
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 11dc3fd..88acb63 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -31,6 +31,8 @@ 
 
 unsigned long irq_err_count;
 
+DEFINE_PER_CPU(struct irq_stack, irq_stacks);
+
 int arch_show_interrupts(struct seq_file *p, int prec)
 {
 	show_ipi_list(p, prec);
@@ -50,6 +52,9 @@  void __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
 
 void __init init_IRQ(void)
 {
+	if (alloc_irq_stack(smp_processor_id()))
+		panic("Failed to allocate IRQ stack for boot cpu");
+
 	irqchip_init();
 	if (!handle_arch_irq)
 		panic("No interrupt controller found.");
@@ -115,3 +120,19 @@  void migrate_irqs(void)
 	local_irq_restore(flags);
 }
 #endif /* CONFIG_HOTPLUG_CPU */
+
+int alloc_irq_stack(unsigned int cpu)
+{
+	void *stack;
+
+	if (per_cpu(irq_stacks, cpu).stack)
+		return 0;
+
+	stack = (void *)__get_free_pages(THREADINFO_GFP, THREAD_SIZE_ORDER);
+	if (!stack)
+		return -ENOMEM;
+
+	per_cpu(irq_stacks, cpu).stack = stack + THREAD_START_SP;
+
+	return 0;
+}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index dbdaacd..0bd7049 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -97,6 +97,12 @@  int __cpu_up(unsigned int cpu, struct task_struct *idle)
 	secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
 	__flush_dcache_area(&secondary_data, sizeof(secondary_data));
 
+	ret = alloc_irq_stack(cpu);
+	if (ret) {
+		pr_crit("CPU%u: failed to allocate IRQ stack\n", cpu);
+		return ret;
+	}
+
 	/*
 	 * Now bring the CPU into our world.
 	 */