Message ID | 1398959381-8126-5-git-send-email-ard.biesheuvel@linaro.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, May 01, 2014 at 04:49:36PM +0100, Ard Biesheuvel wrote: > diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h > index 7a900142dbc8..05e1b24aca4c 100644 > --- a/arch/arm64/include/asm/fpsimd.h > +++ b/arch/arm64/include/asm/fpsimd.h > @@ -41,6 +41,17 @@ struct fpsimd_state { > unsigned int cpu; > }; > > +/* > + * Struct for stacking the bottom 'n' FP/SIMD registers. > + */ > +struct fpsimd_partial_state { > + u32 num_regs; > + u32 fpsr; > + u32 fpcr; > + __uint128_t vregs[32] __aligned(16); > +} __aligned(16); Do we need this explicit alignment here? > diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h > index bbec599c96bd..69e75134689d 100644 > --- a/arch/arm64/include/asm/fpsimdmacros.h > +++ b/arch/arm64/include/asm/fpsimdmacros.h > @@ -62,3 +62,38 @@ > ldr w\tmpnr, [\state, #16 * 2 + 4] > msr fpcr, x\tmpnr > .endm > + > +.altmacro > +.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 > + mrs x\tmpnr1, fpsr > + str w\numnr, [\state] > + mrs x\tmpnr2, fpcr > + stp w\tmpnr1, w\tmpnr2, [\state, #4] > + adr x\tmpnr1, 0f > + add \state, \state, x\numnr, lsl #4 > + sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 > + br x\tmpnr1 > + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 > + .irp qb, %(qa + 1) > + stp q\qa, q\qb, [\state, # -16 * \qa - 16] > + .endr > + .endr > +0: > +.endm > + > +.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 > + ldp w\tmpnr1, w\tmpnr2, [\state, #4] > + msr fpsr, x\tmpnr1 > + msr fpcr, x\tmpnr2 > + adr x\tmpnr1, 0f > + ldr w\tmpnr2, [\state] > + add \state, \state, x\tmpnr2, lsl #4 > + sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 > + br x\tmpnr1 > + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 > + .irp qb, %(qa + 1) > + ldp q\qa, q\qb, [\state, # -16 * \qa - 16] > + .endr > + .endr > +0: > +.endm BTW, it may be better if num_regs is placed at the end of the structure, especially since you use stp to store both fpsr and fpcr (though I haven't rewritten the above to see how they look).
On 6 May 2014 18:49, Catalin Marinas <catalin.marinas@arm.com> wrote: > On Thu, May 01, 2014 at 04:49:36PM +0100, Ard Biesheuvel wrote: >> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h >> index 7a900142dbc8..05e1b24aca4c 100644 >> --- a/arch/arm64/include/asm/fpsimd.h >> +++ b/arch/arm64/include/asm/fpsimd.h >> @@ -41,6 +41,17 @@ struct fpsimd_state { >> unsigned int cpu; >> }; >> >> +/* >> + * Struct for stacking the bottom 'n' FP/SIMD registers. >> + */ >> +struct fpsimd_partial_state { >> + u32 num_regs; >> + u32 fpsr; >> + u32 fpcr; >> + __uint128_t vregs[32] __aligned(16); >> +} __aligned(16); > > Do we need this explicit alignment here? > Without it, the implied alignment is 8 bytes, I suppose, but I haven't checked carefully. I will check and remove this if 8 bytes is the default. >> diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h >> index bbec599c96bd..69e75134689d 100644 >> --- a/arch/arm64/include/asm/fpsimdmacros.h >> +++ b/arch/arm64/include/asm/fpsimdmacros.h >> @@ -62,3 +62,38 @@ >> ldr w\tmpnr, [\state, #16 * 2 + 4] >> msr fpcr, x\tmpnr >> .endm >> + >> +.altmacro >> +.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 >> + mrs x\tmpnr1, fpsr >> + str w\numnr, [\state] >> + mrs x\tmpnr2, fpcr >> + stp w\tmpnr1, w\tmpnr2, [\state, #4] >> + adr x\tmpnr1, 0f >> + add \state, \state, x\numnr, lsl #4 >> + sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 >> + br x\tmpnr1 >> + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 >> + .irp qb, %(qa + 1) >> + stp q\qa, q\qb, [\state, # -16 * \qa - 16] >> + .endr >> + .endr >> +0: >> +.endm >> + >> +.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 >> + ldp w\tmpnr1, w\tmpnr2, [\state, #4] >> + msr fpsr, x\tmpnr1 >> + msr fpcr, x\tmpnr2 >> + adr x\tmpnr1, 0f >> + ldr w\tmpnr2, [\state] >> + add \state, \state, x\tmpnr2, lsl #4 >> + sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 >> + br x\tmpnr1 >> + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 >> + .irp qb, %(qa + 1) >> + ldp q\qa, q\qb, [\state, # -16 * \qa - 16] >> + .endr >> + .endr >> +0: >> +.endm > > BTW, it may be better if num_regs is placed at the end of the structure, > especially since you use stp to store both fpsr and fpcr (though I > haven't rewritten the above to see how they look). > I suppose you mean in the middle, i.e., after fpsr and fpcr? Yes that makes sense, I will change that.
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 7a900142dbc8..05e1b24aca4c 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -41,6 +41,17 @@ struct fpsimd_state { unsigned int cpu; }; +/* + * Struct for stacking the bottom 'n' FP/SIMD registers. + */ +struct fpsimd_partial_state { + u32 num_regs; + u32 fpsr; + u32 fpcr; + __uint128_t vregs[32] __aligned(16); +} __aligned(16); + + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* Masks for extracting the FPSR and FPCR from the FPSCR */ #define VFP_FPSCR_STAT_MASK 0xf800009f @@ -66,6 +77,10 @@ extern void fpsimd_update_current_state(struct fpsimd_state *state); extern void fpsimd_flush_task_state(struct task_struct *target); +extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, + u32 num_regs); +extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); + #endif #endif diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index bbec599c96bd..69e75134689d 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -62,3 +62,38 @@ ldr w\tmpnr, [\state, #16 * 2 + 4] msr fpcr, x\tmpnr .endm + +.altmacro +.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 + mrs x\tmpnr1, fpsr + str w\numnr, [\state] + mrs x\tmpnr2, fpcr + stp w\tmpnr1, w\tmpnr2, [\state, #4] + adr x\tmpnr1, 0f + add \state, \state, x\numnr, lsl #4 + sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 + br x\tmpnr1 + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 + .irp qb, %(qa + 1) + stp q\qa, q\qb, [\state, # -16 * \qa - 16] + .endr + .endr +0: +.endm + +.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 + ldp w\tmpnr1, w\tmpnr2, [\state, #4] + msr fpsr, x\tmpnr1 + msr fpcr, x\tmpnr2 + adr x\tmpnr1, 0f + ldr w\tmpnr2, [\state] + add \state, \state, x\tmpnr2, lsl #4 + sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 + br x\tmpnr1 + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 + .irp qb, %(qa + 1) + ldp q\qa, q\qb, [\state, # -16 * \qa - 16] + .endr + .endr +0: +.endm diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index b0cc58a97780..13ce4cc18e26 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -8,7 +8,11 @@ * published by the Free Software Foundation. */ +#include <linux/types.h> + #define cpu_has_neon() (1) -void kernel_neon_begin(void); +#define kernel_neon_begin() kernel_neon_begin_partial(32) + +void kernel_neon_begin_partial(u32 num_regs); void kernel_neon_end(void); diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6a27cd6dbfa6..d358ccacfc00 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state) fpsimd_restore x0, 8 ret ENDPROC(fpsimd_load_state) + +#ifdef CONFIG_KERNEL_MODE_NEON + +/* + * Save the bottom n FP registers. + * + * x0 - pointer to struct fpsimd_partial_state + */ +ENTRY(fpsimd_save_partial_state) + fpsimd_save_partial x0, 1, 8, 9 + ret +ENDPROC(fpsimd_load_partial_state) + +/* + * Load the bottom n FP registers. + * + * x0 - pointer to struct fpsimd_partial_state + */ +ENTRY(fpsimd_load_partial_state) + fpsimd_restore_partial x0, 8, 9 + ret +ENDPROC(fpsimd_load_partial_state) + +#endif diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 6cfbb4ef27d7..82ebc259a787 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -218,29 +218,45 @@ void fpsimd_flush_task_state(struct task_struct *t) #ifdef CONFIG_KERNEL_MODE_NEON +static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); +static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); + /* * Kernel-side NEON support functions */ -void kernel_neon_begin(void) +void kernel_neon_begin_partial(u32 num_regs) { - /* Avoid using the NEON in interrupt context */ - BUG_ON(in_interrupt()); - preempt_disable(); + if (in_interrupt()) { + struct fpsimd_partial_state *s = this_cpu_ptr( + in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - /* - * Save the userland FPSIMD state if we have one and if we haven't done - * so already. Clear fpsimd_last_state to indicate that there is no - * longer userland FPSIMD state in the registers. - */ - if (current->mm && !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(¤t->thread.fpsimd_state); - this_cpu_write(fpsimd_last_state, NULL); + BUG_ON(num_regs > 32); + fpsimd_save_partial_state(s, roundup(num_regs, 2)); + } else { + /* + * Save the userland FPSIMD state if we have one and if we + * haven't done so already. Clear fpsimd_last_state to indicate + * that there is no longer userland FPSIMD state in the + * registers. + */ + preempt_disable(); + if (current->mm && + !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) + fpsimd_save_state(¤t->thread.fpsimd_state); + this_cpu_write(fpsimd_last_state, NULL); + } } -EXPORT_SYMBOL(kernel_neon_begin); +EXPORT_SYMBOL(kernel_neon_begin_partial); void kernel_neon_end(void) { - preempt_enable(); + if (in_interrupt()) { + struct fpsimd_partial_state *s = this_cpu_ptr( + in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); + fpsimd_load_partial_state(s); + } else { + preempt_enable(); + } } EXPORT_SYMBOL(kernel_neon_end);
This patch modifies kernel_neon_begin() and kernel_neon_end(), so they may be called from any context. To address the case where only a couple of registers are needed, kernel_neon_begin_partial(u32) is introduced which takes as a parameter the number of bottom 'n' NEON q-registers required. To mark the end of such a partial section, the regular kernel_neon_end() should be used. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> --- arch/arm64/include/asm/fpsimd.h | 15 ++++++++++++ arch/arm64/include/asm/fpsimdmacros.h | 35 ++++++++++++++++++++++++++++ arch/arm64/include/asm/neon.h | 6 ++++- arch/arm64/kernel/entry-fpsimd.S | 24 +++++++++++++++++++ arch/arm64/kernel/fpsimd.c | 44 ++++++++++++++++++++++++----------- 5 files changed, 109 insertions(+), 15 deletions(-)