Message ID | 1495193199-1291-1-git-send-email-Dave.Martin@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, May 19, 2017 at 12:26:39PM +0100, Dave Martin wrote: [...] > I've build-tested this only for now: no runtime testing, no > benchmarking. Note, there's a missing #include of <asm/neon.h> from fpsimd.c. (I moved/renamed kernel_neon_allowed() after build-testing...) [...] Cheers ---Dave
On 19 May 2017 at 12:26, Dave Martin <Dave.Martin@arm.com> wrote: > The benefits of nested kernel-mode NEON may be marginal. Likewise, > use of kernel-mode NEON in hardirq context is rare to nonexistent. > The only use case for kernel mode NEON outside of process context is the mac80211 subsystem which performs crypto on the packets in softirq context. > Removing support for these eliminates signifcant cost and complexity. > typo: significant > This patch implements a kernel_neon_allowed() function to allow > clients to check whether kernel-mode NEON is usable in the current > context, and simplifies kernel_neon_{begin,end}() to handle only > saving of the task FPSIMD state (if any). Without nesting, there > is no other state to save. > > The partial fpsimd save/restore functions become redundant as a > result of these changes, so they are removed too. > > Signed-off-by: Dave Martin <Dave.Martin@arm.com> > > --- > > This patch requires drivers to be ported to cope with conditional > availability of kernel-mode NEON: [1] should be close, since Ard > was already working on this. > > I've build-tested this only for now: no runtime testing, no > benchmarking. > > This patch is broken without some conversion of preempt_disable()/ > > enable() to local_bh_disable()/enable() around some of the context > handling code in fpsimd.c, in order to be robust against the task's > FPSIMD context being unexpectedly saved by a preempting softirq. > Yes, this is basically the issue we originally discussed in the context of SVE support. 
> [1] git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git kernel-mode-neon > > > arch/arm64/include/asm/fpsimd.h | 14 --------- > arch/arm64/include/asm/fpsimdmacros.h | 56 ----------------------------------- > arch/arm64/include/asm/neon.h | 44 +++++++++++++++++++++++++-- > arch/arm64/kernel/entry-fpsimd.S | 24 --------------- > arch/arm64/kernel/fpsimd.c | 50 ++++++++++++++----------------- > 5 files changed, 64 insertions(+), 124 deletions(-) > > diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h > index 50f559f..ff2f6cd 100644 > --- a/arch/arm64/include/asm/fpsimd.h > +++ b/arch/arm64/include/asm/fpsimd.h > @@ -41,16 +41,6 @@ struct fpsimd_state { > unsigned int cpu; > }; > > -/* > - * Struct for stacking the bottom 'n' FP/SIMD registers. > - */ > -struct fpsimd_partial_state { > - u32 fpsr; > - u32 fpcr; > - u32 num_regs; > - __uint128_t vregs[32]; > -}; > - > > #if defined(__KERNEL__) && defined(CONFIG_COMPAT) > /* Masks for extracting the FPSR and FPCR from the FPSCR */ > @@ -77,10 +67,6 @@ extern void fpsimd_update_current_state(struct fpsimd_state *state); > > extern void fpsimd_flush_task_state(struct task_struct *target); > > -extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, > - u32 num_regs); > -extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); > - > #endif > > #endif > diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h > index a2daf12..0f5fdd3 100644 > --- a/arch/arm64/include/asm/fpsimdmacros.h > +++ b/arch/arm64/include/asm/fpsimdmacros.h > @@ -75,59 +75,3 @@ > ldr w\tmpnr, [\state, #16 * 2 + 4] > fpsimd_restore_fpcr x\tmpnr, \state > .endm > - > -.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 > - mrs x\tmpnr1, fpsr > - str w\numnr, [\state, #8] > - mrs x\tmpnr2, fpcr > - stp w\tmpnr1, w\tmpnr2, [\state] > - adr x\tmpnr1, 0f > - add \state, \state, x\numnr, lsl #4 > - sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 > - br 
x\tmpnr1 > - stp q30, q31, [\state, #-16 * 30 - 16] > - stp q28, q29, [\state, #-16 * 28 - 16] > - stp q26, q27, [\state, #-16 * 26 - 16] > - stp q24, q25, [\state, #-16 * 24 - 16] > - stp q22, q23, [\state, #-16 * 22 - 16] > - stp q20, q21, [\state, #-16 * 20 - 16] > - stp q18, q19, [\state, #-16 * 18 - 16] > - stp q16, q17, [\state, #-16 * 16 - 16] > - stp q14, q15, [\state, #-16 * 14 - 16] > - stp q12, q13, [\state, #-16 * 12 - 16] > - stp q10, q11, [\state, #-16 * 10 - 16] > - stp q8, q9, [\state, #-16 * 8 - 16] > - stp q6, q7, [\state, #-16 * 6 - 16] > - stp q4, q5, [\state, #-16 * 4 - 16] > - stp q2, q3, [\state, #-16 * 2 - 16] > - stp q0, q1, [\state, #-16 * 0 - 16] > -0: > -.endm > - > -.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 > - ldp w\tmpnr1, w\tmpnr2, [\state] > - msr fpsr, x\tmpnr1 > - fpsimd_restore_fpcr x\tmpnr2, x\tmpnr1 > - adr x\tmpnr1, 0f > - ldr w\tmpnr2, [\state, #8] > - add \state, \state, x\tmpnr2, lsl #4 > - sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 > - br x\tmpnr1 > - ldp q30, q31, [\state, #-16 * 30 - 16] > - ldp q28, q29, [\state, #-16 * 28 - 16] > - ldp q26, q27, [\state, #-16 * 26 - 16] > - ldp q24, q25, [\state, #-16 * 24 - 16] > - ldp q22, q23, [\state, #-16 * 22 - 16] > - ldp q20, q21, [\state, #-16 * 20 - 16] > - ldp q18, q19, [\state, #-16 * 18 - 16] > - ldp q16, q17, [\state, #-16 * 16 - 16] > - ldp q14, q15, [\state, #-16 * 14 - 16] > - ldp q12, q13, [\state, #-16 * 12 - 16] > - ldp q10, q11, [\state, #-16 * 10 - 16] > - ldp q8, q9, [\state, #-16 * 8 - 16] > - ldp q6, q7, [\state, #-16 * 6 - 16] > - ldp q4, q5, [\state, #-16 * 4 - 16] > - ldp q2, q3, [\state, #-16 * 2 - 16] > - ldp q0, q1, [\state, #-16 * 0 - 16] > -0: > -.endm > diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h > index ad4cdc9..2269322 100644 > --- a/arch/arm64/include/asm/neon.h > +++ b/arch/arm64/include/asm/neon.h > @@ -8,12 +8,50 @@ > * published by the Free Software Foundation. 
> */ > > +#include <linux/compiler.h> > +#include <linux/percpu.h> > +#include <linux/preempt.h> > #include <linux/types.h> > #include <asm/fpsimd.h> > +#include <asm/local.h> > > #define cpu_has_neon() system_supports_fpsimd() > > -#define kernel_neon_begin() kernel_neon_begin_partial(32) > - > -void kernel_neon_begin_partial(u32 num_regs); > +void kernel_neon_begin(void); > void kernel_neon_end(void); > + > +/* FIXME: Backwards compatibility only, should go away: */ > +#define kernel_neon_begin_partial(num_regs) kernel_neon_begin() > + > +#ifdef CONFIG_KERNEL_MODE_NEON > + > +extern DEFINE_PER_CPU(local_t, kernel_neon_busy); DECLARE_PER_CPU > + > +/* > + * Returns true if and only if it is safe to call kernel_neon_begin(). > + * Callers must not assume that the result remains true beyond the next > + * preempt_enable() or return from softirq context. > + */ > +static bool __maybe_unused kernel_neon_allowed(void) If you make it static inline, you don't need the __maybe_unused > +{ > + /* > + * The per_cpu_ptr() is racy if called with preemption enabled. > + * This is not a bug: per_cpu(kernel_neon_busy) is only set > + * when preemption is disabled, so we cannot migrate to another > + * CPU while it is set, nor can we migrate to a CPU where it is set. > + * So, if we find it clear on some CPU then we're guaranteed to find it > + * clear on any CPU we could migrate to. > + * > + * If we are in between kernel_neon_begin()...kernel_neon_end(), the > + * flag will be set, but preemption is also disabled, so we can't > + * migrate to another CPU and spuriously see it become false. > + */ > + return !(in_irq() || in_nmi()) && > + !local_read(this_cpu_ptr(&kernel_neon_busy)); For readability, could we change this to '!x && !y && !z' instead? > +} > + > +#else /* ! CONFIG_KERNEL_MODE_NEON */ > + > +static bool __maybe_unused kernel_neon_allowed(void) { return false; } static inline here as well > + > +#endif /* ! 
CONFIG_KERNEL_MODE_NEON */ > diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S > index c44a82f..6a27cd6 100644 > --- a/arch/arm64/kernel/entry-fpsimd.S > +++ b/arch/arm64/kernel/entry-fpsimd.S > @@ -41,27 +41,3 @@ ENTRY(fpsimd_load_state) > fpsimd_restore x0, 8 > ret > ENDPROC(fpsimd_load_state) > - > -#ifdef CONFIG_KERNEL_MODE_NEON > - > -/* > - * Save the bottom n FP registers. > - * > - * x0 - pointer to struct fpsimd_partial_state > - */ > -ENTRY(fpsimd_save_partial_state) > - fpsimd_save_partial x0, 1, 8, 9 > - ret > -ENDPROC(fpsimd_save_partial_state) > - > -/* > - * Load the bottom n FP registers. > - * > - * x0 - pointer to struct fpsimd_partial_state > - */ > -ENTRY(fpsimd_load_partial_state) > - fpsimd_restore_partial x0, 8, 9 > - ret > -ENDPROC(fpsimd_load_partial_state) > - > -#endif > diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c > index 06da8ea..22cb8e8 100644 > --- a/arch/arm64/kernel/fpsimd.c > +++ b/arch/arm64/kernel/fpsimd.c > @@ -21,12 +21,14 @@ > #include <linux/cpu_pm.h> > #include <linux/kernel.h> > #include <linux/init.h> > +#include <linux/percpu.h> > #include <linux/sched/signal.h> > #include <linux/signal.h> > #include <linux/hardirq.h> > > #include <asm/fpsimd.h> > #include <asm/cputype.h> > +#include <asm/local.h> > > #define FPEXC_IOF (1 << 0) > #define FPEXC_DZF (1 << 1) > @@ -230,49 +232,43 @@ void fpsimd_flush_task_state(struct task_struct *t) > > #ifdef CONFIG_KERNEL_MODE_NEON > > -static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); > -static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); > +DEFINE_PER_CPU(local_t, kernel_neon_busy); > > /* > * Kernel-side NEON support functions > */ > -void kernel_neon_begin_partial(u32 num_regs) > +void kernel_neon_begin(void) > { > if (WARN_ON(!system_supports_fpsimd())) > return; > - if (in_interrupt()) { > - struct fpsimd_partial_state *s = this_cpu_ptr( > - in_irq() ? 
&hardirq_fpsimdstate : &softirq_fpsimdstate); > > - BUG_ON(num_regs > 32); > - fpsimd_save_partial_state(s, roundup(num_regs, 2)); > - } else { > - /* > - * Save the userland FPSIMD state if we have one and if we > - * haven't done so already. Clear fpsimd_last_state to indicate > - * that there is no longer userland FPSIMD state in the > - * registers. > - */ > - preempt_disable(); > - if (current->mm && > - !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) > - fpsimd_save_state(¤t->thread.fpsimd_state); > + BUG_ON(!kernel_neon_allowed()); > + > + preempt_disable(); > + local_set(this_cpu_ptr(&kernel_neon_busy), 1); > + > + /* > + * If there is unsaved task fpsimd state in the CPU, save it > + * and invalidate the copy stored in the fpsimd regs: > + */ > + if (current->mm && !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) { > + fpsimd_save_state(¤t->thread.fpsimd_state); > this_cpu_write(fpsimd_last_state, NULL); > } > } > -EXPORT_SYMBOL(kernel_neon_begin_partial); > +EXPORT_SYMBOL(kernel_neon_begin); > > void kernel_neon_end(void) > { > + long busy; > + > if (!system_supports_fpsimd()) > return; > - if (in_interrupt()) { > - struct fpsimd_partial_state *s = this_cpu_ptr( > - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); > - fpsimd_load_partial_state(s); > - } else { > - preempt_enable(); > - } > + > + busy = local_xchg(this_cpu_ptr(&kernel_neon_busy), 0); > + WARN_ON(!busy); /* No matching kernel_neon_begin()? */ > + > + preempt_enable(); > } > EXPORT_SYMBOL(kernel_neon_end); > > -- > 2.1.4 > I think the general approach is sound. I still think we could call kernel_neon_allowed() may_use_simd(), and let it override the asm-generic version in asm/simd.h. In any case, given this is a prereq for SVE support which is still far out, we should probably phase this change by migrating KMN users to kernel_neon_allowed/may_use_simd first (and add a 'return true' version for the former if needed), so that the crypto changes can be queued for v4.13
Hi, On Fri, May 19, 2017 at 12:26:39PM +0100, Dave Martin wrote: > +static bool __maybe_unused kernel_neon_allowed(void) > +{ > + /* > + * The per_cpu_ptr() is racy if called with preemption enabled. > + * This is not a bug: per_cpu(kernel_neon_busy) is only set > + * when preemption is disabled, so we cannot migrate to another > + * CPU while it is set, nor can we migrate to a CPU where it is set. > + * So, if we find it clear on some CPU then we're guaranteed to find it > + * clear on any CPU we could migrate to. > + * > + * If we are in between kernel_neon_begin()...kernel_neon_end(), the > + * flag will be set, but preemption is also disabled, so we can't > + * migrate to another CPU and spuriously see it become false. > + */ > + return !(in_irq() || in_nmi()) && > + !local_read(this_cpu_ptr(&kernel_neon_busy)); > +} I think it would be better to use the this_cpu ops for this, rather than manually messing with the pointer. Here, we can use raw_cpu_read(kernel_neon_busy), given the comment above. [...] > +DEFINE_PER_CPU(local_t, kernel_neon_busy); This can become a basic type like int or bool. [...] > +void kernel_neon_begin(void) > { > if (WARN_ON(!system_supports_fpsimd())) > return; > - if (in_interrupt()) { > - struct fpsimd_partial_state *s = this_cpu_ptr( > - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); > > - BUG_ON(num_regs > 32); > - fpsimd_save_partial_state(s, roundup(num_regs, 2)); > - } else { > - /* > - * Save the userland FPSIMD state if we have one and if we > - * haven't done so already. Clear fpsimd_last_state to indicate > - * that there is no longer userland FPSIMD state in the > - * registers. > - */ > - preempt_disable(); > - if (current->mm && > - !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) > - fpsimd_save_state(¤t->thread.fpsimd_state); > + BUG_ON(!kernel_neon_allowed()); > + > + preempt_disable(); > + local_set(this_cpu_ptr(&kernel_neon_busy), 1); As preemption is disabled: __this_cpu_write(kernel_neon_busy, 1); [...] 
> void kernel_neon_end(void) > { > + long busy; > + > if (!system_supports_fpsimd()) > return; > - if (in_interrupt()) { > - struct fpsimd_partial_state *s = this_cpu_ptr( > - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); > - fpsimd_load_partial_state(s); > - } else { > - preempt_enable(); > - } > + > + busy = local_xchg(this_cpu_ptr(&kernel_neon_busy), 0); Likewise: busy = __this_cpu_xchg(kernel_neon_busy, 0); Thanks, Mark.
On Fri, May 19, 2017 at 01:49:53PM +0100, Mark Rutland wrote: > Hi, > > On Fri, May 19, 2017 at 12:26:39PM +0100, Dave Martin wrote: > > +static bool __maybe_unused kernel_neon_allowed(void) > > +{ > > + /* > > + * The per_cpu_ptr() is racy if called with preemption enabled. > > + * This is not a bug: per_cpu(kernel_neon_busy) is only set > > + * when preemption is disabled, so we cannot migrate to another > > + * CPU while it is set, nor can we migrate to a CPU where it is set. > > + * So, if we find it clear on some CPU then we're guaranteed to find it > > + * clear on any CPU we could migrate to. > > + * > > + * If we are in between kernel_neon_begin()...kernel_neon_end(), the > > + * flag will be set, but preemption is also disabled, so we can't > > + * migrate to another CPU and spuriously see it become false. > > + */ > > + return !(in_irq() || in_nmi()) && > > + !local_read(this_cpu_ptr(&kernel_neon_busy)); > > +} > > I think it would be better to use the this_cpu ops for this, rather than > manually messing with the pointer. I had thought that the properties of local_t were important here, but this_cpu ops seem equally appropriate. > Here, we can use raw_cpu_read(kernel_neon_busy), given the comment > above. What's the difference? The raw_ variants aren't documented. Do these not bother about atomicity between the address calculation and the read? > [...] > > > +DEFINE_PER_CPU(local_t, kernel_neon_busy); > > This can become a basic type like int or bool. Agreed -- probably bool, given how it's used. > [...] > > > +void kernel_neon_begin(void) > > { > > if (WARN_ON(!system_supports_fpsimd())) > > return; > > - if (in_interrupt()) { > > - struct fpsimd_partial_state *s = this_cpu_ptr( > > - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); > > > > - BUG_ON(num_regs > 32); > > - fpsimd_save_partial_state(s, roundup(num_regs, 2)); > > - } else { > > - /* > > - * Save the userland FPSIMD state if we have one and if we > > - * haven't done so already. 
Clear fpsimd_last_state to indicate > > - * that there is no longer userland FPSIMD state in the > > - * registers. > > - */ > > - preempt_disable(); > > - if (current->mm && > > - !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) > > - fpsimd_save_state(¤t->thread.fpsimd_state); > > + BUG_ON(!kernel_neon_allowed()); > > + > > + preempt_disable(); > > + local_set(this_cpu_ptr(&kernel_neon_busy), 1); > > As preemption is disabled: > > __this_cpu_write(kernel_neon_busy, 1); > > [...] > > > void kernel_neon_end(void) > > { > > + long busy; > > + > > if (!system_supports_fpsimd()) > > return; > > - if (in_interrupt()) { > > - struct fpsimd_partial_state *s = this_cpu_ptr( > > - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); > > - fpsimd_load_partial_state(s); > > - } else { > > - preempt_enable(); > > - } > > + > > + busy = local_xchg(this_cpu_ptr(&kernel_neon_busy), 0); > > Likewise: > > busy = __this_cpu_xchg(kernel_neon_busy, 0); Yup. Will tweak. Cheers ---Dave
On Fri, May 19, 2017 at 02:13:21PM +0100, Dave Martin wrote: > On Fri, May 19, 2017 at 01:49:53PM +0100, Mark Rutland wrote: > > Hi, > > > > On Fri, May 19, 2017 at 12:26:39PM +0100, Dave Martin wrote: > > > +static bool __maybe_unused kernel_neon_allowed(void) > > > +{ > > > + /* > > > + * The per_cpu_ptr() is racy if called with preemption enabled. > > > + * This is not a bug: per_cpu(kernel_neon_busy) is only set > > > + * when preemption is disabled, so we cannot migrate to another > > > + * CPU while it is set, nor can we migrate to a CPU where it is set. > > > + * So, if we find it clear on some CPU then we're guaranteed to find it > > > + * clear on any CPU we could migrate to. > > > + * > > > + * If we are in between kernel_neon_begin()...kernel_neon_end(), the > > > + * flag will be set, but preemption is also disabled, so we can't > > > + * migrate to another CPU and spuriously see it become false. > > > + */ > > > + return !(in_irq() || in_nmi()) && > > > + !local_read(this_cpu_ptr(&kernel_neon_busy)); > > > +} > > > > I think it would be better to use the this_cpu ops for this, rather than > > manually messing with the pointer. > > I had thought that the properties of local_t were important here, but > this_cpu ops seem equally appropriate. > > > Here, we can use raw_cpu_read(kernel_neon_busy), given the comment > > above. > > What's the difference? The raw_ variants aren't documented. Do these > not bother about atomicity between the address calculation and the read? Yup. Comparing raw_cpu_{x}, __this_cpu_{x}, and this_cpu_{x}: * raw_cpu_{x} don't have any preemption checks, and don't disable preemption internally. Due to this, the address gen and operation can occur on different CPUs. * __this_cpu_{x} have lockdep annotations to check that preemption is disabled, but otherwise behave the same as raw_cpu_{x}. i.e. they don't try to disable preemption internally. * this_cpu_{x} ensure that preemption is disabled around the address gen and operation. 
Since preemption may be possible, but we believe this is fine, we have to use the raw form. We'll want to keep the commment explaining why. Thanks, Mark.
On Fri, May 19, 2017 at 01:34:37PM +0100, Ard Biesheuvel wrote: > On 19 May 2017 at 12:26, Dave Martin <Dave.Martin@arm.com> wrote: > > The benefits of nested kernel-mode NEON may be marginal. Likewise, > > use of kernel-mode NEON in hardirq context is rare to nonexistent. > > > > The only use case for kernel mode NEON outside of process context is > the mac80211 subsystem which performs crypto on the packets in softirq > context. > > > Removing support for these eliminates signifcant cost and complexity. > > > > typo: significant Good spot -- thanks. [...] > > This patch is broken without some conversion of preempt_disable()/ > > enable() to local_bh_disable()/enable() around some of the context > > handling code in fpsimd.c, in order to be robust against the task's > > FPSIMD context being unexpectedly saved by a preempting softirq. > > > > Yes, this is basically the issue we originally discussed in the > context of SVE support. Agreed -- I'll come back to that. [...] > > diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h [...] > > +#ifdef CONFIG_KERNEL_MODE_NEON > > + > > +extern DEFINE_PER_CPU(local_t, kernel_neon_busy); > > DECLARE_PER_CPU Arg, yes. > > > + > > +/* > > + * Returns true if and only if it is safe to call kernel_neon_begin(). > > + * Callers must not assume that the result remains true beyond the next > > + * preempt_enable() or return from softirq context. > > + */ > > +static bool __maybe_unused kernel_neon_allowed(void) > > If you make it static inline, you don't need the __maybe_unused True. I'm a bit prejudiced against inline due to the inconsistent interpretations between C/C++/gcc. But it is cleaner and it's used with an appropriate meaning elsewhere in the kernel. [...] > > + * If we are in between kernel_neon_begin()...kernel_neon_end(), the > > + * flag will be set, but preemption is also disabled, so we can't > > + * migrate to another CPU and spuriously see it become false. 
> > + */ > > + return !(in_irq() || in_nmi()) && > > + !local_read(this_cpu_ptr(&kernel_neon_busy)); > > For readability, could we change this to '!x && !y && !z' instead? Agreed, this mis-evolved from !in_irq() && !local_read(... [...] > static inline here as well Ditto [...] > > - } > > + > > + busy = local_xchg(this_cpu_ptr(&kernel_neon_busy), 0); > > + WARN_ON(!busy); /* No matching kernel_neon_begin()? */ > > + > > + preempt_enable(); > > } > > EXPORT_SYMBOL(kernel_neon_end); [...] > I think the general approach is sound. I still think we could call > kernel_neon_allowed() may_use_simd(), and let it override the > asm-generic version in asm/simd.h. In any case, given this is a prereq > for SVE support which is still far out, we should probably phase this > change by migrating KMN users to kernel_neon_allowed/may_use_simd > first (and add a 'return true' version for the former if needed), so > that the crypto changes can be queued for v4.13 OK -- when do you expect your kernel-mode-neon series (or relevant bits of it) to be merged? With that in place, I can carry this patch in the SVE series, or propose it to be merged separately. I'd also expect CONFIG_KERNEL_NEON_MODE_NEON_FALLBACK and kernel_neon_need_fallback() to be folded in (=y and true respectively), since removal of nesting support will mean that fallback code is always needed for clients that may run elsewhere than in task context. Cheers ---Dave
On 19 May 2017 at 14:46, Dave Martin <Dave.Martin@arm.com> wrote: > On Fri, May 19, 2017 at 01:34:37PM +0100, Ard Biesheuvel wrote: >> On 19 May 2017 at 12:26, Dave Martin <Dave.Martin@arm.com> wrote: >> > The benefits of nested kernel-mode NEON may be marginal. Likewise, >> > use of kernel-mode NEON in hardirq context is rare to nonexistent. >> > >> >> The only use case for kernel mode NEON outside of process context is >> the mac80211 subsystem which performs crypto on the packets in softirq >> context. >> >> > Removing support for these eliminates signifcant cost and complexity. >> > >> >> typo: significant > > Good spot -- thanks. > > [...] > >> > This patch is broken without some conversion of preempt_disable()/ >> > enable() to local_bh_disable()/enable() around some of the context >> > handling code in fpsimd.c, in order to be robust against the task's >> > FPSIMD context being unexpectedly saved by a preempting softirq. >> > >> >> Yes, this is basically the issue we originally discussed in the >> context of SVE support. > > Agreed -- I'll come back to that. > > [...] > >> > diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h > > [...] > >> > +#ifdef CONFIG_KERNEL_MODE_NEON >> > + >> > +extern DEFINE_PER_CPU(local_t, kernel_neon_busy); >> >> DECLARE_PER_CPU > > Arg, yes. > >> >> > + >> > +/* >> > + * Returns true if and only if it is safe to call kernel_neon_begin(). >> > + * Callers must not assume that the result remains true beyond the next >> > + * preempt_enable() or return from softirq context. >> > + */ >> > +static bool __maybe_unused kernel_neon_allowed(void) >> >> If you make it static inline, you don't need the __maybe_unused > > True. I'm a bit prejudiced against inline due to the inconsistent > interpretations between C/C++/gcc. > > But it is cleaner and it's used with an appropriate meaning > elsewhere in the kernel. > > [...] 
> >> > + * If we are in between kernel_neon_begin()...kernel_neon_end(), the >> > + * flag will be set, but preemption is also disabled, so we can't >> > + * migrate to another CPU and spuriously see it become false. >> > + */ >> > + return !(in_irq() || in_nmi()) && >> > + !local_read(this_cpu_ptr(&kernel_neon_busy)); >> >> For readability, could we change this to '!x && !y && !z' instead? > > Agreed, this mis-evolved from !in_irq() && !local_read(... > > [...] > >> static inline here as well > > Ditto > > [...] > >> > - } >> > + >> > + busy = local_xchg(this_cpu_ptr(&kernel_neon_busy), 0); >> > + WARN_ON(!busy); /* No matching kernel_neon_begin()? */ >> > + >> > + preempt_enable(); >> > } >> > EXPORT_SYMBOL(kernel_neon_end); > > [...] > >> I think the general approach is sound. I still think we could call >> kernel_neon_allowed() may_use_simd(), and let it override the >> asm-generic version in asm/simd.h. In any case, given this is a prereq >> for SVE support which is still far out, we should probably phase this >> change by migrating KMN users to kernel_neon_allowed/may_use_simd >> first (and add a 'return true' version for the former if needed), so >> that the crypto changes can be queued for v4.13 > > OK -- when do you expect your kernel-mode-neon series (or relevant bits > of it) to be merged? With that in place, I can carry this patch in > the SVE series, or propose it to be merged separately. > There is no reason for any of this to go through the crypto tree or mailing list. So for now, let's go with kernel_neon_allowed(), and I can respin my patches against that and ask Catalin or Will to queue it for v4.13. I will be travelling next week, though, so no ETA yet. > I'd also expect CONFIG_KERNEL_NEON_MODE_NEON_FALLBACK and > kernel_neon_need_fallback() to be folded in (=y and true respectively), > since removal of nesting support will mean that fallback code is always > needed for clients that may run elsewhere than in task context. > Yes. 
So we no longer have to reason about whether a fallback should be provided, which is an improvement imo.
On Fri, May 19, 2017 at 02:34:25PM +0100, Mark Rutland wrote: > On Fri, May 19, 2017 at 02:13:21PM +0100, Dave Martin wrote: > > On Fri, May 19, 2017 at 01:49:53PM +0100, Mark Rutland wrote: > > > Hi, > > > > > > On Fri, May 19, 2017 at 12:26:39PM +0100, Dave Martin wrote: > > > > +static bool __maybe_unused kernel_neon_allowed(void) > > > > +{ > > > > + /* > > > > + * The per_cpu_ptr() is racy if called with preemption enabled. > > > > + * This is not a bug: per_cpu(kernel_neon_busy) is only set > > > > + * when preemption is disabled, so we cannot migrate to another > > > > + * CPU while it is set, nor can we migrate to a CPU where it is set. > > > > + * So, if we find it clear on some CPU then we're guaranteed to find it > > > > + * clear on any CPU we could migrate to. > > > > + * > > > > + * If we are in between kernel_neon_begin()...kernel_neon_end(), the > > > > + * flag will be set, but preemption is also disabled, so we can't > > > > + * migrate to another CPU and spuriously see it become false. > > > > + */ > > > > + return !(in_irq() || in_nmi()) && > > > > + !local_read(this_cpu_ptr(&kernel_neon_busy)); > > > > +} > > > > > > I think it would be better to use the this_cpu ops for this, rather than > > > manually messing with the pointer. > > > > I had thought that the properties of local_t were important here, but > > this_cpu ops seem equally appropriate. > > > > > Here, we can use raw_cpu_read(kernel_neon_busy), given the comment > > > above. > > > > What's the difference? The raw_ variants aren't documented. Do these > > not bother about atomicity between the address calculation and the read? > > Yup. > > Comparing raw_cpu_{x}, __this_cpu_{x}, and this_cpu_{x}: > > * raw_cpu_{x} don't have any preemption checks, and don't disable > preemption internally. Due to this, the address gen and operation can > occur on different CPUs. Ah, I see. This is not fantastically intuitive... 
> * __this_cpu_{x} have lockdep annotations to check that preemption is > disabled, but otherwise behave the same as raw_cpu_{x}. i.e. they > don't try to disable preemption internally. > > * this_cpu_{x} ensure that preemption is disabled around the address gen > and operation. > > Since preemption may be possible, but we believe this is fine, we have > to use the raw form. We'll want to keep the commment explaining why. Absolutely. Those seem natural fits then. Cheers ---Dave
On Fri, May 19, 2017 at 02:56:20PM +0100, Ard Biesheuvel wrote: > On 19 May 2017 at 14:46, Dave Martin <Dave.Martin@arm.com> wrote: [...] > > OK -- when do you expect your kernel-mode-neon series (or relevant bits > > of it) to be merged? With that in place, I can carry this patch in > > the SVE series, or propose it to be merged separately. > > > > There is no reason for any of this to go through the crypto tree or > mailing list. So for now, let's go with kernel_neon_allowed(), and I Arg, I just changed it back... What are the intended meanings of !kernel_neon_allowed() && !may_use_simd() !kernel_neon_allowed() && may_use_simd() kernel_neon_allowed() && !may_use_simd() kernel_neon_allowed() && may_use_simd() ? I'm still a little confused here... > can respin my patches against that and ask Catalin or Will to queue it > for v4.13. I will be travelling next week, though, so no ETA yet. OK. If they get queued for v4.13 that's fine for me. > > I'd also expect CONFIG_KERNEL_NEON_MODE_NEON_FALLBACK and > > kernel_neon_need_fallback() to be folded in (=y and true respectively), > > since removal of nesting support will mean that fallback code is always > > needed for clients that may run elsewhere than in task context. > > Yes. So we no longer have to reason about whether a fallback should be > provided, which is an improvement imo. Agreed, it seems simpler this way. Cheers ---Dave
On 19 May 2017 at 15:09, Dave Martin <Dave.Martin@arm.com> wrote: > On Fri, May 19, 2017 at 02:56:20PM +0100, Ard Biesheuvel wrote: >> On 19 May 2017 at 14:46, Dave Martin <Dave.Martin@arm.com> wrote: > > [...] > >> > OK -- when do you expect your kernel-mode-neon series (or relevant bits >> > of it) to be merged? With that in place, I can carry this patch in >> > the SVE series, or propose it to be merged separately. >> > >> >> There is no reason for any of this to go through the crypto tree or >> mailing list. So for now, let's go with kernel_neon_allowed(), and I > > Arg, I just changed it back... > Sorry :-) The only reason I suggested that was to make your life easier given that we can always #define one to mean the other or vice versa. We can go with either, but I'd like to remove the distinction asap. > What are the intended meanings of > > !kernel_neon_allowed() && !may_use_simd() > !kernel_neon_allowed() && may_use_simd() > kernel_neon_allowed() && !may_use_simd() > kernel_neon_allowed() && may_use_simd() > > ? > > I'm still a little confused here... > Ultimately, I think they should be the same. The partial state save/restore is potentially much more costly (e.g., when using the v8 AES cipher that operates on 16 bytes at a time), and so a client of the async API should never see a performance hit caused by the fact that we are doing work in softirq context simply because we can and not because it is a good idea. But with nesting removed, doing the work in softirq context may still be strictly unnecessary, but at least it doesn't do more work. >> can respin my patches against that and ask Catalin or Will to queue it >> for v4.13. I will be travelling next week, though, so no ETA yet. > > OK. It they get queued for v4.13 that's fine for me. 
> >> > I'd also expect CONFIG_KERNEL_NEON_MODE_NEON_FALLBACK and >> > kernel_neon_need_fallback() to be folded in (=y and true respectively), >> > since removal of nesting support will mean that fallback code is always >> > needed for clients that may run elsewhere than in task context. >> >> Yes. So we no longer have to reason about whether a fallback should be >> provided, which is an improvement imo. > > Agreed, it seems simpler this way. > > Cheers > ---Dave
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 50f559f..ff2f6cd 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -41,16 +41,6 @@ struct fpsimd_state { unsigned int cpu; }; -/* - * Struct for stacking the bottom 'n' FP/SIMD registers. - */ -struct fpsimd_partial_state { - u32 fpsr; - u32 fpcr; - u32 num_regs; - __uint128_t vregs[32]; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* Masks for extracting the FPSR and FPCR from the FPSCR */ @@ -77,10 +67,6 @@ extern void fpsimd_update_current_state(struct fpsimd_state *state); extern void fpsimd_flush_task_state(struct task_struct *target); -extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, - u32 num_regs); -extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); - #endif #endif diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index a2daf12..0f5fdd3 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -75,59 +75,3 @@ ldr w\tmpnr, [\state, #16 * 2 + 4] fpsimd_restore_fpcr x\tmpnr, \state .endm - -.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 - mrs x\tmpnr1, fpsr - str w\numnr, [\state, #8] - mrs x\tmpnr2, fpcr - stp w\tmpnr1, w\tmpnr2, [\state] - adr x\tmpnr1, 0f - add \state, \state, x\numnr, lsl #4 - sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 - br x\tmpnr1 - stp q30, q31, [\state, #-16 * 30 - 16] - stp q28, q29, [\state, #-16 * 28 - 16] - stp q26, q27, [\state, #-16 * 26 - 16] - stp q24, q25, [\state, #-16 * 24 - 16] - stp q22, q23, [\state, #-16 * 22 - 16] - stp q20, q21, [\state, #-16 * 20 - 16] - stp q18, q19, [\state, #-16 * 18 - 16] - stp q16, q17, [\state, #-16 * 16 - 16] - stp q14, q15, [\state, #-16 * 14 - 16] - stp q12, q13, [\state, #-16 * 12 - 16] - stp q10, q11, [\state, #-16 * 10 - 16] - stp q8, q9, [\state, #-16 * 8 - 16] - stp q6, q7, [\state, #-16 * 6 - 16] - stp q4, q5, [\state, #-16 
* 4 - 16] - stp q2, q3, [\state, #-16 * 2 - 16] - stp q0, q1, [\state, #-16 * 0 - 16] -0: -.endm - -.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 - ldp w\tmpnr1, w\tmpnr2, [\state] - msr fpsr, x\tmpnr1 - fpsimd_restore_fpcr x\tmpnr2, x\tmpnr1 - adr x\tmpnr1, 0f - ldr w\tmpnr2, [\state, #8] - add \state, \state, x\tmpnr2, lsl #4 - sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 - br x\tmpnr1 - ldp q30, q31, [\state, #-16 * 30 - 16] - ldp q28, q29, [\state, #-16 * 28 - 16] - ldp q26, q27, [\state, #-16 * 26 - 16] - ldp q24, q25, [\state, #-16 * 24 - 16] - ldp q22, q23, [\state, #-16 * 22 - 16] - ldp q20, q21, [\state, #-16 * 20 - 16] - ldp q18, q19, [\state, #-16 * 18 - 16] - ldp q16, q17, [\state, #-16 * 16 - 16] - ldp q14, q15, [\state, #-16 * 14 - 16] - ldp q12, q13, [\state, #-16 * 12 - 16] - ldp q10, q11, [\state, #-16 * 10 - 16] - ldp q8, q9, [\state, #-16 * 8 - 16] - ldp q6, q7, [\state, #-16 * 6 - 16] - ldp q4, q5, [\state, #-16 * 4 - 16] - ldp q2, q3, [\state, #-16 * 2 - 16] - ldp q0, q1, [\state, #-16 * 0 - 16] -0: -.endm diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index ad4cdc9..2269322 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -8,12 +8,50 @@ * published by the Free Software Foundation. */ +#include <linux/compiler.h> +#include <linux/percpu.h> +#include <linux/preempt.h> #include <linux/types.h> #include <asm/fpsimd.h> +#include <asm/local.h> #define cpu_has_neon() system_supports_fpsimd() -#define kernel_neon_begin() kernel_neon_begin_partial(32) - -void kernel_neon_begin_partial(u32 num_regs); +void kernel_neon_begin(void); void kernel_neon_end(void); + +/* FIXME: Backwards compatibility only, should go away: */ +#define kernel_neon_begin_partial(num_regs) kernel_neon_begin() + +#ifdef CONFIG_KERNEL_MODE_NEON + +extern DEFINE_PER_CPU(local_t, kernel_neon_busy); + +/* + * Returns true if and only if it is safe to call kernel_neon_begin(). 
+ * Callers must not assume that the result remains true beyond the next + * preempt_enable() or return from softirq context. + */ +static bool __maybe_unused kernel_neon_allowed(void) +{ + /* + * The per_cpu_ptr() is racy if called with preemption enabled. + * This is not a bug: per_cpu(kernel_neon_busy) is only set + * when preemption is disabled, so we cannot migrate to another + * CPU while it is set, nor can we migrate to a CPU where it is set. + * So, if we find it clear on some CPU then we're guaranteed to find it + * clear on any CPU we could migrate to. + * + * If we are in between kernel_neon_begin()...kernel_neon_end(), the + * flag will be set, but preemption is also disabled, so we can't + * migrate to another CPU and spuriously see it become false. + */ + return !(in_irq() || in_nmi()) && + !local_read(this_cpu_ptr(&kernel_neon_busy)); +} + +#else /* ! CONFIG_KERNEL_MODE_NEON */ + +static bool __maybe_unused kernel_neon_allowed(void) { return false; } + +#endif /* ! CONFIG_KERNEL_MODE_NEON */ diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index c44a82f..6a27cd6 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -41,27 +41,3 @@ ENTRY(fpsimd_load_state) fpsimd_restore x0, 8 ret ENDPROC(fpsimd_load_state) - -#ifdef CONFIG_KERNEL_MODE_NEON - -/* - * Save the bottom n FP registers. - * - * x0 - pointer to struct fpsimd_partial_state - */ -ENTRY(fpsimd_save_partial_state) - fpsimd_save_partial x0, 1, 8, 9 - ret -ENDPROC(fpsimd_save_partial_state) - -/* - * Load the bottom n FP registers. 
- * - * x0 - pointer to struct fpsimd_partial_state - */ -ENTRY(fpsimd_load_partial_state) - fpsimd_restore_partial x0, 8, 9 - ret -ENDPROC(fpsimd_load_partial_state) - -#endif diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 06da8ea..22cb8e8 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -21,12 +21,14 @@ #include <linux/cpu_pm.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/percpu.h> #include <linux/sched/signal.h> #include <linux/signal.h> #include <linux/hardirq.h> #include <asm/fpsimd.h> #include <asm/cputype.h> +#include <asm/local.h> #define FPEXC_IOF (1 << 0) #define FPEXC_DZF (1 << 1) @@ -230,49 +232,43 @@ void fpsimd_flush_task_state(struct task_struct *t) #ifdef CONFIG_KERNEL_MODE_NEON -static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); -static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); +DEFINE_PER_CPU(local_t, kernel_neon_busy); /* * Kernel-side NEON support functions */ -void kernel_neon_begin_partial(u32 num_regs) +void kernel_neon_begin(void) { if (WARN_ON(!system_supports_fpsimd())) return; - if (in_interrupt()) { - struct fpsimd_partial_state *s = this_cpu_ptr( - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - BUG_ON(num_regs > 32); - fpsimd_save_partial_state(s, roundup(num_regs, 2)); - } else { - /* - * Save the userland FPSIMD state if we have one and if we - * haven't done so already. Clear fpsimd_last_state to indicate - * that there is no longer userland FPSIMD state in the - * registers. 
- */ - preempt_disable(); - if (current->mm && - !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(¤t->thread.fpsimd_state); + BUG_ON(!kernel_neon_allowed()); + + preempt_disable(); + local_set(this_cpu_ptr(&kernel_neon_busy), 1); + + /* + * If there is unsaved task fpsimd state in the CPU, save it + * and invalidate the copy stored in the fpsimd regs: + */ + if (current->mm && !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) { + fpsimd_save_state(¤t->thread.fpsimd_state); this_cpu_write(fpsimd_last_state, NULL); } } -EXPORT_SYMBOL(kernel_neon_begin_partial); +EXPORT_SYMBOL(kernel_neon_begin); void kernel_neon_end(void) { + long busy; + if (!system_supports_fpsimd()) return; - if (in_interrupt()) { - struct fpsimd_partial_state *s = this_cpu_ptr( - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - fpsimd_load_partial_state(s); - } else { - preempt_enable(); - } + + busy = local_xchg(this_cpu_ptr(&kernel_neon_busy), 0); + WARN_ON(!busy); /* No matching kernel_neon_begin()? */ + + preempt_enable(); } EXPORT_SYMBOL(kernel_neon_end);
The benefits of nested kernel-mode NEON may be marginal. Likewise, use of kernel-mode NEON in hardirq context is rare to nonexistent. Removing support for these eliminates significant cost and complexity. This patch implements a kernel_neon_allowed() function to allow clients to check whether kernel-mode NEON is usable in the current context, and simplifies kernel_neon_{begin,end}() to handle only saving of the task FPSIMD state (if any). Without nesting, there is no other state to save. The partial fpsimd save/restore functions become redundant as a result of these changes, so they are removed too. Signed-off-by: Dave Martin <Dave.Martin@arm.com> --- This patch requires drivers to be ported to cope with conditional availability of kernel-mode NEON: [1] should be close, since Ard was already working on this. I've build-tested this only for now: no runtime testing, no benchmarking. This patch is broken without some conversion of preempt_disable()/ enable() to local_bh_disable()/enable() around some of the context handling code in fpsimd.c, in order to be robust against the task's FPSIMD context being unexpectedly saved by a preempting softirq. [1] git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git kernel-mode-neon arch/arm64/include/asm/fpsimd.h | 14 --------- arch/arm64/include/asm/fpsimdmacros.h | 56 ----------------------------------- arch/arm64/include/asm/neon.h | 44 +++++++++++++++++++++++++-- arch/arm64/kernel/entry-fpsimd.S | 24 --------------- arch/arm64/kernel/fpsimd.c | 50 ++++++++++++++----------------- 5 files changed, 64 insertions(+), 124 deletions(-)