[RFC,v2,2/4] ARM64: add support for kernel mode NEON in atomic context

Message ID 1381344634-14917-3-git-send-email-ard.biesheuvel@linaro.org (mailing list archive)
State New, archived

Commit Message

Ard Biesheuvel Oct. 9, 2013, 6:50 p.m. UTC
This patch adds kernel_neon_begin_atomic() and kernel_neon_end_atomic(),
which may be called from any context. In the !in_interrupt() case, they
just call their non-_atomic counterparts. In atomic context, they
stack or unstack, respectively, the number of NEON registers declared
when setting up the stack area using DEFINE_NEON_STACK_REGS().

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/fpsimd.h       | 16 +++++++++++++++
 arch/arm64/include/asm/fpsimdmacros.h | 37 +++++++++++++++++++++++++++++++++++
 arch/arm64/include/asm/neon.h         | 31 +++++++++++++++++++++++++++++
 arch/arm64/kernel/entry-fpsimd.S      | 24 +++++++++++++++++++++++
 arch/arm64/kernel/fpsimd.c            |  3 +++
 5 files changed, 111 insertions(+)
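
For illustration, a caller would reserve stack space for just the registers
it clobbers and wrap its NEON code as in the sketch below (the function and
its body are hypothetical; the macros and begin/end calls are the ones this
patch introduces). Note that DEFINE_NEON_STACK_REGS() rounds the count up to
an even number of registers, since the registers are saved and restored in
pairs:

	#include <asm/neon.h>

	/* Hypothetical caller: clobbers only v0-v3, may run in atomic context. */
	static void do_neon_work(void)
	{
		/* Reserve stack space for the bottom 4 NEON registers. */
		DEFINE_NEON_STACK_REGS(nregs, 4);

		kernel_neon_begin_atomic(nregs);
		/* ... NEON / Crypto Extensions code using v0-v3 only ... */
		kernel_neon_end_atomic(nregs);
	}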

Comments

Catalin Marinas Oct. 11, 2013, 5:14 p.m. UTC | #1
On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
> --- a/arch/arm64/include/asm/neon.h
> +++ b/arch/arm64/include/asm/neon.h
> @@ -8,7 +8,38 @@
>   * published by the Free Software Foundation.
>   */
>  
> +#include <linux/hardirq.h>
> +#include <linux/types.h>
> +#include <asm/fpsimd.h>
> +
>  #define cpu_has_neon()		(1)
>  
> +#define DEFINE_NEON_STACK_REGS(a, num)					\
> +	struct {							\
> +		struct fpsimd_partial_state regs;			\
> +		__uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];	\
> +	} a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
> +
> +#define DEFINE_NEON_STACK_REGS_ALL(name)	DEFINE_NEON_STACK_REGS(name, 32)
> +
>  void kernel_neon_begin(void);
>  void kernel_neon_end(void);
> +
> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
> +{
> +	if (!in_interrupt())
> +		kernel_neon_begin();
> +	else
> +		fpsimd_save_partial_state(regs);
> +}
> +
> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
> +{
> +	if (!in_interrupt())
> +		kernel_neon_end();
> +	else
> +		fpsimd_load_partial_state(regs);
> +}

The _atomic suffix is a bit misleading (you basically mean no user
context). I wonder whether it's better to have some _fast/_slow variants
instead. Looking at the other two patches, you only need 2 or 4
registers to do the crypto stuff but if you are not in_interrupt(), you
basically save and restore the full NEON bank. I would say for such
cases just provide a kernel_neon_begin_fast() call which is safe in all
contexts and much faster.
Ard Biesheuvel Oct. 11, 2013, 5:30 p.m. UTC | #2
> On 11 Oct. 2013, at 19:14, Catalin Marinas <catalin.marinas@arm.com> wrote:
> 
>> On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
>> --- a/arch/arm64/include/asm/neon.h
>> +++ b/arch/arm64/include/asm/neon.h
>> @@ -8,7 +8,38 @@
>>  * published by the Free Software Foundation.
>>  */
>> 
>> +#include <linux/hardirq.h>
>> +#include <linux/types.h>
>> +#include <asm/fpsimd.h>
>> +
>> #define cpu_has_neon()        (1)
>> 
>> +#define DEFINE_NEON_STACK_REGS(a, num)                    \
>> +    struct {                            \
>> +        struct fpsimd_partial_state regs;            \
>> +        __uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];    \
>> +    } a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
>> +
>> +#define DEFINE_NEON_STACK_REGS_ALL(name)    DEFINE_NEON_STACK_REGS(name, 32)
>> +
>> void kernel_neon_begin(void);
>> void kernel_neon_end(void);
>> +
>> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
>> +{
>> +    if (!in_interrupt())
>> +        kernel_neon_begin();
>> +    else
>> +        fpsimd_save_partial_state(regs);
>> +}
>> +
>> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
>> +{
>> +    if (!in_interrupt())
>> +        kernel_neon_end();
>> +    else
>> +        fpsimd_load_partial_state(regs);
>> +}
> 
> The _atomic suffix is a bit misleading (you basically mean no user
> context). I wonder whether it's better to have some _fast/_slow variants
> instead. Looking at the other two patches, you only need 2 or 4
> registers to do the crypto stuff but if you are not in_interrupt(), you
> basically save and restore the full NEON bank. I would say for such
> cases just provide a kernel_neon_begin_fast() call which is safe in all
> contexts and much faster.
> 

I agree that the name is a bit misleading.

Regarding fast/slow: if you take the core aes cipher as an example, it will likely be called from a loop somewhere, and (assuming lazy restore gets merged at some point) you may be stacking and unstacking 2 or 4 registers many times while kernel_neon_begin() would just stack them once and let the lazy restore unstack them only when needed.

This is probably a detail where arm and arm64 will be implemented somewhat differently. I would still like to align the API between the two, if possible, so intrinsics or GCC-vectorized code can be shared easily. However, as ARM has fewer use cases where using only 2 registers makes sense (memcpy perhaps?), and already has lazy restore wired up, I personally feel hooking into the lazy restore in the !in_interrupt() case is still the best solution there.
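
As a sketch of the kind of code that could then be shared, consider a
hypothetical helper written with plain NEON intrinsics, which compiles
unchanged for both arm and arm64:

	#include <arm_neon.h>

	/* Hypothetical shared helper: XOR one 16-byte block into another. */
	static inline void xor_block(uint8_t *dst, const uint8_t *src)
	{
		uint8x16_t a = vld1q_u8(dst);
		uint8x16_t b = vld1q_u8(src);

		vst1q_u8(dst, veorq_u8(a, b));
	}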
Catalin Marinas Oct. 11, 2013, 7:35 p.m. UTC | #3
On 11 Oct 2013, at 18:30, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>> On 11 Oct. 2013, at 19:14, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>> On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
>>> --- a/arch/arm64/include/asm/neon.h
>>> +++ b/arch/arm64/include/asm/neon.h
>>> @@ -8,7 +8,38 @@
>>> * published by the Free Software Foundation.
>>> */
>>> 
>>> +#include <linux/hardirq.h>
>>> +#include <linux/types.h>
>>> +#include <asm/fpsimd.h>
>>> +
>>> #define cpu_has_neon()        (1)
>>> 
>>> +#define DEFINE_NEON_STACK_REGS(a, num)                    \
>>> +    struct {                            \
>>> +        struct fpsimd_partial_state regs;            \
>>> +        __uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];    \
>>> +    } a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
>>> +
>>> +#define DEFINE_NEON_STACK_REGS_ALL(name)    DEFINE_NEON_STACK_REGS(name, 32)
>>> +
>>> void kernel_neon_begin(void);
>>> void kernel_neon_end(void);
>>> +
>>> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
>>> +{
>>> +    if (!in_interrupt())
>>> +        kernel_neon_begin();
>>> +    else
>>> +        fpsimd_save_partial_state(regs);
>>> +}
>>> +
>>> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
>>> +{
>>> +    if (!in_interrupt())
>>> +        kernel_neon_end();
>>> +    else
>>> +        fpsimd_load_partial_state(regs);
>>> +}
>> 
>> The _atomic suffix is a bit misleading (you basically mean no user
>> context). I wonder whether it's better to have some _fast/_slow variants
>> instead. Looking at the other two patches, you only need 2 or 4
>> registers to do the crypto stuff but if you are not in_interrupt(), you
>> basically save and restore the full NEON bank. I would say for such
>> cases just provide a kernel_neon_begin_fast() call which is safe in all
>> contexts and much faster.
> 
> I agree that the name is a bit misleading.
> 
> Regarding fast/slow: if you take the core aes cipher as an example, it
> will likely be called from a loop somewhere, and (assuming lazy restore
> gets merged at some point) you may be stacking and unstacking 2 or 4
> registers many times while kernel_neon_begin() would just stack them
> once and let the lazy restore unstack them only when needed.
> 
> This is probably a detail where arm and arm64 will be implemented
> somewhat differently.  I would still like to align the api between the
> two, if possible, so intrinsics or gcc vectorized code can be shared
> easily.  However, as ARM has fewer use cases where using only 2
> registers makes sense (memcpy perhaps?), and already has lazy restore
> wired up, I personally feel hooking into the lazy restore in the
> !in_interrupt() case is still the best solution there.

Lazy saving/restoring on context switch may not be beneficial on arm64
and I don't want to enable it until I see some real user space
benchmarks (not kernel crypto API benchmarks).

The way kernel_neon_begin() is currently implemented on arm64 just saves
the whole register bank, so calling it in a loop will make things worse.
What you could do is to save the register bank in kernel_neon_begin()
only the first time, set a TIF flag and restore them when returning to
user.  This way you can call it multiple times but only save/restore
once.
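
A minimal sketch of that scheme, assuming a hypothetical TIF_NEON_SAVED
flag (the matching reload on the return-to-user path is left out):

	void kernel_neon_begin(void)
	{
		preempt_disable();
		/* Save the user register bank only on the first claim. */
		if (current->mm && !test_and_set_thread_flag(TIF_NEON_SAVED))
			fpsimd_save_state(&current->thread.fpsimd_state);
	}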

Catalin
Nicolas Pitre Oct. 11, 2013, 8:09 p.m. UTC | #4
On Fri, 11 Oct 2013, Catalin Marinas wrote:

> On 11 Oct 2013, at 18:30, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> >> On 11 Oct. 2013, at 19:14, Catalin Marinas <catalin.marinas@arm.com> wrote:
> >>> On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
> >>> --- a/arch/arm64/include/asm/neon.h
> >>> +++ b/arch/arm64/include/asm/neon.h
> >>> @@ -8,7 +8,38 @@
> >>> * published by the Free Software Foundation.
> >>> */
> >>> 
> >>> +#include <linux/hardirq.h>
> >>> +#include <linux/types.h>
> >>> +#include <asm/fpsimd.h>
> >>> +
> >>> #define cpu_has_neon()        (1)
> >>> 
> >>> +#define DEFINE_NEON_STACK_REGS(a, num)                    \
> >>> +    struct {                            \
> >>> +        struct fpsimd_partial_state regs;            \
> >>> +        __uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];    \
> >>> +    } a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
> >>> +
> >>> +#define DEFINE_NEON_STACK_REGS_ALL(name)    DEFINE_NEON_STACK_REGS(name, 32)
> >>> +
> >>> void kernel_neon_begin(void);
> >>> void kernel_neon_end(void);
> >>> +
> >>> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
> >>> +{
> >>> +    if (!in_interrupt())
> >>> +        kernel_neon_begin();
> >>> +    else
> >>> +        fpsimd_save_partial_state(regs);
> >>> +}
> >>> +
> >>> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
> >>> +{
> >>> +    if (!in_interrupt())
> >>> +        kernel_neon_end();
> >>> +    else
> >>> +        fpsimd_load_partial_state(regs);
> >>> +}
> >> 
> >> The _atomic suffix is a bit misleading (you basically mean no user
> >> context). I wonder whether it's better to have some _fast/_slow variants
> >> instead. Looking at the other two patches, you only need 2 or 4
> >> registers to do the crypto stuff but if you are not in_interrupt(), you
> >> basically save and restore the full NEON bank. I would say for such
> >> cases just provide a kernel_neon_begin_fast() call which is safe in all
> >> contexts and much faster.
> > 
> > I agree that the name is a bit misleading.
> > 
> > Regarding fast/slow: if you take the core aes cipher as an example, it
> > will likely be called from a loop somewhere, and (assuming lazy restore
> > gets merged at some point) you may be stacking and unstacking 2 or 4
> > registers many times while kernel_neon_begin() would just stack them
> > once and let the lazy restore unstack them only when needed.
> > 
> > This is probably a detail where arm and arm64 will be implemented
> > somewhat differently.  I would still like to align the api between the
> > two, if possible, so intrinsics or gcc vectorized code can be shared
> > easily.  However, as ARM has fewer use cases where using only 2
> > registers makes sense (memcpy perhaps?), and already has lazy restore
> > wired up, I personally feel hooking into the lazy restore in the
> > !in_interrupt() case is still the best solution there.
> 
> Lazy saving/restoring on context switch may not be beneficial on arm64
> and I don't want to enable it until I see some real user space
> benchmarks (not kernel crypto API benchmarks).

I think it is more important to establish the API semantics here.  
Implementation may vary afterwards.

The difference right now between kernel_neon_begin() and 
__kernel_neon_begin_atomic() is that the latter can be stacked while the 
former cannot.  And when there are multiple NEON-claiming invocations, 
subsequent calls to the former are supposed to be very cheap.  So 
this is all about trade-offs and constraints.

And I agree that the kernel_neon_begin_atomic name is potentially 
confusing.  But the fact is that this is the only version that can be 
used in atomic context.

Maybe the API should be kernel_neon_begin() and 
kernel_neon_begin_partial(nb_regs), the former being a simple alias to 
the latter with the full register set as argument.  And then the actual 
register saving method (whether it is an atomic context or not, the 
number of registers, etc.) could be handled and optimized internally 
instead of exposing such implementation constraints to users of the API.
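
In other words, an API shape along these lines (a sketch; the names and
the 32-register maximum are just a suggestion):

	void kernel_neon_begin_partial(u32 nb_regs);
	void kernel_neon_end(void);

	#define kernel_neon_begin()	kernel_neon_begin_partial(32)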

> The way kernel_neon_begin() is currently implemented on arm64 just saves
> the whole register bank, so calling it in a loop will make things worse.
> What you could do is to save the register bank in kernel_neon_begin()
> only the first time, set a TIF flag and restore them when returning to
> user.  This way you can call it multiple times but only save/restore
> once.

This is certainly more in line with the lazy restore behavior assumed on 
ARM32.


Nicolas
Catalin Marinas Oct. 13, 2013, 10:48 p.m. UTC | #5
On 11 Oct 2013, at 21:09, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> On Fri, 11 Oct 2013, Catalin Marinas wrote:
>> On 11 Oct 2013, at 18:30, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>>>> On 11 Oct. 2013, at 19:14, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>>>> On Wed, Oct 09, 2013 at 07:50:32PM +0100, Ard Biesheuvel wrote:
>>>>> --- a/arch/arm64/include/asm/neon.h
>>>>> +++ b/arch/arm64/include/asm/neon.h
>>>>> @@ -8,7 +8,38 @@
>>>>> * published by the Free Software Foundation.
>>>>> */
>>>>> 
>>>>> +#include <linux/hardirq.h>
>>>>> +#include <linux/types.h>
>>>>> +#include <asm/fpsimd.h>
>>>>> +
>>>>> #define cpu_has_neon()        (1)
>>>>> 
>>>>> +#define DEFINE_NEON_STACK_REGS(a, num)                    \
>>>>> +    struct {                            \
>>>>> +        struct fpsimd_partial_state regs;            \
>>>>> +        __uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];    \
>>>>> +    } a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
>>>>> +
>>>>> +#define DEFINE_NEON_STACK_REGS_ALL(name)    DEFINE_NEON_STACK_REGS(name, 32)
>>>>> +
>>>>> void kernel_neon_begin(void);
>>>>> void kernel_neon_end(void);
>>>>> +
>>>>> +static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
>>>>> +{
>>>>> +    if (!in_interrupt())
>>>>> +        kernel_neon_begin();
>>>>> +    else
>>>>> +        fpsimd_save_partial_state(regs);
>>>>> +}
>>>>> +
>>>>> +static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
>>>>> +{
>>>>> +    if (!in_interrupt())
>>>>> +        kernel_neon_end();
>>>>> +    else
>>>>> +        fpsimd_load_partial_state(regs);
>>>>> +}
>>>> 
>>>> The _atomic suffix is a bit misleading (you basically mean no user
>>>> context). I wonder whether it's better to have some _fast/_slow variants
>>>> instead. Looking at the other two patches, you only need 2 or 4
>>>> registers to do the crypto stuff but if you are not in_interrupt(), you
>>>> basically save and restore the full NEON bank. I would say for such
>>>> cases just provide a kernel_neon_begin_fast() call which is safe in all
>>>> contexts and much faster.
>>> 
>>> I agree that the name is a bit misleading.
>>> 
>>> Regarding fast/slow: if you take the core aes cipher as an example, it
>>> will likely be called from a loop somewhere, and (assuming lazy restore
>>> gets merged at some point) you may be stacking and unstacking 2 or 4
>>> registers many times while kernel_neon_begin() would just stack them
>>> once and let the lazy restore unstack them only when needed.
>>> 
>>> This is probably a detail where arm and arm64 will be implemented
>>> somewhat differently.  I would still like to align the api between the
>>> two, if possible, so intrinsics or gcc vectorized code can be shared
>>> easily.  However, as ARM has fewer use cases where using only 2
>>> registers makes sense (memcpy perhaps?), and already has lazy restore
>>> wired up, I personally feel hooking into the lazy restore in the
>>> !in_interrupt() case is still the best solution there.
>> 
>> Lazy saving/restoring on context switch may not be beneficial on arm64
>> and I don't want to enable it until I see some real user space
>> benchmarks (not kernel crypto API benchmarks).
> 
> I think it is more important to establish the API semantics here.  
> Implementation may vary afterwards.
> 
> The difference right now between kernel_neon_begin() and 
> __kernel_neon_begin_atomic() is that the latter can be stacked while the 
> former cannot.  

How much stacking do we need?  If we limit the nesting to two levels
(process and IRQ context), we could pre-allocate per-CPU
fpsimd_state structures for interrupt context and always use the same
API. As for softirqs, do we need another level of nesting?
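
A sketch of what such pre-allocation could look like (hypothetical names,
one save area per additional nesting level):

	/* Hypothetical per-CPU save areas for NEON use in interrupt context. */
	static DEFINE_PER_CPU(struct fpsimd_state, hardirq_fpsimd_state);
	static DEFINE_PER_CPU(struct fpsimd_state, softirq_fpsimd_state);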

> Maybe the API should be kernel_neon_begin() and 
> kernel_neon_begin_partial(nb_regs), the former being a simple alias to 
> the latter with the full register set as argument.  And then the actual 
> register saving method (whether it is an atomic context or not, the 
> number of registers, etc.) could be handled and optimized internally 
> instead of exposing such implementation constraints to users of the API.

It could be more efficient to always specify the number of registers to
be saved/restored even for kernel_neon_begin().  But I haven't paid much
attention to the register use in the actual crypto algorithms.

Catalin
Ard Biesheuvel Oct. 14, 2013, 8:12 a.m. UTC | #6
On 14 October 2013 00:48, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On 11 Oct 2013, at 21:09, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:

[...]

>>
>> I think it is more important to establish the API semantics here.
>> Implementation may vary afterwards.
>>
>> The difference right now between kernel_neon_begin() and
>> __kernel_neon_begin_atomic() is that the latter can be stacked while the
>> former cannot.
>
> How much stacking do we need?  If we limit the nesting to two levels
> (process and IRQ context), we could pre-allocate per-CPU
> fpsimd_state structures for interrupt context and always use the same
> API. As for softirqs, do we need another level of nesting?
>

Softirq context is required as well, so that implies two additional
fpsimd_states of 512 bytes each. If we can afford that, then sure, why
not?

>> Maybe the API should be kernel_neon_begin() and
>> kernel_neon_begin_partial(nb_regs), the former being a simple alias to
>> the latter with the full register set as argument.  And then the actual
>> register saving method (whether it is an atomic context or not, the
>> number of registers, etc.) could be handled and optimized internally
>> instead of exposing such implementation constraints to users of the API.
>
> It could be more efficient to always specify the number of registers to
> be saved/restored even for kernel_neon_begin().  But I haven't paid much
> attention to the register use in the actual crypto algorithms.
>

To elaborate a bit: WPA-CCMP uses AES in CCM mode executed in softirq
context. I have included a reference implementation using 4 NEON
registers only, which makes sense in this case as the CCM transform
itself cannot be parallelized.

On the other hand, AES in XTS mode (dm-crypt) is fully parallelizable,
always executes from a kernel thread and always operates on a sector.
In this case, using the entire register file allows an 8-way
interleaved (*) implementation with all the round keys (between 11 and
15 16-byte keys) cached in registers.

The bottom line is that even if the crypto instructions can be used in
a meaningful way with only 2 or 4 registers, it is highly likely that
using more registers will result in higher performance [at least in
the AES case].

For the plain NEON case, I have written an implementation that keeps
the entire S-box (256 bytes) in registers. This should perform quite
well [assuming 4-register-wide tbl/tbx lookups are not too costly],
but only in the cases where the cost of loading the S-box can be
amortized over multiple operations. This implies no core AES cipher
using plain NEON, but doing the CCM might be feasible, even if we have
to stack the whole register file in that case.

I agree that always specifying the number of registers used is
probably a meaningful addition, and in fact this is what I have
implemented in the v3 that I sent yesterday. The only difference
between Nico's suggestion and my implementation is that the number of
registers is declared at the time that the stack area is reserved so
we don't waste a lot of space.

Regards,
Ard.


* I am assuming some level of interleaving will be required to get
optimal performance from these instructions, but whether 8 is the
sweet spot is TBD.

Patch

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index c43b4ac..3a741b0 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -39,6 +39,19 @@  struct fpsimd_state {
 	};
 };
 
+/*
+ * Variable sized struct for stacking the bottom n FP/SIMD registers.
+ * Mainly intended for kernel use of v8 Crypto Extensions which only
+ * needs a few registers and may need to execute in atomic context.
+ */
+struct fpsimd_partial_state {
+	const u32	num_regs;
+	u32		fpsr;
+	u32		fpcr;
+	__uint128_t	vregs[] __aligned(16);
+} __aligned(16);
+
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /* Masks for extracting the FPSR and FPCR from the FPSCR */
 #define VFP_FPSCR_STAT_MASK	0xf800009f
@@ -55,6 +68,9 @@  struct task_struct;
 extern void fpsimd_save_state(struct fpsimd_state *state);
 extern void fpsimd_load_state(struct fpsimd_state *state);
 
+extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state);
+extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state);
+
 extern void fpsimd_thread_switch(struct task_struct *next);
 extern void fpsimd_flush_thread(void);
 
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index bbec599..1b47587 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -62,3 +62,40 @@ 
 	ldr	w\tmpnr, [\state, #16 * 2 + 4]
 	msr	fpcr, x\tmpnr
 .endm
+
+.altmacro
+.macro	q2op, op, q1, q2, state
+	\op	q\q1, q\q2, [\state, #-(16 * \q1) - 16]
+.endm
+
+.macro fpsimd_save_partial state, tmpnr1, tmpnr2
+	mrs	x\tmpnr1, fpsr
+	mrs	x\tmpnr2, fpcr
+	stp	w\tmpnr1, w\tmpnr2, [\state, #4]
+	adr	x\tmpnr1, 0f
+	ldr	w\tmpnr2, [\state]
+	add	\state, \state, x\tmpnr2, lsl #4
+	sub	x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
+	br	x\tmpnr1
+	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+		qb = \qa + 1
+	q2op	stp, \qa, %qb, \state
+	.endr
+0:
+.endm
+
+.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
+	ldp	w\tmpnr1, w\tmpnr2, [\state, #4]
+	msr	fpsr, x\tmpnr1
+	msr	fpcr, x\tmpnr2
+	adr	x\tmpnr1, 0f
+	ldr	w\tmpnr2, [\state]
+	add	\state, \state, x\tmpnr2, lsl #4
+	sub	x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
+	br	x\tmpnr1
+	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+		qb = \qa + 1
+	q2op	ldp, \qa, %qb, \state
+	.endr
+0:
+.endm
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h
index b0cc58a9..1c8600a 100644
--- a/arch/arm64/include/asm/neon.h
+++ b/arch/arm64/include/asm/neon.h
@@ -8,7 +8,38 @@ 
  * published by the Free Software Foundation.
  */
 
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <asm/fpsimd.h>
+
 #define cpu_has_neon()		(1)
 
+#define DEFINE_NEON_STACK_REGS(a, num)					\
+	struct {							\
+		struct fpsimd_partial_state regs;			\
+		__uint128_t vregs[(num) > 32 ? 32 : ((num) + 1) & ~1U];	\
+	} a = { .regs.num_regs = sizeof(a.vregs) / sizeof(__uint128_t) }
+
+#define DEFINE_NEON_STACK_REGS_ALL(name)	DEFINE_NEON_STACK_REGS(name, 32)
+
 void kernel_neon_begin(void);
 void kernel_neon_end(void);
+
+static inline void __kernel_neon_begin_atomic(struct fpsimd_partial_state *regs)
+{
+	if (!in_interrupt())
+		kernel_neon_begin();
+	else
+		fpsimd_save_partial_state(regs);
+}
+
+static inline void __kernel_neon_end_atomic(struct fpsimd_partial_state *regs)
+{
+	if (!in_interrupt())
+		kernel_neon_end();
+	else
+		fpsimd_load_partial_state(regs);
+}
+
+#define kernel_neon_begin_atomic(a)	__kernel_neon_begin_atomic(&(a).regs)
+#define kernel_neon_end_atomic(a)	__kernel_neon_end_atomic(&(a).regs)
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 6a27cd6..82cf648 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -41,3 +41,27 @@  ENTRY(fpsimd_load_state)
 	fpsimd_restore x0, 8
 	ret
 ENDPROC(fpsimd_load_state)
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+
+/*
+ * Save the bottom n FP registers.
+ *
+ * x0 - pointer to struct fpsimd_partial_state
+ */
+ENTRY(fpsimd_save_partial_state)
+	fpsimd_save_partial x0, 8, 9
+	ret
+ENDPROC(fpsimd_save_partial_state)
+
+/*
+ * Load the bottom n FP registers.
+ *
+ * x0 - pointer to struct fpsimd_partial_state
+ */
+ENTRY(fpsimd_load_partial_state)
+	fpsimd_restore_partial x0, 8, 9
+	ret
+ENDPROC(fpsimd_load_partial_state)
+
+#endif
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 1f2e4d5..69c7962 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -109,6 +109,9 @@  void kernel_neon_end(void)
 }
 EXPORT_SYMBOL(kernel_neon_end);
 
+EXPORT_SYMBOL(fpsimd_load_partial_state);
+EXPORT_SYMBOL(fpsimd_save_partial_state);
+
 #endif /* CONFIG_KERNEL_MODE_NEON */
 
 /*