@@ -77,19 +77,25 @@
*/
.text
ENTRY(sm3_ce_transform)
+ frame_push 3
+
+ mov x19, x0
+ mov x20, x1
+ mov x21, x2
+
/* load state */
- ld1 {v8.4s-v9.4s}, [x0]
+ ld1 {v8.4s-v9.4s}, [x19]
rev64 v8.4s, v8.4s
rev64 v9.4s, v9.4s
ext v8.16b, v8.16b, v8.16b, #8
ext v9.16b, v9.16b, v9.16b, #8
- adr_l x8, .Lt
+0: adr_l x8, .Lt
ldp s13, s14, [x8]
/* load input */
-0: ld1 {v0.16b-v3.16b}, [x1], #64
- sub w2, w2, #1
+1: ld1 {v0.16b-v3.16b}, [x20], #64
+ sub w21, w21, #1
mov v15.16b, v8.16b
mov v16.16b, v9.16b
@@ -125,14 +131,24 @@ CPU_LE( rev32 v3.16b, v3.16b )
eor v9.16b, v9.16b, v16.16b
/* handled all input blocks? */
- cbnz w2, 0b
+ cbz w21, 2f
+
+ if_will_cond_yield_neon
+ st1 {v8.4s-v9.4s}, [x19]
+ do_cond_yield_neon
+ ld1 {v8.4s-v9.4s}, [x19]
+ b 0b
+ endif_yield_neon
+
+ b 1b
/* save state */
- rev64 v8.4s, v8.4s
+2: rev64 v8.4s, v8.4s
rev64 v9.4s, v9.4s
ext v8.16b, v8.16b, v8.16b, #8
ext v9.16b, v9.16b, v9.16b, #8
- st1 {v8.4s-v9.4s}, [x0]
+ st1 {v8.4s-v9.4s}, [x19]
+ frame_pop
ret
ENDPROC(sm3_ce_transform)
Avoid excessive scheduling delays under a preemptible kernel by
conditionally yielding the NEON after every block of input.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/sm3-ce-core.S | 30 +++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)