@@ -1,8 +1,11 @@
//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+// Eric Biggers <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
@@ -122,6 +125,13 @@
sli perm2.2d, perm1.2d, #56
sli perm3.2d, perm1.2d, #48
sli perm4.2d, perm1.2d, #40
+
+ // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+ movi bd1.4h, #8, lsl #8
+ orr bd1.2s, #1, lsl #16
+ orr bd1.2s, #1, lsl #24
+ zip1 bd1.16b, bd1.16b, bd1.16b
+ zip1 bd1.16b, bd1.16b, bd1.16b
.endm
.macro __pmull_pre_p8, bd
@@ -196,6 +206,92 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core)
ret
SYM_FUNC_END(__pmull_p8_core)
+ .macro pmull16x64_p64, a16, b64, c64
+ pmull2 \c64\().1q, \a16\().2d, \b64\().2d
+ pmull \b64\().1q, \a16\().1d, \b64\().1d
+ .endm
+
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR (*) can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and combined to produce two
+ * 80-bit results.
+ *
+ * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+ * to the 64x64 bit one above, but XOR'ing the outputs together will
+ * produce the expected result, and this is sufficient in the context of
+ * this algorithm.
+ */
+ .macro pmull16x64_p8, a16, b64, c64
+ ext t7.16b, \b64\().16b, \b64\().16b, #1
+ tbl t5.16b, {\a16\().16b}, bd1.16b
+ uzp1 t7.16b, \b64\().16b, t7.16b
+ bl __pmull_p8_16x64
+ ext \b64\().16b, t4.16b, t4.16b, #15
+ eor \c64\().16b, t8.16b, t5.16b
+ .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+ ext t6.16b, t5.16b, t5.16b, #8
+
+ pmull t3.8h, t7.8b, t5.8b
+ pmull t4.8h, t7.8b, t6.8b
+ pmull2 t5.8h, t7.16b, t5.16b
+ pmull2 t6.8h, t7.16b, t6.16b
+
+ ext t8.16b, t3.16b, t3.16b, #8
+ eor t4.16b, t4.16b, t6.16b
+ ext t7.16b, t5.16b, t5.16b, #8
+ ext t6.16b, t4.16b, t4.16b, #8
+ eor t8.8b, t8.8b, t3.8b
+ eor t5.8b, t5.8b, t7.8b
+ eor t4.8b, t4.8b, t6.8b
+ ext t5.16b, t5.16b, t5.16b, #14
+ ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
.macro __pmull_p8, rq, ad, bd, i
.ifnc \bd, fold_consts
.err
@@ -218,14 +314,12 @@ SYM_FUNC_END(__pmull_p8_core)
.macro fold_32_bytes, p, reg1, reg2
ldp q11, q12, [buf], #0x20
- __pmull_\p v8, \reg1, fold_consts, 2
- __pmull_\p \reg1, \reg1, fold_consts
+ pmull16x64_\p fold_consts, \reg1, v8
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
- __pmull_\p v9, \reg2, fold_consts, 2
- __pmull_\p \reg2, \reg2, fold_consts
+ pmull16x64_\p fold_consts, \reg2, v9
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -238,11 +332,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
// Fold src_reg into dst_reg, optionally loading the next fold constants
.macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
- __pmull_\p v8, \src_reg, fold_consts
- __pmull_\p \src_reg, \src_reg, fold_consts, 2
+ pmull16x64_\p fold_consts, \src_reg, v8
.ifnb \load_next_consts
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
.endif
eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
@@ -296,7 +388,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Load the constants for folding across 128 bytes.
ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_pre_\p fold_consts
// Subtract 128 for the 128 data bytes just consumed. Subtract another
// 128 to simplify the termination condition of the following loop.
@@ -318,7 +409,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Fold across 64 bytes.
add fold_consts_ptr, fold_consts_ptr, #16
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
fold_16_bytes \p, v0, v4
fold_16_bytes \p, v1, v5
fold_16_bytes \p, v2, v6
@@ -339,8 +429,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// into them, storing the result back into v7.
b.lt .Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
- __pmull_\p v8, v7, fold_consts
- __pmull_\p v7, v7, fold_consts, 2
+ pmull16x64_\p fold_consts, v7, v8
eor v7.16b, v7.16b, v8.16b
ldr q0, [buf], #16
CPU_LE( rev64 v0.16b, v0.16b )
@@ -387,9 +476,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
bsl v2.16b, v1.16b, v0.16b
// Fold the first chunk into the second chunk, storing the result in v7.
- __pmull_\p v0, v3, fold_consts
- __pmull_\p v7, v3, fold_consts, 2
- eor v7.16b, v7.16b, v0.16b
+ pmull16x64_\p fold_consts, v3, v0
+ eor v7.16b, v3.16b, v0.16b
eor v7.16b, v7.16b, v2.16b
.Lreduce_final_16_bytes_\@:
@@ -450,7 +538,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// Load the fold-across-16-bytes constants.
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
cmp len, #16
b.eq .Lreduce_final_16_bytes_\@ // len == 16