Message ID | 20220113064118.1580916-5-kevin@bracey.fi (mailing list archive)
---|---
State | New, archived
Series | arm64: accelerate crc32_be
On Thu, 13 Jan 2022 at 07:41, Kevin Bracey <kevin@bracey.fi> wrote:
>
> It makes no sense to leave crc32_be using the generic code while we
> only accelerate the little-endian ops.
>
> Even though the big-endian form doesn't fit as smoothly into the arm64,
> we can speed it up and avoid hitting the D cache.
>
> Tested on Cortex-A53. Without acceleration:
>
> crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
> crc32: self tests passed, processed 225944 bytes in 192240 nsec
> crc32c: CRC_LE_BITS = 64
> crc32c: self tests passed, processed 112972 bytes in 21360 nsec
>
> With acceleration:
>
> crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
> crc32: self tests passed, processed 225944 bytes in 53480 nsec
> crc32c: CRC_LE_BITS = 64
> crc32c: self tests passed, processed 112972 bytes in 21480 nsec
>
> Signed-off-by: Kevin Bracey <kevin@bracey.fi>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Ard Biesheuvel <ardb@kernel.org>

I've given this a spin on LE and BE builds:

Tested-by: Ard Biesheuvel <ardb@kernel.org>

Some nits below.

> ---
>  arch/arm64/lib/crc32.S | 87 +++++++++++++++++++++++++++++++++++-------
>  1 file changed, 73 insertions(+), 14 deletions(-)
>
> diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
> index 0f9e10ecda23..906210441d76 100644
> --- a/arch/arm64/lib/crc32.S
> +++ b/arch/arm64/lib/crc32.S
> @@ -11,7 +11,44 @@
>
>  	.arch		armv8-a+crc
>
> -	.macro		__crc32, c
> +	.macro		byteorder, reg, be
> +	.if	\be

Please indent the \be so it aligns with the rest of the right hand
column. (below as well)

> +CPU_LE( rev		\reg, \reg	)
> +	.else
> +CPU_BE( rev		\reg, \reg	)
> +	.endif
> +	.endm
> +
> +	.macro		byteorder16, reg, be
> +	.if	\be
> +CPU_LE( rev16		\reg, \reg	)
> +	.else
> +CPU_BE( rev16		\reg, \reg	)
> +	.endif
> +	.endm
> +
> +	.macro		bitorder, reg, be
> +	.if	\be
> +	rbit		\reg, \reg
> +	.endif
> +	.endm
> +
> +	.macro		bitorder16, reg, be
> +	.if	\be
> +	rbit		\reg, \reg
> +	lsr		\reg, \reg, #16
> +	.endif
> +	.endm
> +
> +	.macro		bitorder8, reg, be
> +	.if	\be
> +	rbit		\reg, \reg
> +	lsr		\reg, \reg, #24
> +	.endif
> +	.endm
> +
> +	.macro		__crc32, c, be=0
> +	bitorder	w0, \be
>  	cmp		x2, #16
>  	b.lt		8f		// less than 16 bytes
>
> @@ -24,10 +61,14 @@
>  	add		x8, x8, x1
>  	add		x1, x1, x7
>  	ldp		x5, x6, [x8]
> -CPU_BE( rev		x3, x3		)
> -CPU_BE( rev		x4, x4		)
> -CPU_BE( rev		x5, x5		)
> -CPU_BE( rev		x6, x6		)
> +	byteorder	x3, \be
> +	byteorder	x4, \be
> +	byteorder	x5, \be
> +	byteorder	x6, \be
> +	bitorder	x3, \be
> +	bitorder	x4, \be
> +	bitorder	x5, \be
> +	bitorder	x6, \be
>
>  	tst		x7, #8
>  	crc32\c\()x	w8, w0, x3
> @@ -55,33 +96,43 @@ CPU_BE( rev		x6, x6		)
> 32:	ldp		x3, x4, [x1], #32
>  	sub		x2, x2, #32
>  	ldp		x5, x6, [x1, #-16]
> -CPU_BE( rev		x3, x3		)
> -CPU_BE( rev		x4, x4		)
> -CPU_BE( rev		x5, x5		)
> -CPU_BE( rev		x6, x6		)
> +	byteorder	x3, \be
> +	byteorder	x4, \be
> +	byteorder	x5, \be
> +	byteorder	x6, \be
> +	bitorder	x3, \be
> +	bitorder	x4, \be
> +	bitorder	x5, \be
> +	bitorder	x6, \be
>  	crc32\c\()x	w0, w0, x3
>  	crc32\c\()x	w0, w0, x4
>  	crc32\c\()x	w0, w0, x5
>  	crc32\c\()x	w0, w0, x6
>  	cbnz		x2, 32b
> -0:	ret
> +0:	bitorder	w0, \be
> +	ret
>
>  8:	tbz		x2, #3, 4f
>  	ldr		x3, [x1], #8
> -CPU_BE( rev		x3, x3		)
> +	byteorder	x3, \be
> +	bitorder	x3, \be
>  	crc32\c\()x	w0, w0, x3
>  4:	tbz		x2, #2, 2f
>  	ldr		w3, [x1], #4
> -CPU_BE( rev		w3, w3		)
> +	byteorder	w3, \be
> +	bitorder	w3, \be
>  	crc32\c\()w	w0, w0, w3
>  2:	tbz		x2, #1, 1f
>  	ldrh		w3, [x1], #2
> -CPU_BE( rev16		w3, w3		)
> +	byteorder16	w3, \be
> +	bitorder16	w3, \be
>  	crc32\c\()h	w0, w0, w3
>  1:	tbz		x2, #0, 0f
>  	ldrb		w3, [x1]
> +	bitorder8	w3, \be
>  	crc32\c\()b	w0, w0, w3
> -0:	ret
> +0:	bitorder	w0, \be
> +	ret
>  	.endm
>
>  	.align		5
> @@ -99,3 +150,11 @@ alternative_if_not ARM64_HAS_CRC32
>  alternative_else_nop_endif
>  	__crc32		c
>  SYM_FUNC_END(__crc32c_le)
> +
> +	.align		5
> +SYM_FUNC_START(crc32_be)
> +alternative_if_not ARM64_HAS_CRC32
> +	b		crc32_be_base
> +alternative_else_nop_endif
> +	__crc32		, 1

Please rewrite this as __crc32 be=1 instead of using positional
arguments.

> +SYM_FUNC_END(crc32_be)
> --
> 2.25.1
>

With those fixed,

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index 0f9e10ecda23..906210441d76 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -11,7 +11,44 @@

 	.arch		armv8-a+crc

-	.macro		__crc32, c
+	.macro		byteorder, reg, be
+	.if	\be
+CPU_LE( rev		\reg, \reg	)
+	.else
+CPU_BE( rev		\reg, \reg	)
+	.endif
+	.endm
+
+	.macro		byteorder16, reg, be
+	.if	\be
+CPU_LE( rev16		\reg, \reg	)
+	.else
+CPU_BE( rev16		\reg, \reg	)
+	.endif
+	.endm
+
+	.macro		bitorder, reg, be
+	.if	\be
+	rbit		\reg, \reg
+	.endif
+	.endm
+
+	.macro		bitorder16, reg, be
+	.if	\be
+	rbit		\reg, \reg
+	lsr		\reg, \reg, #16
+	.endif
+	.endm
+
+	.macro		bitorder8, reg, be
+	.if	\be
+	rbit		\reg, \reg
+	lsr		\reg, \reg, #24
+	.endif
+	.endm
+
+	.macro		__crc32, c, be=0
+	bitorder	w0, \be
 	cmp		x2, #16
 	b.lt		8f		// less than 16 bytes

@@ -24,10 +61,14 @@
 	add		x8, x8, x1
 	add		x1, x1, x7
 	ldp		x5, x6, [x8]
-CPU_BE( rev		x3, x3		)
-CPU_BE( rev		x4, x4		)
-CPU_BE( rev		x5, x5		)
-CPU_BE( rev		x6, x6		)
+	byteorder	x3, \be
+	byteorder	x4, \be
+	byteorder	x5, \be
+	byteorder	x6, \be
+	bitorder	x3, \be
+	bitorder	x4, \be
+	bitorder	x5, \be
+	bitorder	x6, \be

 	tst		x7, #8
 	crc32\c\()x	w8, w0, x3
@@ -55,33 +96,43 @@ CPU_BE( rev		x6, x6		)
32:	ldp		x3, x4, [x1], #32
 	sub		x2, x2, #32
 	ldp		x5, x6, [x1, #-16]
-CPU_BE( rev		x3, x3		)
-CPU_BE( rev		x4, x4		)
-CPU_BE( rev		x5, x5		)
-CPU_BE( rev		x6, x6		)
+	byteorder	x3, \be
+	byteorder	x4, \be
+	byteorder	x5, \be
+	byteorder	x6, \be
+	bitorder	x3, \be
+	bitorder	x4, \be
+	bitorder	x5, \be
+	bitorder	x6, \be
 	crc32\c\()x	w0, w0, x3
 	crc32\c\()x	w0, w0, x4
 	crc32\c\()x	w0, w0, x5
 	crc32\c\()x	w0, w0, x6
 	cbnz		x2, 32b
-0:	ret
+0:	bitorder	w0, \be
+	ret

 8:	tbz		x2, #3, 4f
 	ldr		x3, [x1], #8
-CPU_BE( rev		x3, x3		)
+	byteorder	x3, \be
+	bitorder	x3, \be
 	crc32\c\()x	w0, w0, x3
 4:	tbz		x2, #2, 2f
 	ldr		w3, [x1], #4
-CPU_BE( rev		w3, w3		)
+	byteorder	w3, \be
+	bitorder	w3, \be
 	crc32\c\()w	w0, w0, w3
 2:	tbz		x2, #1, 1f
 	ldrh		w3, [x1], #2
-CPU_BE( rev16		w3, w3		)
+	byteorder16	w3, \be
+	bitorder16	w3, \be
 	crc32\c\()h	w0, w0, w3
 1:	tbz		x2, #0, 0f
 	ldrb		w3, [x1]
+	bitorder8	w3, \be
 	crc32\c\()b	w0, w0, w3
-0:	ret
+0:	bitorder	w0, \be
+	ret
 	.endm

 	.align		5
@@ -99,3 +150,11 @@ alternative_if_not ARM64_HAS_CRC32
 alternative_else_nop_endif
 	__crc32		c
 SYM_FUNC_END(__crc32c_le)
+
+	.align		5
+SYM_FUNC_START(crc32_be)
+alternative_if_not ARM64_HAS_CRC32
+	b		crc32_be_base
+alternative_else_nop_endif
+	__crc32		, 1
+SYM_FUNC_END(crc32_be)
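The byteorder/bitorder pairing in the patch hinges on one identity: on a
little-endian CPU, REV (byte swap) followed by RBIT (full 64-bit bit
reversal) reverses the bits inside each byte while leaving the bytes in
place, which is exactly the reflection needed before feeding big-endian
(MSB-first) CRC data to the LSB-first crc32 instructions. A minimal
plain-C check of that identity (the helper names are illustrative, not
kernel code; __builtin_bswap64 assumes GCC or Clang):

#include <assert.h>
#include <stdint.h>

/* Plain-C models of the AArch64 REV and RBIT instructions, purely to
 * demonstrate the identity the macros rely on. */
static uint64_t rev64(uint64_t x)	/* REV: reverse byte order */
{
	return __builtin_bswap64(x);	/* GCC/Clang builtin */
}

static uint64_t rbit64(uint64_t x)	/* RBIT: reverse all 64 bits */
{
	uint64_t r = 0;
	for (int i = 0; i < 64; i++)
		r = (r << 1) | ((x >> i) & 1);
	return r;
}

static uint8_t rev8bits(uint8_t b)	/* reverse the 8 bits of one byte */
{
	uint8_t r = 0;
	for (int i = 0; i < 8; i++)
		r = (r << 1) | ((b >> i) & 1);
	return r;
}

int main(void)
{
	uint64_t x = 0x0123456789abcdefULL;
	uint64_t y = rbit64(rev64(x));	/* what byteorder + bitorder do */

	/* Every byte stays in its position but has its bits reversed. */
	for (int i = 0; i < 8; i++)
		assert((uint8_t)(y >> (8 * i)) ==
		       rev8bits((uint8_t)(x >> (8 * i))));
	return 0;
}

The matching rbit on w0 at the __crc32 macro's entry and exit points
applies the same reflection to the 32-bit accumulator itself.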
It makes no sense to leave crc32_be using the generic code while we
only accelerate the little-endian ops.

Even though the big-endian form doesn't fit as smoothly into the arm64,
we can speed it up and avoid hitting the D cache.

Tested on Cortex-A53. Without acceleration:

crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
crc32: self tests passed, processed 225944 bytes in 192240 nsec
crc32c: CRC_LE_BITS = 64
crc32c: self tests passed, processed 112972 bytes in 21360 nsec

With acceleration:

crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
crc32: self tests passed, processed 225944 bytes in 53480 nsec
crc32c: CRC_LE_BITS = 64
crc32c: self tests passed, processed 112972 bytes in 21480 nsec

Signed-off-by: Kevin Bracey <kevin@bracey.fi>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/lib/crc32.S | 87 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 73 insertions(+), 14 deletions(-)
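To see why the bit reversals make the hardware's reflected-polynomial
crc32 instructions compute CRC-32/BE (polynomial 0x04C11DB7, MSB-first),
here is a byte-at-a-time sketch in C using the ACLE intrinsics from
arm_acle.h. crc32_be_sketch is a hypothetical name, and it assumes an
AArch64 compiler with -march=armv8-a+crc; it models the trick rather
than reproducing the kernel's routine:

#include <arm_acle.h>	/* __crc32b, __rbit */
#include <stddef.h>
#include <stdint.h>

/* Byte-at-a-time model of the patch's approach: reflect the
 * accumulator in, reflect each data byte, drive the LSB-first crc32
 * instruction, and reflect the result back out. */
static uint32_t crc32_be_sketch(uint32_t crc, const uint8_t *p, size_t len)
{
	crc = __rbit(crc);		/* reflect the accumulator in */
	while (len--) {
		/* __rbit reverses 32 bits, leaving the reflected byte in
		 * bits 31..24; shift it back down for __crc32b. */
		uint8_t rb = (uint8_t)(__rbit(*p++) >> 24);
		crc = __crc32b(crc, rb);
	}
	return __rbit(crc);		/* reflect the result back out */
}

The assembly amortizes this by reflecting whole 64-bit words (REV plus
RBIT) and touching the accumulator only at entry and exit, which is
where the speedup measured above comes from.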