[v4] riscv: Optimize crc32 with Zbc extension

Message ID	20240315063828.341306-1-xiao.w.wang@intel.com (mailing list archive)
State	New
Headers	show Return-Path: <linux-riscv-bounces+linux-riscv=archiver.kernel.org@lists.infradead.org> From: Xiao Wang <xiao.w.wang@intel.com> To: paul.walmsley@sifive.com, palmer@dabbelt.com, aou@eecs.berkeley.edu Cc: ajones@ventanamicro.com, conor.dooley@microchip.com, heiko@sntech.de, david.laight@aculab.com, charlie@rivosinc.com, haicheng.li@intel.com, linux-riscv@lists.infradead.org, linux-kernel@vger.kernel.org, Xiao Wang <xiao.w.wang@intel.com> Subject: [PATCH v4] riscv: Optimize crc32 with Zbc extension Date: Fri, 15 Mar 2024 14:38:28 +0800 Message-Id: <20240315063828.341306-1-xiao.w.wang@intel.com> MIME-Version: 1.0 Precedence: list Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Sender: "linux-riscv" <linux-riscv-bounces@lists.infradead.org> Errors-To: linux-riscv-bounces+linux-riscv=archiver.kernel.org@lists.infradead.org
Series	[v4] riscv: Optimize crc32 with Zbc extension \| expand [v4] riscv: Optimize crc32 with Zbc extension

Context	Check	Description
conchuod/vmtest-for-next-PR	success	PR summary
conchuod/patch-1-test-1	success	.github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-1-test-2	success	.github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-1-test-3	success	.github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-1-test-4	success	.github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-1-test-5	success	.github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-1-test-6	warning	.github/scripts/patches/tests/checkpatch.sh
conchuod/patch-1-test-7	success	.github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-1-test-8	success	.github/scripts/patches/tests/header_inline.sh
conchuod/patch-1-test-9	success	.github/scripts/patches/tests/kdoc.sh
conchuod/patch-1-test-10	success	.github/scripts/patches/tests/module_param.sh
conchuod/patch-1-test-11	success	.github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-1-test-12	success	.github/scripts/patches/tests/verify_signedoff.sh

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 0bfcfec67ed5..68e1248bbee4 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -610,6 +610,29 @@ config RISCV_ISA_ZBB If you don't know what to do here, say Y. +config TOOLCHAIN_HAS_ZBC + bool + default y + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbc) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbc) + depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900 + depends on AS_HAS_OPTION_ARCH + +config RISCV_ISA_ZBC + bool "Zbc extension support for carry-less multiplication instructions" + depends on TOOLCHAIN_HAS_ZBC + depends on MMU + depends on RISCV_ALTERNATIVE + default y + help + Adds support to dynamically detect the presence of the Zbc + extension (carry-less multiplication) and enable its usage. + + The Zbc extension could accelerate CRC (cyclic redundancy check) + calculations. + + If you don't know what to do here, say Y. + config RISCV_ISA_ZICBOM bool "Zicbom extension support for non-coherent DMA operation" depends on MMU diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index bd6e6c1b0497..2b369f51b0a5 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -13,6 +13,7 @@ endif lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o +lib-$(CONFIG_RISCV_ISA_ZBC) += crc32.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_RISCV_ISA_V) += xor.o diff --git a/arch/riscv/lib/crc32.c b/arch/riscv/lib/crc32.c new file mode 100644 index 000000000000..d7dc599af3ef --- /dev/null +++ b/arch/riscv/lib/crc32.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC32 implementation with Zbc extension. + * + * Copyright (C) 2024 Intel Corporation + */ + +#include <asm/hwcap.h> +#include <asm/alternative-macros.h> +#include <asm/byteorder.h> + +#include <linux/types.h> +#include <linux/minmax.h> +#include <linux/crc32poly.h> +#include <linux/crc32.h> +#include <linux/byteorder/generic.h> + +/* + * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for + * better understanding of how this math works. + * + * let "+" denotes polynomial add (XOR) + * let "-" denotes polynomial sub (XOR) + * let "*" denotes polynomial multiplication + * let "/" denotes polynomial floor division + * let "S" denotes source data, XLEN bit wide + * let "P" denotes CRC32 polynomial + * let "T" denotes 2^(XLEN+32) + * let "QT" denotes quotient of T/P, with the bit for 2^XLEN being implicit + * + * crc32(S, P) + * => S * (2^32) - S * (2^32) / P * P + * => lowest 32 bits of: S * (2^32) / P * P + * => lowest 32 bits of: S * (2^32) * (T / P) / T * P + * => lowest 32 bits of: S * (2^32) * quotient / T * P + * => lowest 32 bits of: S * quotient / 2^XLEN * P + * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P + * => clmul_low_part(clmul_high_part(S, QT) + S, P) + * + * In terms of below implementations, the BE case is more intuitive, since the + * higher order bit sits at more significant position. + */ + +#if __riscv_xlen == 64 +/* Slide by XLEN bits per iteration */ +# define STEP_ORDER 3 + +/* Each below polynomial quotient has an implicit bit for 2^XLEN */ + +/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */ +# define CRC32_POLY_QT_LE 0x5a72d812fb808b20 + +/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */ +# define CRC32C_POLY_QT_LE 0xa434f61c6f5389f8 + +/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format, it should be + * the same as the bit-reversed version of CRC32_POLY_QT_LE + */ +# define CRC32_POLY_QT_BE 0x04d101df481b4e5a + +static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr) +{ + return (u64)crc ^ (__force u64)__cpu_to_le64(*ptr); +} + +static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt) +{ + u32 crc; + + /* We don't have a "clmulrh" insn, so use clmul + slli instead. */ + asm volatile (".option push\n" + ".option arch,+zbc\n" + "clmul %0, %1, %2\n" + "slli %0, %0, 1\n" + "xor %0, %0, %1\n" + "clmulr %0, %0, %3\n" + "srli %0, %0, 32\n" + ".option pop\n" + : "=&r" (crc) + : "r" (s), + "r" (poly_qt), + "r" ((u64)poly << 32) + :); + return crc; +} + +static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr) +{ + return ((u64)crc << 32) ^ (__force u64)__cpu_to_be64(*ptr); +} + +#elif __riscv_xlen == 32 +# define STEP_ORDER 2 +/* Each quotient should match the upper half of its analog in RV64 */ +# define CRC32_POLY_QT_LE 0xfb808b20 +# define CRC32C_POLY_QT_LE 0x6f5389f8 +# define CRC32_POLY_QT_BE 0x04d101df + +static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr) +{ + return crc ^ (__force u32)__cpu_to_le32(*ptr); +} + +static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt) +{ + u32 crc; + + /* We don't have a "clmulrh" insn, so use clmul + slli instead. */ + asm volatile (".option push\n" + ".option arch,+zbc\n" + "clmul %0, %1, %2\n" + "slli %0, %0, 1\n" + "xor %0, %0, %1\n" + "clmulr %0, %0, %3\n" + ".option pop\n" + : "=&r" (crc) + : "r" (s), + "r" (poly_qt), + "r" (poly) + :); + return crc; +} + +static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr) +{ + return crc ^ (__force u32)__cpu_to_be32(*ptr); +} + +#else +# error "Unexpected __riscv_xlen" +#endif + +static inline u32 crc32_be_zbc(unsigned long s) +{ + u32 crc; + + asm volatile (".option push\n" + ".option arch,+zbc\n" + "clmulh %0, %1, %2\n" + "xor %0, %0, %1\n" + "clmul %0, %0, %3\n" + ".option pop\n" + : "=&r" (crc) + : "r" (s), + "r" (CRC32_POLY_QT_BE), + "r" (CRC32_POLY_BE) + :); + return crc; +} + +#define STEP (1 << STEP_ORDER) +#define OFFSET_MASK (STEP - 1) + +typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len); + +static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p, + size_t len, u32 poly, + unsigned long poly_qt) +{ + size_t bits = len * 8; + unsigned long s = 0; + u32 crc_low = 0; + + for (int i = 0; i < len; i++) + s = ((unsigned long)*p++ << (__riscv_xlen - 8)) | (s >> 8); + + s ^= (unsigned long)crc << (__riscv_xlen - bits); + if (__riscv_xlen == 32 || len < sizeof(u32)) + crc_low = crc >> bits; + + crc = crc32_le_zbc(s, poly, poly_qt); + crc ^= crc_low; + + return crc; +} + +static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, + size_t len, u32 poly, + unsigned long poly_qt, + fallback crc_fb) +{ + size_t offset, head_len, tail_len; + unsigned long const *p_ul; + unsigned long s; + + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBC, 1) + : : : : legacy); + + /* Handle the unaligned head. */ + offset = (unsigned long)p & OFFSET_MASK; + if (offset && len) { + head_len = min(STEP - offset, len); + crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt); + p += head_len; + len -= head_len; + } + + tail_len = len & OFFSET_MASK; + len = len >> STEP_ORDER; + p_ul = (unsigned long const *)p; + + for (int i = 0; i < len; i++) { + s = crc32_le_prep(crc, p_ul); + crc = crc32_le_zbc(s, poly, poly_qt); + p_ul++; + } + + /* Handle the tail bytes. */ + p = (unsigned char const *)p_ul; + if (tail_len) + crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt); + + return crc; + +legacy: + return crc_fb(crc, p, len); +} + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE, + crc32_le_base); +} + +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRC32C_POLY_LE, + CRC32C_POLY_QT_LE, __crc32c_le_base); +} + +static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p, + size_t len) +{ + size_t bits = len * 8; + unsigned long s = 0; + u32 crc_low = 0; + + s = 0; + for (int i = 0; i < len; i++) + s = *p++ | (s << 8); + + if (__riscv_xlen == 32 || len < sizeof(u32)) { + s ^= crc >> (32 - bits); + crc_low = crc << bits; + } else { + s ^= (unsigned long)crc << (bits - 32); + } + + crc = crc32_be_zbc(s); + crc ^= crc_low; + + return crc; +} + +u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +{ + size_t offset, head_len, tail_len; + unsigned long const *p_ul; + unsigned long s; + + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBC, 1) + : : : : legacy); + + /* Handle the unaligned head. */ + offset = (unsigned long)p & OFFSET_MASK; + if (offset && len) { + head_len = min(STEP - offset, len); + crc = crc32_be_unaligned(crc, p, head_len); + p += head_len; + len -= head_len; + } + + tail_len = len & OFFSET_MASK; + len = len >> STEP_ORDER; + p_ul = (unsigned long const *)p; + + for (int i = 0; i < len; i++) { + s = crc32_be_prep(crc, p_ul); + crc = crc32_be_zbc(s); + p_ul++; + } + + /* Handle the tail bytes. */ + p = (unsigned char const *)p_ul; + if (tail_len) + crc = crc32_be_unaligned(crc, p, tail_len); + + return crc; + +legacy: + return crc32_be_base(crc, p, len); +} diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 9e8a032c1788..87f788c0d607 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -9,7 +9,9 @@ #include <linux/bitrev.h> u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len); u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len); /** * crc32_le_combine - Combine two crc32 check values into one. For two @@ -37,6 +39,7 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) } u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len); +u32 __pure __crc32c_le_base(u32 crc, unsigned char const *p, size_t len); /** * __crc32c_le_combine - Combine two crc32c check values into one. For two

[v4] riscv: Optimize crc32 with Zbc extension

Checks

Commit Message

Patch