| Field | Value |
|---|---|
| Message ID | 20230113212301.3534711-3-heiko@sntech.de (mailing list archive) |
| State | Accepted |
| Commit | b6fcdb191e36f82336f9b5e126d51c02e7323480 |
| Delegated to | Palmer Dabbelt |
| Series | Zbb string optimizations |
| Context | Check | Description |
|---|---|---|
| conchuod/tree_selection | fail | Failed to apply to next/pending-fixes or riscv/for-next |
On Fri, Jan 13, 2023 at 10:23:01PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
>
> Add handling for ZBB extension and add support for using it as a
> variant for optimized string functions.
>
> Support for the Zbb-str-variants is limited to the GNU-assembler
> for now, as LLVM has not yet acquired the functionality to
> selectively change the arch option in assembler code.
> This is still under review at
> https://reviews.llvm.org/D123515
>
> Co-developed-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>

Had a look through, again mostly examining b4 diff... The changes look
to match the suggestions on v4, but I was happy with the un-optimised
version so I have no taste.

Reviewed-by: Conor Dooley <conor.dooley@microchip.com>

Thanks,
Conor.
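The approach described in the commit message — detect the extension once, then route string calls to an optimized variant — can be pictured in C as a one-time selection step. The sketch below is only an analogy: the kernel patches a nop/jump in place via ALTERNATIVE() rather than calling through a pointer, and cpu_has_zbb(), string_ops_init() and the stub implementations are hypothetical names, not anything from the patch.

```c
/*
 * Userspace-style analogy of boot-time variant selection.
 * All names here are illustrative; the real mechanism in the series is
 * in-place code patching via ALTERNATIVE(), not an indirect call.
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Stand-in bodies; the real optimized variant would use Zbb instructions. */
static size_t strlen_basic(const char *s)    { return strlen(s); }
static size_t strlen_zbb_stub(const char *s) { return strlen(s); }

/* Placeholder probe: how Zbb is actually detected is out of scope here. */
static bool cpu_has_zbb(void) { return false; }

static size_t (*strlen_impl)(const char *) = strlen_basic;

void string_ops_init(void)
{
	/* Decide once at startup, like the alternatives patching does at boot. */
	if (cpu_has_zbb())
		strlen_impl = strlen_zbb_stub;
}
```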
On Fri, Jan 13, 2023 at 10:23:01PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
>
> Add handling for ZBB extension and add support for using it as a
> variant for optimized string functions.
>
> Support for the Zbb-str-variants is limited to the GNU-assembler
> for now, as LLVM has not yet acquired the functionality to
> selectively change the arch option in assembler code.
> This is still under review at
> https://reviews.llvm.org/D123515
>
> Co-developed-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
> ---
>  arch/riscv/Kconfig                   |  24 ++++++
>  arch/riscv/include/asm/errata_list.h |   3 +-
>  arch/riscv/include/asm/hwcap.h       |   1 +
>  arch/riscv/kernel/cpu.c              |   1 +
>  arch/riscv/kernel/cpufeature.c       |  18 +++++
>  arch/riscv/lib/strcmp.S              |  85 ++++++++++++++++++++++
>  arch/riscv/lib/strlen.S              | 105 +++++++++++++++++++++++++++
>  arch/riscv/lib/strncmp.S             |  98 +++++++++++++++++++++++++
>  8 files changed, 334 insertions(+), 1 deletion(-)
...
> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
...
>  static struct riscv_isa_ext_data isa_ext_arr[] = {
> +    __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),

Zb* comes after Zi* according to 27.11 "Subset Naming Convention".
I can hear you groaning! The other lists are OK, because we decided to
just keep those in alphabetical order.

> diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
...
> +/*
> + * Variant of strcmp using the ZBB extension if available

Could call out the source of this being Appendix A of the bitmanip
manual. Same suggestion for the other variants' header comments as well.

> + */
> +#ifdef CONFIG_RISCV_ISA_ZBB
> +strcmp_zbb:
...
> +     * Clobbers
> +     * t0, t1, t2, t3, t4, t5

t5 isn't used.

> diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
...
> +    .p2align 3
> +1:
> +    REG_L   t1, SZREG(t0)
> +    addi    t0, t0, SZREG
> +    orc.b   t1, t1
> +    beq     t1, t3, 1b
> +2:

This 2 label is never used, so it can be removed. I see the appendix
also has an unused label here, .Lepilogue, which could also be removed.

> +    not     t1, t1
> +    CZ      t1, t1
> +
> +    /* Get number of processed words. */
                                    ^ bytes
> +    sub     t2, t0, t2
> +
> +    /* Add number of characters in the first word. */
> +    add     a0, a0, t2
> +    srli    t1, t1, 3

I'd move this shift up under the CZ a few lines above, or down under the
next comment, since it's not related to what the comment above it says.

> +
> +    /* Add number of characters in the last word. */
> +    add     a0, a0, t1
...
> diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
...
> +    /* Simple loop for misaligned strings. */
> +3:

This label isn't any different from the next one, label 4, so it should
be removed.

> +    /* Restore limit for slow-path. */

nit: We're not really "restoring" anything, but rather using the limit,
which is saved in a different register.

> +    .p2align 3
> +4:
...

Thanks,
drew
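For readers following the strlen review comments above, this is roughly what strlen_zbb computes, written as a plain C sketch: orc.b is modelled by a helper loop, ctz by __builtin_ctzl, and little-endian byte order is assumed (matching the srl/ctz path in the patch). strlen_wordwise and orc_b are illustrative names; the aligned over-read of the first word mirrors what the assembly does and is not valid portable C.

```c
#include <stddef.h>
#include <stdint.h>

/* Model of orc.b: each byte becomes 0xff if non-zero, 0x00 if zero. */
static unsigned long orc_b(unsigned long w)
{
	unsigned long r = 0;
	for (size_t i = 0; i < sizeof(w); i++)
		if ((w >> (8 * i)) & 0xff)
			r |= 0xffUL << (8 * i);
	return r;
}

size_t strlen_wordwise(const char *s)
{
	const size_t szreg = sizeof(unsigned long);
	uintptr_t addr = (uintptr_t)s;
	const unsigned long *p = (const unsigned long *)(addr & ~(uintptr_t)(szreg - 1));
	size_t skipped = addr & (szreg - 1);   /* bytes before the string in the first word */
	size_t valid = szreg - skipped;        /* string bytes covered by the first word */

	/*
	 * Aligned load that may read bytes before the string, exactly as the
	 * assembly does; shift them away so they appear as zero padding at the top.
	 */
	unsigned long first = *p >> (8 * skipped);
	unsigned long zeroes = ~orc_b(first);  /* 0xff marks a zero byte */
	size_t pos = zeroes ? (size_t)__builtin_ctzl(zeroes) / 8 : szreg;
	if (pos < valid)
		return pos;                    /* NUL inside the first word */

	/* Aligned main loop: stop at the first word that contains a zero byte. */
	const unsigned long *q = p + 1;
	while (orc_b(*q) == ~0UL)
		q++;

	return valid                                                 /* first word */
	       + (size_t)((const char *)q - (const char *)(p + 1))  /* full middle words */
	       + (size_t)__builtin_ctzl(~orc_b(*q)) / 8;             /* chars in last word */
}
```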
On Fri, Jan 13, 2023 at 10:23:01PM +0100, Heiko Stuebner wrote:
...
> diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> index 8babd712b958..8148b6418f61 100644
> --- a/arch/riscv/lib/strcmp.S
> +++ b/arch/riscv/lib/strcmp.S
> @@ -3,9 +3,14 @@
>  #include <linux/linkage.h>
>  #include <asm/asm.h>
>  #include <asm-generic/export.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/errata_list.h>
>
>  /* int strcmp(const char *cs, const char *ct) */
>  SYM_FUNC_START(strcmp)
> +
> +    ALTERNATIVE("nop", "j strcmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> +

I have something similar for Zicboz (which I've just reread as I'm
preparing v2). The difference is that I opted to penalize the
non-optimized version with the unconditional jump and give the
optimized version the nop. To do that here, it'd just need the label
changed to strcmp_basic or whatever, push the "basic" code down under
the new label, and then put the new zbb code here.

Thanks,
drew
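The layout trade-off drew describes can be expressed with a branch hint in C: whichever variant sits on the fall-through path runs straight through, while the other pays a taken jump. This is only an analogy for the in-place ALTERNATIVE patching; have_zbb, strcmp_basic and strcmp_zbb_fast are made-up names and their bodies are stand-ins.

```c
/* Stand-in bodies so the sketch compiles; the real ones are the assembly variants. */
static int strcmp_basic(const char *a, const char *b)    { return __builtin_strcmp(a, b); }
static int strcmp_zbb_fast(const char *a, const char *b) { return __builtin_strcmp(a, b); }

static int have_zbb;   /* would be fixed once at boot */

/* Layout as posted: Zbb hardware takes the (patched-in) jump to reach the fast path. */
int my_strcmp(const char *a, const char *b)
{
	if (__builtin_expect(have_zbb, 0))
		return strcmp_zbb_fast(a, b);
	return strcmp_basic(a, b);
}

/* Suggested layout: the optimized variant is the fall-through, the basic one pays the jump. */
int my_strcmp_flipped(const char *a, const char *b)
{
	if (__builtin_expect(!have_zbb, 0))
		return strcmp_basic(a, b);
	return strcmp_zbb_fast(a, b);
}
```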
Hi,

On Fri, Jan 13, 2023 at 10:23:01PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
>
> Add handling for ZBB extension and add support for using it as a
> variant for optimized string functions.
>
> Support for the Zbb-str-variants is limited to the GNU-assembler
> for now, as LLVM has not yet acquired the functionality to
> selectively change the arch option in assembler code.
> This is still under review at
> https://reviews.llvm.org/D123515
>
> Co-developed-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>

With this patch in linux-next, the qemu "virt" emulation no longer finds
a root filesystem. I don't know how it is related, but the problem is
seen for all types of devices I tried to boot from. Booting from initrd
still works.

There are also repeated

    WARNING: CPU: 0 PID: 0 at arch/riscv/kernel/patch.c:63 patch_insn_write+0x222/0x2f6

in the log, but that appears to be unrelated.

Guenter

---
bisect log:

# bad: [6ba8a227fd19d19779005fb66ad7562608e1df83] Add linux-next specific files for 20230210
# good: [4ec5183ec48656cec489c49f989c508b68b518e3] Linux 6.2-rc7
git bisect start 'HEAD' 'v6.2-rc7'
# bad: [94613f0efc69ed41f9229ef5c294db3ec37145da] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
git bisect bad 94613f0efc69ed41f9229ef5c294db3ec37145da
# bad: [8928ece68de4371dc6e1d3336d3029c1e18ae3b4] Merge branch 'for_next' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git
git bisect bad 8928ece68de4371dc6e1d3336d3029c1e18ae3b4
# good: [78a9f460e33d103256f3abb38f02f4759710c7dc] soc: document merges
git bisect good 78a9f460e33d103256f3abb38f02f4759710c7dc
# good: [b35b2472ebafa29d0bbe79fbee1da6ef3c4ec619] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git
git bisect good b35b2472ebafa29d0bbe79fbee1da6ef3c4ec619
# bad: [57a87a64b520c37dd49f5fde84d09a4adb391180] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git
git bisect bad 57a87a64b520c37dd49f5fde84d09a4adb391180
# good: [cfc8ba01cc84b24ec6eb293ec9fba893f7cd4581] Merge branch 'clk-next' of git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git
git bisect good cfc8ba01cc84b24ec6eb293ec9fba893f7cd4581
# good: [6acecfa485d3de955c35a18730c106ddf1e7600e] powerpc/kcsan: Add KCSAN Support
git bisect good 6acecfa485d3de955c35a18730c106ddf1e7600e
# good: [8a16dea453dbc3e834b162640028e505882cd11e] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
git bisect good 8a16dea453dbc3e834b162640028e505882cd11e
# good: [6be1ff430dab9fc047762b10b2c9669399ea1f37] riscv: pgtable: Fixup comment for KERN_VIRT_SIZE
git bisect good 6be1ff430dab9fc047762b10b2c9669399ea1f37
# good: [e0c267e03b0c77c9ac79ac08eada41ba8eb1b95f] riscv: module: move find_section to module.h
git bisect good e0c267e03b0c77c9ac79ac08eada41ba8eb1b95f
# good: [e8ad17d2b5f38e595d597a3e2419d6d7cc727b17] riscv: KVM: Switch has_svinval() to riscv_has_extension_unlikely()
git bisect good e8ad17d2b5f38e595d597a3e2419d6d7cc727b17
# bad: [75ab93a244a516d1d3c03c4e27d5d0deff76ebfb] Merge patch series "Zbb string optimizations"
git bisect bad 75ab93a244a516d1d3c03c4e27d5d0deff76ebfb
# bad: [b6fcdb191e36f82336f9b5e126d51c02e7323480] RISC-V: add zbb support to string functions
git bisect bad b6fcdb191e36f82336f9b5e126d51c02e7323480
# good: [56e0790c7f9e59ba6a0f4b59981d1d6fbf43efb0] RISC-V: add infrastructure to allow different str* implementations
git bisect good 56e0790c7f9e59ba6a0f4b59981d1d6fbf43efb0
# first bad commit: [b6fcdb191e36f82336f9b5e126d51c02e7323480] RISC-V: add zbb support to string functions
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e2b656043abf..7c814fbf9527 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -416,6 +416,30 @@ config RISCV_ISA_SVPBMT If you don't know what to do here, say Y. +config TOOLCHAIN_HAS_ZBB + bool + default y + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb) + depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900 + depends on AS_IS_GNU + +config RISCV_ISA_ZBB + bool "Zbb extension support for bit manipulation instructions" + depends on TOOLCHAIN_HAS_ZBB + depends on !XIP_KERNEL && MMU + select RISCV_ALTERNATIVE + default y + help + Adds support to dynamically detect the presence of the ZBB + extension (basic bit manipulation) and enable its usage. + + The Zbb extension provides instructions to accelerate a number + of bit-specific operations (count bit population, sign extending, + bitrotation, etc). + + If you don't know what to do here, say Y. + config TOOLCHAIN_HAS_ZICBOM bool default y diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 4180312d2a70..95e626b7281e 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -24,7 +24,8 @@ #define CPUFEATURE_SVPBMT 0 #define CPUFEATURE_ZICBOM 1 -#define CPUFEATURE_NUMBER 2 +#define CPUFEATURE_ZBB 2 +#define CPUFEATURE_NUMBER 3 #ifdef __ASSEMBLY__ diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 57439da71c77..462d6cde9bac 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -58,6 +58,7 @@ enum riscv_isa_ext_id { RISCV_ISA_EXT_SSTC, RISCV_ISA_EXT_SVINVAL, RISCV_ISA_EXT_SVPBMT, + RISCV_ISA_EXT_ZBB, RISCV_ISA_EXT_ZICBOM, RISCV_ISA_EXT_ZIHINTPAUSE, RISCV_ISA_EXT_ID_MAX diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 0bf1c7f663fc..420228e219f7 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -185,6 +185,7 @@ arch_initcall(riscv_cpuinfo_init); * New entries to this struct should follow the ordering rules described above. */ static struct riscv_isa_ext_data isa_ext_arr[] = { + __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB), __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index dde0e91d7668..9899806cef29 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -227,6 +227,7 @@ void __init riscv_fill_hwcap(void) SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC); SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL); SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT); + SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB); SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM); SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE); } @@ -302,6 +303,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage) return true; } +static bool __init_or_module cpufeature_probe_zbb(unsigned int stage) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB)) + return false; + + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) + return false; + + if (!riscv_isa_extension_available(NULL, ZBB)) + return false; + + return true; +} + /* * Probe presence of individual extensions. 
* @@ -320,6 +335,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage) if (cpufeature_probe_zicbom(stage)) cpu_req_feature |= BIT(CPUFEATURE_ZICBOM); + if (cpufeature_probe_zbb(stage)) + cpu_req_feature |= BIT(CPUFEATURE_ZBB); + return cpu_req_feature; } diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S index 8babd712b958..8148b6418f61 100644 --- a/arch/riscv/lib/strcmp.S +++ b/arch/riscv/lib/strcmp.S @@ -3,9 +3,14 @@ #include <linux/linkage.h> #include <asm/asm.h> #include <asm-generic/export.h> +#include <asm/alternative-macros.h> +#include <asm/errata_list.h> /* int strcmp(const char *cs, const char *ct) */ SYM_FUNC_START(strcmp) + + ALTERNATIVE("nop", "j strcmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB) + /* * Returns * a0 - comparison result, value like strcmp @@ -33,4 +38,84 @@ SYM_FUNC_START(strcmp) */ sub a0, t0, t1 ret + +/* + * Variant of strcmp using the ZBB extension if available + */ +#ifdef CONFIG_RISCV_ISA_ZBB +strcmp_zbb: + +.option push +.option arch,+zbb + + /* + * Returns + * a0 - comparison result, value like strcmp + * + * Parameters + * a0 - string1 + * a1 - string2 + * + * Clobbers + * t0, t1, t2, t3, t4, t5 + */ + + or t2, a0, a1 + li t4, -1 + and t2, t2, SZREG-1 + bnez t2, 3f + + /* Main loop for aligned string. */ + .p2align 3 +1: + REG_L t0, 0(a0) + REG_L t1, 0(a1) + orc.b t3, t0 + bne t3, t4, 2f + addi a0, a0, SZREG + addi a1, a1, SZREG + beq t0, t1, 1b + + /* + * Words don't match, and no null byte in the first + * word. Get bytes in big-endian order and compare. + */ +#ifndef CONFIG_CPU_BIG_ENDIAN + rev8 t0, t0 + rev8 t1, t1 +#endif + + /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */ + sltu a0, t0, t1 + neg a0, a0 + ori a0, a0, 1 + ret + +2: + /* + * Found a null byte. + * If words don't match, fall back to simple loop. + */ + bne t0, t1, 3f + + /* Otherwise, strings are equal. */ + li a0, 0 + ret + + /* Simple loop for misaligned strings. */ + .p2align 3 +3: + lbu t0, 0(a0) + lbu t1, 0(a1) + addi a0, a0, 1 + addi a1, a1, 1 + bne t0, t1, 4f + bnez t0, 3b + +4: + sub a0, t0, t1 + ret + +.option pop +#endif SYM_FUNC_END(strcmp) diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S index 0a3b11853efd..0f9dbf93301a 100644 --- a/arch/riscv/lib/strlen.S +++ b/arch/riscv/lib/strlen.S @@ -3,9 +3,14 @@ #include <linux/linkage.h> #include <asm/asm.h> #include <asm-generic/export.h> +#include <asm/alternative-macros.h> +#include <asm/errata_list.h> /* int strlen(const char *s) */ SYM_FUNC_START(strlen) + + ALTERNATIVE("nop", "j strlen_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB) + /* * Returns * a0 - string length @@ -25,4 +30,104 @@ SYM_FUNC_START(strlen) 2: sub a0, t1, a0 ret + +/* + * Variant of strlen using the ZBB extension if available + */ +#ifdef CONFIG_RISCV_ISA_ZBB +strlen_zbb: + +#ifdef CONFIG_CPU_BIG_ENDIAN +# define CZ clz +# define SHIFT sll +#else +# define CZ ctz +# define SHIFT srl +#endif + +.option push +.option arch,+zbb + + /* + * Returns + * a0 - string length + * + * Parameters + * a0 - String to measure + * + * Clobbers + * t0, t1, t2, t3 + */ + + /* Number of irrelevant bytes in the first word. */ + andi t2, a0, SZREG-1 + + /* Align pointer. */ + andi t0, a0, -SZREG + + li t3, SZREG + sub t3, t3, t2 + slli t2, t2, 3 + + /* Get the first word. */ + REG_L t1, 0(t0) + + /* + * Shift away the partial data we loaded to remove the irrelevant bytes + * preceding the string with the effect of adding NUL bytes at the + * end of the string's first word. 
+ */ + SHIFT t1, t1, t2 + + /* Convert non-NUL into 0xff and NUL into 0x00. */ + orc.b t1, t1 + + /* Convert non-NUL into 0x00 and NUL into 0xff. */ + not t1, t1 + + /* + * Search for the first set bit (corresponding to a NUL byte in the + * original chunk). + */ + CZ t1, t1 + + /* + * The first chunk is special: compare against the number + * of valid bytes in this chunk. + */ + srli a0, t1, 3 + bgtu t3, a0, 3f + + /* Prepare for the word comparison loop. */ + addi t2, t0, SZREG + li t3, -1 + + /* + * Our critical loop is 4 instructions and processes data in + * 4 byte or 8 byte chunks. + */ + .p2align 3 +1: + REG_L t1, SZREG(t0) + addi t0, t0, SZREG + orc.b t1, t1 + beq t1, t3, 1b +2: + not t1, t1 + CZ t1, t1 + + /* Get number of processed words. */ + sub t2, t0, t2 + + /* Add number of characters in the first word. */ + add a0, a0, t2 + srli t1, t1, 3 + + /* Add number of characters in the last word. */ + add a0, a0, t1 +3: + ret + +.option pop +#endif SYM_FUNC_END(strlen) diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S index 1f644d0a93f6..7940ddab2d48 100644 --- a/arch/riscv/lib/strncmp.S +++ b/arch/riscv/lib/strncmp.S @@ -3,9 +3,14 @@ #include <linux/linkage.h> #include <asm/asm.h> #include <asm-generic/export.h> +#include <asm/alternative-macros.h> +#include <asm/errata_list.h> /* int strncmp(const char *cs, const char *ct, size_t count) */ SYM_FUNC_START(strncmp) + + ALTERNATIVE("nop", "j strncmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB) + /* * Returns * a0 - comparison result, value like strncmp @@ -38,4 +43,97 @@ SYM_FUNC_START(strncmp) */ sub a0, t0, t1 ret + +/* + * Variant of strncmp using the ZBB extension if available + */ +#ifdef CONFIG_RISCV_ISA_ZBB +strncmp_zbb: + +.option push +.option arch,+zbb + + /* + * Returns + * a0 - comparison result, like strncmp + * + * Parameters + * a0 - string1 + * a1 - string2 + * a2 - number of characters to compare + * + * Clobbers + * t0, t1, t2, t3, t4, t5, t6 + */ + + or t2, a0, a1 + li t5, -1 + and t2, t2, SZREG-1 + add t4, a0, a2 + bnez t2, 4f + + /* Adjust limit for fast-path. */ + andi t6, t4, -SZREG + + /* Main loop for aligned string. */ + .p2align 3 +1: + bgt a0, t6, 3f + REG_L t0, 0(a0) + REG_L t1, 0(a1) + orc.b t3, t0 + bne t3, t5, 2f + addi a0, a0, SZREG + addi a1, a1, SZREG + beq t0, t1, 1b + + /* + * Words don't match, and no null byte in the first + * word. Get bytes in big-endian order and compare. + */ +#ifndef CONFIG_CPU_BIG_ENDIAN + rev8 t0, t0 + rev8 t1, t1 +#endif + + /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */ + sltu a0, t0, t1 + neg a0, a0 + ori a0, a0, 1 + ret + +2: + /* + * Found a null byte. + * If words don't match, fall back to simple loop. + */ + bne t0, t1, 3f + + /* Otherwise, strings are equal. */ + li a0, 0 + ret + + /* Simple loop for misaligned strings. */ +3: + /* Restore limit for slow-path. */ + .p2align 3 +4: + bge a0, t4, 6f + lbu t0, 0(a0) + lbu t1, 0(a1) + addi a0, a0, 1 + addi a1, a1, 1 + bne t0, t1, 5f + bnez t0, 4b + +5: + sub a0, t0, t1 + ret + +6: + li a0, 0 + ret + +.option pop +#endif SYM_FUNC_END(strncmp)
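Since the routines in this patch special-case alignment, partial first/last words and NUL placement, a quick way to gain confidence in word-at-a-time string code is to diff it against the byte-wise behaviour across offsets. The harness below is a hypothetical userspace sketch, not part of the series; point strcmp_under_test/strlen_under_test at the implementation being checked (here they default to the libc functions, so it trivially passes).

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

static int sign(int x) { return (x > 0) - (x < 0); }

/* Stand-ins: redefine these to the implementation being checked. */
#define strcmp_under_test strcmp
#define strlen_under_test strlen

int main(void)
{
	static const char *samples[] = {
		"", "a", "ab", "abc", "abcdefg", "abcdefgh", "abcdefghi",
		"abcdefgh\xffxyz", "abcdefgHijklmnop",
	};
	char buf1[64], buf2[64];

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		for (size_t j = 0; j < sizeof(samples) / sizeof(samples[0]); j++) {
			/* Try every misalignment of both operands within a word. */
			for (size_t o1 = 0; o1 < 8; o1++) {
				for (size_t o2 = 0; o2 < 8; o2++) {
					strcpy(buf1 + o1, samples[i]);
					strcpy(buf2 + o2, samples[j]);
					assert(strlen_under_test(buf1 + o1) == strlen(samples[i]));
					assert(sign(strcmp_under_test(buf1 + o1, buf2 + o2)) ==
					       sign(strcmp(samples[i], samples[j])));
				}
			}
		}
	}
	printf("string self-test passed\n");
	return 0;
}
```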