Message ID | 20230130120128.1349464-5-ajones@ventanamicro.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Palmer Dabbelt |
Headers | show |
Series | RISC-V: Apply Zicboz to clear_page | expand |
Context | Check | Description |
---|---|---|
conchuod/tree_selection | fail | Failed to apply to next/pending-fixes or riscv/for-next |
On Mon, 30 Jan 2023 04:01:26 PST (-0800), ajones@ventanamicro.com wrote: > Using memset() to zero a 4K page takes 563 total instructions > where 20 are branches. clear_page() with Zicboz takes 150 total > instructions where 16 are branches. We could reduce the numbers > by further unrolling, but, since the cboz block size isn't fixed, > we'd need a Duff device to ensure we don't execute too many > unrolled steps. Also, cbo.zero doesn't take an offset, so each > unrolled step requires it and an add instruction. This increases > the chance for icache misses if we unroll many times. For these > reasons we only unroll four times. Unrolling four times should be > safe as it supports cboz block sizes up to 1K when used with 4K > pages and it's only 24 to 32 bytes of unrolled instructions. > > Another note about the Duff device idea is that it would probably > be best to store the number of steps needed at boot time and then > load the value in clear_page(). Calculating it in clear_page(), > particularly without the Zbb extension, would not be efficient. > > Signed-off-by: Andrew Jones <ajones@ventanamicro.com> > Acked-by: Conor Dooley <conor.dooley@microchip.com> > --- > arch/riscv/Kconfig | 13 +++++++++++ > arch/riscv/include/asm/insn-def.h | 4 ++++ > arch/riscv/include/asm/page.h | 6 +++++- > arch/riscv/lib/Makefile | 1 + > arch/riscv/lib/clear_page.S | 36 +++++++++++++++++++++++++++++++ > 5 files changed, 59 insertions(+), 1 deletion(-) > create mode 100644 arch/riscv/lib/clear_page.S > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index 33bbdc33cef8..3759a2f6edd5 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -432,6 +432,19 @@ config RISCV_ISA_ZICBOM > > If you don't know what to do here, say Y. > > +config RISCV_ISA_ZICBOZ > + bool "Zicboz extension support for faster zeroing of memory" > + depends on !XIP_KERNEL && MMU > + select RISCV_ALTERNATIVE > + default y > + help > + Enable the use of the ZICBOZ extension (cbo.zero instruction) > + when available. > + > + The Zicboz extension is used for faster zeroing of memory. > + > + If you don't know what to do here, say Y. > + > config TOOLCHAIN_HAS_ZIHINTPAUSE > bool > default y > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h > index e01ab51f50d2..6960beb75f32 100644 > --- a/arch/riscv/include/asm/insn-def.h > +++ b/arch/riscv/include/asm/insn-def.h > @@ -192,4 +192,8 @@ > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ > RS1(base), SIMM12(2)) > > +#define CBO_zero(base) \ > + INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ > + RS1(base), SIMM12(4)) > + > #endif /* __ASM_INSN_DEF_H */ > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index 9f432c1b5289..ccd168fe29d2 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -49,10 +49,14 @@ > > #ifndef __ASSEMBLY__ > > +#ifdef CONFIG_RISCV_ISA_ZICBOZ > +void clear_page(void *page); > +#else > #define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) > +#endif > #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) > > -#define clear_user_page(pgaddr, vaddr, page) memset((pgaddr), 0, PAGE_SIZE) > +#define clear_user_page(pgaddr, vaddr, page) clear_page(pgaddr) > #define copy_user_page(vto, vfrom, vaddr, topg) \ > memcpy((vto), (vfrom), PAGE_SIZE) > > diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile > index 25d5c9664e57..9ee5e2ab5143 100644 > --- a/arch/riscv/lib/Makefile > +++ b/arch/riscv/lib/Makefile > @@ -5,5 +5,6 @@ lib-y += memset.o > lib-y += memmove.o > lib-$(CONFIG_MMU) += uaccess.o > lib-$(CONFIG_64BIT) += tishift.o > +lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o > > obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o > diff --git a/arch/riscv/lib/clear_page.S b/arch/riscv/lib/clear_page.S > new file mode 100644 > index 000000000000..49f29139a5b6 > --- /dev/null > +++ b/arch/riscv/lib/clear_page.S > @@ -0,0 +1,36 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2023 Ventana Micro Systems Inc. > + */ > + > +#include <linux/linkage.h> > +#include <asm/asm.h> > +#include <asm/alternative-macros.h> > +#include <asm/hwcap.h> > +#include <asm/insn-def.h> > +#include <asm/page.h> > + > +/* void clear_page(void *page) */ > +ENTRY(__clear_page) > +WEAK(clear_page) > + li a2, PAGE_SIZE > + ALTERNATIVE("j .Lno_zicboz", "nop", > + 0, RISCV_ISA_EXT_ZICBOZ, CONFIG_RISCV_ISA_ZICBOZ) > + la a1, riscv_cboz_block_size > + lw a1, 0(a1) You should be able to just "lw a1, riscv_cboz_block_size", which can sometimes generate better code. > + add a2, a0, a2 > +.Lzero_loop: > + CBO_zero(a0) > + add a0, a0, a1 > + CBO_zero(a0) > + add a0, a0, a1 We were talking about this in the patchwork call: this risks overflow if the block size is bigger than a quarter of a page. That's probably pretty rare, but given that there's already an alternative for the jump it's easy to check. > + CBO_zero(a0) > + add a0, a0, a1 > + CBO_zero(a0) > + add a0, a0, a1 > + bltu a0, a2, .Lzero_loop > + ret > +.Lno_zicboz: > + li a1, 0 > + tail __memset > +END(__clear_page)
On Wed, Feb 01, 2023 at 08:35:54PM -0800, Palmer Dabbelt wrote: > On Mon, 30 Jan 2023 04:01:26 PST (-0800), ajones@ventanamicro.com wrote: > > Using memset() to zero a 4K page takes 563 total instructions > > where 20 are branches. clear_page() with Zicboz takes 150 total > > instructions where 16 are branches. We could reduce the numbers > > by further unrolling, but, since the cboz block size isn't fixed, > > we'd need a Duff device to ensure we don't execute too many > > unrolled steps. Also, cbo.zero doesn't take an offset, so each > > unrolled step requires it and an add instruction. This increases > > the chance for icache misses if we unroll many times. For these > > reasons we only unroll four times. Unrolling four times should be > > safe as it supports cboz block sizes up to 1K when used with 4K > > pages and it's only 24 to 32 bytes of unrolled instructions. > > > > Another note about the Duff device idea is that it would probably > > be best to store the number of steps needed at boot time and then > > load the value in clear_page(). Calculating it in clear_page(), > > particularly without the Zbb extension, would not be efficient. > > > > Signed-off-by: Andrew Jones <ajones@ventanamicro.com> > > Acked-by: Conor Dooley <conor.dooley@microchip.com> > > --- > > arch/riscv/Kconfig | 13 +++++++++++ > > arch/riscv/include/asm/insn-def.h | 4 ++++ > > arch/riscv/include/asm/page.h | 6 +++++- > > arch/riscv/lib/Makefile | 1 + > > arch/riscv/lib/clear_page.S | 36 +++++++++++++++++++++++++++++++ > > 5 files changed, 59 insertions(+), 1 deletion(-) > > create mode 100644 arch/riscv/lib/clear_page.S > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > > index 33bbdc33cef8..3759a2f6edd5 100644 > > --- a/arch/riscv/Kconfig > > +++ b/arch/riscv/Kconfig > > @@ -432,6 +432,19 @@ config RISCV_ISA_ZICBOM > > > > If you don't know what to do here, say Y. > > > > +config RISCV_ISA_ZICBOZ > > + bool "Zicboz extension support for faster zeroing of memory" > > + depends on !XIP_KERNEL && MMU > > + select RISCV_ALTERNATIVE > > + default y > > + help > > + Enable the use of the ZICBOZ extension (cbo.zero instruction) > > + when available. > > + > > + The Zicboz extension is used for faster zeroing of memory. > > + > > + If you don't know what to do here, say Y. > > + > > config TOOLCHAIN_HAS_ZIHINTPAUSE > > bool > > default y > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h > > index e01ab51f50d2..6960beb75f32 100644 > > --- a/arch/riscv/include/asm/insn-def.h > > +++ b/arch/riscv/include/asm/insn-def.h > > @@ -192,4 +192,8 @@ > > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ > > RS1(base), SIMM12(2)) > > > > +#define CBO_zero(base) \ > > + INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ > > + RS1(base), SIMM12(4)) > > + > > #endif /* __ASM_INSN_DEF_H */ > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > > index 9f432c1b5289..ccd168fe29d2 100644 > > --- a/arch/riscv/include/asm/page.h > > +++ b/arch/riscv/include/asm/page.h > > @@ -49,10 +49,14 @@ > > > > #ifndef __ASSEMBLY__ > > > > +#ifdef CONFIG_RISCV_ISA_ZICBOZ > > +void clear_page(void *page); > > +#else > > #define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) > > +#endif > > #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) > > > > -#define clear_user_page(pgaddr, vaddr, page) memset((pgaddr), 0, PAGE_SIZE) > > +#define clear_user_page(pgaddr, vaddr, page) clear_page(pgaddr) > > #define copy_user_page(vto, vfrom, vaddr, topg) \ > > memcpy((vto), (vfrom), PAGE_SIZE) > > > > diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile > > index 25d5c9664e57..9ee5e2ab5143 100644 > > --- a/arch/riscv/lib/Makefile > > +++ b/arch/riscv/lib/Makefile > > @@ -5,5 +5,6 @@ lib-y += memset.o > > lib-y += memmove.o > > lib-$(CONFIG_MMU) += uaccess.o > > lib-$(CONFIG_64BIT) += tishift.o > > +lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o > > > > obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o > > diff --git a/arch/riscv/lib/clear_page.S b/arch/riscv/lib/clear_page.S > > new file mode 100644 > > index 000000000000..49f29139a5b6 > > --- /dev/null > > +++ b/arch/riscv/lib/clear_page.S > > @@ -0,0 +1,36 @@ > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > +/* > > + * Copyright (c) 2023 Ventana Micro Systems Inc. > > + */ > > + > > +#include <linux/linkage.h> > > +#include <asm/asm.h> > > +#include <asm/alternative-macros.h> > > +#include <asm/hwcap.h> > > +#include <asm/insn-def.h> > > +#include <asm/page.h> > > + > > +/* void clear_page(void *page) */ > > +ENTRY(__clear_page) > > +WEAK(clear_page) > > + li a2, PAGE_SIZE > > + ALTERNATIVE("j .Lno_zicboz", "nop", > > + 0, RISCV_ISA_EXT_ZICBOZ, CONFIG_RISCV_ISA_ZICBOZ) > > + la a1, riscv_cboz_block_size > > + lw a1, 0(a1) > > You should be able to just "lw a1, riscv_cboz_block_size", which can > sometimes generate better code. > > > + add a2, a0, a2 > > +.Lzero_loop: > > + CBO_zero(a0) > > + add a0, a0, a1 > > + CBO_zero(a0) > > + add a0, a0, a1 > > We were talking about this in the patchwork call: this risks overflow if the > block size is bigger than a quarter of a page. That's probably pretty rare, > but given that there's already an alternative for the jump it's easy to > check. I'll pull v4 together with your fixes. Thanks! drew > > > + CBO_zero(a0) > > + add a0, a0, a1 > > + CBO_zero(a0) > > + add a0, a0, a1 > > + bltu a0, a2, .Lzero_loop > > + ret > > +.Lno_zicboz: > > + li a1, 0 > > + tail __memset > > +END(__clear_page)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 33bbdc33cef8..3759a2f6edd5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -432,6 +432,19 @@ config RISCV_ISA_ZICBOM If you don't know what to do here, say Y. +config RISCV_ISA_ZICBOZ + bool "Zicboz extension support for faster zeroing of memory" + depends on !XIP_KERNEL && MMU + select RISCV_ALTERNATIVE + default y + help + Enable the use of the ZICBOZ extension (cbo.zero instruction) + when available. + + The Zicboz extension is used for faster zeroing of memory. + + If you don't know what to do here, say Y. + config TOOLCHAIN_HAS_ZIHINTPAUSE bool default y diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index e01ab51f50d2..6960beb75f32 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -192,4 +192,8 @@ INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ RS1(base), SIMM12(2)) +#define CBO_zero(base) \ + INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ + RS1(base), SIMM12(4)) + #endif /* __ASM_INSN_DEF_H */ diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 9f432c1b5289..ccd168fe29d2 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -49,10 +49,14 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_RISCV_ISA_ZICBOZ +void clear_page(void *page); +#else #define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) +#endif #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(pgaddr, vaddr, page) memset((pgaddr), 0, PAGE_SIZE) +#define clear_user_page(pgaddr, vaddr, page) clear_page(pgaddr) #define copy_user_page(vto, vfrom, vaddr, topg) \ memcpy((vto), (vfrom), PAGE_SIZE) diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 25d5c9664e57..9ee5e2ab5143 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -5,5 +5,6 @@ lib-y += memset.o lib-y += memmove.o lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o +lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/riscv/lib/clear_page.S b/arch/riscv/lib/clear_page.S new file mode 100644 index 000000000000..49f29139a5b6 --- /dev/null +++ b/arch/riscv/lib/clear_page.S @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023 Ventana Micro Systems Inc. + */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/alternative-macros.h> +#include <asm/hwcap.h> +#include <asm/insn-def.h> +#include <asm/page.h> + +/* void clear_page(void *page) */ +ENTRY(__clear_page) +WEAK(clear_page) + li a2, PAGE_SIZE + ALTERNATIVE("j .Lno_zicboz", "nop", + 0, RISCV_ISA_EXT_ZICBOZ, CONFIG_RISCV_ISA_ZICBOZ) + la a1, riscv_cboz_block_size + lw a1, 0(a1) + add a2, a0, a2 +.Lzero_loop: + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + bltu a0, a2, .Lzero_loop + ret +.Lno_zicboz: + li a1, 0 + tail __memset +END(__clear_page)