diff mbox series

riscv: errata: thead: use riscv_nonstd_cache_ops for CMO

Message ID 20230912072740.2544-1-jszhang@kernel.org (mailing list archive)
State Superseded
Headers show
Series riscv: errata: thead: use riscv_nonstd_cache_ops for CMO | expand

Checks

Context Check Description
conchuod/tree_selection fail Failed to apply to next/pending-fixes, riscv/for-next or riscv/master
conchuod/vmtest-fixes-PR pending PR summary

Commit Message

Jisheng Zhang Sept. 12, 2023, 7:27 a.m. UTC
Previously, we use alternative mechanism to dynamically patch
the CMO operations for THEAD C906/C910 during boot for performance
reason. But as pointed out by Arnd, "there is already a significant
cost in accessing the invalidated cache lines afterwards, which is
likely going to be much higher than the cost of an indirect branch".
And indeed, there's no performance difference with GMAC and EMMC per
my test on Sipeed Lichee Pi 4A board.

Use riscv_nonstd_cache_ops for THEAD C906/C910 CMO to simplify
the alternative code, and to acchieve Arnd's goal -- "I think
moving the THEAD ops at the same level as all nonstandard operations
makes sense, but I'd still leave CMO as an explicit fast path that
avoids the indirect branch. This seems like the right thing to do both
for readability and for platforms on which the indirect branch has a
noticeable overhead."

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
---
 arch/riscv/Kconfig.errata            |  1 +
 arch/riscv/errata/thead/errata.c     | 76 +++++++++++++++++++++++++++-
 arch/riscv/include/asm/errata_list.h | 50 +++---------------
 3 files changed, 81 insertions(+), 46 deletions(-)

Comments

Conor Dooley Sept. 12, 2023, 8:58 a.m. UTC | #1
Hey Jisheng,

On Tue, Sep 12, 2023 at 03:27:40PM +0800, Jisheng Zhang wrote:
> Previously, we use alternative mechanism to dynamically patch
> the CMO operations for THEAD C906/C910 during boot for performance
> reason. But as pointed out by Arnd, "there is already a significant
> cost in accessing the invalidated cache lines afterwards, which is
> likely going to be much higher than the cost of an indirect branch".
> And indeed, there's no performance difference with GMAC and EMMC per
> my test on Sipeed Lichee Pi 4A board.

Cool, thanks for working on this.

> Use riscv_nonstd_cache_ops for THEAD C906/C910 CMO to simplify
> the alternative code, and to acchieve Arnd's goal -- "I think
> moving the THEAD ops at the same level as all nonstandard operations
> makes sense, but I'd still leave CMO as an explicit fast path that
> avoids the indirect branch. This seems like the right thing to do both
> for readability and for platforms on which the indirect branch has a
> noticeable overhead."
> 
> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> ---
>  arch/riscv/Kconfig.errata            |  1 +
>  arch/riscv/errata/thead/errata.c     | 76 +++++++++++++++++++++++++++-
>  arch/riscv/include/asm/errata_list.h | 50 +++---------------
>  3 files changed, 81 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
> index 566bcefeab50..d7972914f9de 100644
> --- a/arch/riscv/Kconfig.errata
> +++ b/arch/riscv/Kconfig.errata
> @@ -78,6 +78,7 @@ config ERRATA_THEAD_CMO
>  	bool "Apply T-Head cache management errata"
>  	depends on ERRATA_THEAD && MMU
>  	select RISCV_DMA_NONCOHERENT
> +	select RISCV_NONSTANDARD_CACHE_OPS
>  	default y
>  	help
>  	  This will apply the cache management errata to handle the
> diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
> index 0554ed4bf087..1c320abfe446 100644
> --- a/arch/riscv/errata/thead/errata.c
> +++ b/arch/riscv/errata/thead/errata.c
> @@ -12,8 +12,10 @@
>  #include <asm/alternative.h>
>  #include <asm/cacheflush.h>
>  #include <asm/cpufeature.h>
> +#include <asm/dma-noncoherent.h>
>  #include <asm/errata_list.h>
>  #include <asm/hwprobe.h>
> +#include <asm/io.h>
>  #include <asm/patch.h>
>  #include <asm/vendorid_list.h>
>  
> @@ -33,6 +35,75 @@ static bool errata_probe_pbmt(unsigned int stage,
>  	return false;
>  }
>  
> +/*
> + * dcache.ipa rs1 (invalidate, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01010      rs1       000      00000  0001011
> + * dache.iva rs1 (invalida, virtual address)
> + *   0000001    00110      rs1       000      00000  0001011
> + *
> + * dcache.cpa rs1 (clean, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01001      rs1       000      00000  0001011
> + * dcache.cva rs1 (clean, virtual address)
> + *   0000001    00101      rs1       000      00000  0001011
> + *
> + * dcache.cipa rs1 (clean then invalidate, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01011      rs1       000      00000  0001011
> + * dcache.civa rs1 (... virtual address)
> + *   0000001    00111      rs1       000      00000  0001011
> + *
> + * sync.s (make sure all cache operations finished)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000000    11001     00000      000      00000  0001011
> + */
> +#define THEAD_inval_A0	".long 0x0265000b"
> +#define THEAD_clean_A0	".long 0x0255000b"
> +#define THEAD_flush_A0	".long 0x0275000b"
> +#define THEAD_SYNC_S	".long 0x0190000b"
> +
> +#define THEAD_CMO_OP(_op, _start, _size, _cachesize)			\
> +asm volatile("mv a0, %1\n\t"						\
> +	     "j 2f\n\t"							\
> +	     "3:\n\t"							\
> +	     THEAD_##_op##_A0 "\n\t"					\
> +	     "add a0, a0, %0\n\t"					\
> +	     "2:\n\t"							\
> +	     "bltu a0, %2, 3b\n\t"					\
> +	     THEAD_SYNC_S						\
> +	     : : "r"(_cachesize),					\
> +		 "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)),	\
> +		 "r"((unsigned long)(_start) + (_size))			\
> +	     : "a0")
> +
> +static void thead_errata_cache_inv(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
> +
> +	THEAD_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
> +}
> +
> +static void thead_errata_cache_wback(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
> +
> +	THEAD_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
> +}
> +
> +static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
> +
> +	THEAD_CMO_OP(flush, vaddr, size, riscv_cbom_block_size);
> +}
> +
> +static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
> +	.wback = &thead_errata_cache_wback,
> +	.inv = &thead_errata_cache_inv,
> +	.wback_inv = &thead_errata_cache_wback_inv,
> +};
> +
>  static bool errata_probe_cmo(unsigned int stage,
>  			     unsigned long arch_id, unsigned long impid)
>  {
> @@ -48,6 +119,8 @@ static bool errata_probe_cmo(unsigned int stage,
>  	if (stage == RISCV_ALTERNATIVES_BOOT) {
>  		riscv_cbom_block_size = L1_CACHE_BYTES;
>  		riscv_noncoherent_supported();
> +		if (IS_ENABLED(CONFIG_RISCV_NONSTANDARD_CACHE_OPS))

This `if (IS_ENABLED())` seems unneeded - the function starts with:
	if (!IS_ENABLED(CONFIG_ERRATA_THEAD_CMO))
		return false;
so if we get to this point, the newly added if will always evaluate to
true AFAICT. Otherwise this looks good to me.

Thanks,
Conor.


> +			riscv_noncoherent_register_cache_ops(&thead_errata_cmo_ops);
>  	}
>  
>  	return true;
> @@ -77,8 +150,7 @@ static u32 thead_errata_probe(unsigned int stage,
>  	if (errata_probe_pbmt(stage, archid, impid))
>  		cpu_req_errata |= BIT(ERRATA_THEAD_PBMT);
>  
> -	if (errata_probe_cmo(stage, archid, impid))
> -		cpu_req_errata |= BIT(ERRATA_THEAD_CMO);
> +	errata_probe_cmo(stage, archid, impid);
>  
>  	if (errata_probe_pmu(stage, archid, impid))
>  		cpu_req_errata |= BIT(ERRATA_THEAD_PMU);
> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> index b55b434f0059..ea33288f8a25 100644
> --- a/arch/riscv/include/asm/errata_list.h
> +++ b/arch/riscv/include/asm/errata_list.h
> @@ -24,9 +24,8 @@
>  
>  #ifdef CONFIG_ERRATA_THEAD
>  #define	ERRATA_THEAD_PBMT 0
> -#define	ERRATA_THEAD_CMO 1
> -#define	ERRATA_THEAD_PMU 2
> -#define	ERRATA_THEAD_NUMBER 3
> +#define	ERRATA_THEAD_PMU 1
> +#define	ERRATA_THEAD_NUMBER 2
>  #endif
>  
>  #ifdef __ASSEMBLY__
> @@ -94,54 +93,17 @@ asm volatile(ALTERNATIVE(						\
>  #define ALT_THEAD_PMA(_val)
>  #endif
>  
> -/*
> - * dcache.ipa rs1 (invalidate, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01010      rs1       000      00000  0001011
> - * dache.iva rs1 (invalida, virtual address)
> - *   0000001    00110      rs1       000      00000  0001011
> - *
> - * dcache.cpa rs1 (clean, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01001      rs1       000      00000  0001011
> - * dcache.cva rs1 (clean, virtual address)
> - *   0000001    00101      rs1       000      00000  0001011
> - *
> - * dcache.cipa rs1 (clean then invalidate, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01011      rs1       000      00000  0001011
> - * dcache.civa rs1 (... virtual address)
> - *   0000001    00111      rs1       000      00000  0001011
> - *
> - * sync.s (make sure all cache operations finished)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000000    11001     00000      000      00000  0001011
> - */
> -#define THEAD_inval_A0	".long 0x0265000b"
> -#define THEAD_clean_A0	".long 0x0255000b"
> -#define THEAD_flush_A0	".long 0x0275000b"
> -#define THEAD_SYNC_S	".long 0x0190000b"
> -
>  #define ALT_CMO_OP(_op, _start, _size, _cachesize)			\
> -asm volatile(ALTERNATIVE_2(						\
> -	__nops(6),							\
> +asm volatile(ALTERNATIVE(						\
> +	__nops(5),							\
>  	"mv a0, %1\n\t"							\
>  	"j 2f\n\t"							\
>  	"3:\n\t"							\
>  	CBO_##_op(a0)							\
>  	"add a0, a0, %0\n\t"						\
>  	"2:\n\t"							\
> -	"bltu a0, %2, 3b\n\t"						\
> -	"nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM,	\
> -	"mv a0, %1\n\t"							\
> -	"j 2f\n\t"							\
> -	"3:\n\t"							\
> -	THEAD_##_op##_A0 "\n\t"						\
> -	"add a0, a0, %0\n\t"						\
> -	"2:\n\t"							\
> -	"bltu a0, %2, 3b\n\t"						\
> -	THEAD_SYNC_S, THEAD_VENDOR_ID,					\
> -			ERRATA_THEAD_CMO, CONFIG_ERRATA_THEAD_CMO)	\
> +	"bltu a0, %2, 3b\n\t",						\
> +	0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM)		\
>  	: : "r"(_cachesize),						\
>  	    "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)),	\
>  	    "r"((unsigned long)(_start) + (_size))			\
> -- 
> 2.40.1
>
Guo Ren Sept. 12, 2023, 10:53 a.m. UTC | #2
On Tue, Sep 12, 2023 at 03:27:40PM +0800, Jisheng Zhang wrote:
> Previously, we use alternative mechanism to dynamically patch
> the CMO operations for THEAD C906/C910 during boot for performance
> reason. But as pointed out by Arnd, "there is already a significant
> cost in accessing the invalidated cache lines afterwards, which is
> likely going to be much higher than the cost of an indirect branch".
> And indeed, there's no performance difference with GMAC and EMMC per
> my test on Sipeed Lichee Pi 4A board.
> 
> Use riscv_nonstd_cache_ops for THEAD C906/C910 CMO to simplify
> the alternative code, and to acchieve Arnd's goal -- "I think
> moving the THEAD ops at the same level as all nonstandard operations
> makes sense, but I'd still leave CMO as an explicit fast path that
> avoids the indirect branch. This seems like the right thing to do both
> for readability and for platforms on which the indirect branch has a
> noticeable overhead."
> 
> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> ---
>  arch/riscv/Kconfig.errata            |  1 +
>  arch/riscv/errata/thead/errata.c     | 76 +++++++++++++++++++++++++++-
>  arch/riscv/include/asm/errata_list.h | 50 +++---------------
>  3 files changed, 81 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
> index 566bcefeab50..d7972914f9de 100644
> --- a/arch/riscv/Kconfig.errata
> +++ b/arch/riscv/Kconfig.errata
> @@ -78,6 +78,7 @@ config ERRATA_THEAD_CMO
>  	bool "Apply T-Head cache management errata"
>  	depends on ERRATA_THEAD && MMU
>  	select RISCV_DMA_NONCOHERENT
> +	select RISCV_NONSTANDARD_CACHE_OPS
>  	default y
>  	help
>  	  This will apply the cache management errata to handle the
> diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
> index 0554ed4bf087..1c320abfe446 100644
> --- a/arch/riscv/errata/thead/errata.c
> +++ b/arch/riscv/errata/thead/errata.c
> @@ -12,8 +12,10 @@
>  #include <asm/alternative.h>
>  #include <asm/cacheflush.h>
>  #include <asm/cpufeature.h>
> +#include <asm/dma-noncoherent.h>
>  #include <asm/errata_list.h>
>  #include <asm/hwprobe.h>
> +#include <asm/io.h>
>  #include <asm/patch.h>
>  #include <asm/vendorid_list.h>
>  
> @@ -33,6 +35,75 @@ static bool errata_probe_pbmt(unsigned int stage,
>  	return false;
>  }
>  
> +/*
> + * dcache.ipa rs1 (invalidate, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01010      rs1       000      00000  0001011
> + * dache.iva rs1 (invalida, virtual address)
> + *   0000001    00110      rs1       000      00000  0001011
Remove dache.iva rs1 ...

> + *
> + * dcache.cpa rs1 (clean, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01001      rs1       000      00000  0001011
Remove dcache.cpa rs1 ...

> + * dcache.cva rs1 (clean, virtual address)
> + *   0000001    00101      rs1       000      00000  0001011
Remove dcache.cva rs1 ...

> + *
> + * dcache.cipa rs1 (clean then invalidate, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01011      rs1       000      00000  0001011
> + * dcache.civa rs1 (... virtual address)
> + *   0000001    00111      rs1       000      00000  0001011
Remove dcache.civa rs1 ...

> + *
> + * sync.s (make sure all cache operations finished)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000000    11001     00000      000      00000  0001011
> + */
> +#define THEAD_inval_A0	".long 0x0265000b"
> +#define THEAD_clean_A0	".long 0x0255000b"
> +#define THEAD_flush_A0	".long 0x0275000b"
> +#define THEAD_SYNC_S	".long 0x0190000b"
> +
> +#define THEAD_CMO_OP(_op, _start, _size, _cachesize)			\
> +asm volatile("mv a0, %1\n\t"						\
> +	     "j 2f\n\t"							\
> +	     "3:\n\t"							\
> +	     THEAD_##_op##_A0 "\n\t"					\
> +	     "add a0, a0, %0\n\t"					\
> +	     "2:\n\t"							\
> +	     "bltu a0, %2, 3b\n\t"					\
> +	     THEAD_SYNC_S						\
> +	     : : "r"(_cachesize),					\
> +		 "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)),	\
> +		 "r"((unsigned long)(_start) + (_size))			\
> +	     : "a0")
> +
> +static void thead_errata_cache_inv(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
No need to phys_to_virt, and we could use paddr directly (dcache.ipa
rs1).

> +
> +	THEAD_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
> +}
> +
> +static void thead_errata_cache_wback(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
> +
> +	THEAD_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
> +}
Please remove the thead_errata_cache_wback because T-HEAD processors
would prioritize using an invalid cacheline instead of evicting an
existing cacheline. When we do dcache clean, the following operations
are to let other interconnect masters read. So, keeping wback_inv for
T-HEAD processors is the best choice, and maybe some other processors'
vendor has a different idea, but please use the wback_inv instead of
wback_only for the T-HEAD processors.

> +
> +static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
Ditto.

> +
> +	THEAD_CMO_OP(flush, vaddr, size, riscv_cbom_block_size);
> +}
> +
> +static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
> +	.wback = &thead_errata_cache_wback,
Please  .wback = &thead_errata_cache_wback_inv,


> +	.inv = &thead_errata_cache_inv,
> +	.wback_inv = &thead_errata_cache_wback_inv,
> +};
> +
>  static bool errata_probe_cmo(unsigned int stage,
>  			     unsigned long arch_id, unsigned long impid)
>  {
> @@ -48,6 +119,8 @@ static bool errata_probe_cmo(unsigned int stage,
>  	if (stage == RISCV_ALTERNATIVES_BOOT) {
>  		riscv_cbom_block_size = L1_CACHE_BYTES;
>  		riscv_noncoherent_supported();
> +		if (IS_ENABLED(CONFIG_RISCV_NONSTANDARD_CACHE_OPS))
> +			riscv_noncoherent_register_cache_ops(&thead_errata_cmo_ops);
>  	}
>  
>  	return true;
> @@ -77,8 +150,7 @@ static u32 thead_errata_probe(unsigned int stage,
>  	if (errata_probe_pbmt(stage, archid, impid))
>  		cpu_req_errata |= BIT(ERRATA_THEAD_PBMT);
>  
> -	if (errata_probe_cmo(stage, archid, impid))
> -		cpu_req_errata |= BIT(ERRATA_THEAD_CMO);
> +	errata_probe_cmo(stage, archid, impid);
>  
>  	if (errata_probe_pmu(stage, archid, impid))
>  		cpu_req_errata |= BIT(ERRATA_THEAD_PMU);
> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> index b55b434f0059..ea33288f8a25 100644
> --- a/arch/riscv/include/asm/errata_list.h
> +++ b/arch/riscv/include/asm/errata_list.h
> @@ -24,9 +24,8 @@
>  
>  #ifdef CONFIG_ERRATA_THEAD
>  #define	ERRATA_THEAD_PBMT 0
> -#define	ERRATA_THEAD_CMO 1
> -#define	ERRATA_THEAD_PMU 2
> -#define	ERRATA_THEAD_NUMBER 3
> +#define	ERRATA_THEAD_PMU 1
> +#define	ERRATA_THEAD_NUMBER 2
>  #endif
>  
>  #ifdef __ASSEMBLY__
> @@ -94,54 +93,17 @@ asm volatile(ALTERNATIVE(						\
>  #define ALT_THEAD_PMA(_val)
>  #endif
>  
> -/*
> - * dcache.ipa rs1 (invalidate, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01010      rs1       000      00000  0001011
> - * dache.iva rs1 (invalida, virtual address)
> - *   0000001    00110      rs1       000      00000  0001011
> - *
> - * dcache.cpa rs1 (clean, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01001      rs1       000      00000  0001011
> - * dcache.cva rs1 (clean, virtual address)
> - *   0000001    00101      rs1       000      00000  0001011
> - *
> - * dcache.cipa rs1 (clean then invalidate, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01011      rs1       000      00000  0001011
> - * dcache.civa rs1 (... virtual address)
> - *   0000001    00111      rs1       000      00000  0001011
> - *
> - * sync.s (make sure all cache operations finished)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000000    11001     00000      000      00000  0001011
> - */
> -#define THEAD_inval_A0	".long 0x0265000b"
> -#define THEAD_clean_A0	".long 0x0255000b"
> -#define THEAD_flush_A0	".long 0x0275000b"
> -#define THEAD_SYNC_S	".long 0x0190000b"
> -
>  #define ALT_CMO_OP(_op, _start, _size, _cachesize)			\
> -asm volatile(ALTERNATIVE_2(						\
> -	__nops(6),							\
> +asm volatile(ALTERNATIVE(						\
> +	__nops(5),							\
>  	"mv a0, %1\n\t"							\
>  	"j 2f\n\t"							\
>  	"3:\n\t"							\
>  	CBO_##_op(a0)							\
>  	"add a0, a0, %0\n\t"						\
>  	"2:\n\t"							\
> -	"bltu a0, %2, 3b\n\t"						\
> -	"nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM,	\
> -	"mv a0, %1\n\t"							\
> -	"j 2f\n\t"							\
> -	"3:\n\t"							\
> -	THEAD_##_op##_A0 "\n\t"						\
> -	"add a0, a0, %0\n\t"						\
> -	"2:\n\t"							\
> -	"bltu a0, %2, 3b\n\t"						\
> -	THEAD_SYNC_S, THEAD_VENDOR_ID,					\
> -			ERRATA_THEAD_CMO, CONFIG_ERRATA_THEAD_CMO)	\
> +	"bltu a0, %2, 3b\n\t",						\
> +	0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM)		\
>  	: : "r"(_cachesize),						\
>  	    "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)),	\
>  	    "r"((unsigned long)(_start) + (_size))			\
> -- 
> 2.40.1
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
>
Jessica Clarke Sept. 12, 2023, 7 p.m. UTC | #3
On 12 Sep 2023, at 11:53, Guo Ren <guoren@kernel.org> wrote:
> 
> On Tue, Sep 12, 2023 at 03:27:40PM +0800, Jisheng Zhang wrote:
>> Previously, we use alternative mechanism to dynamically patch
>> the CMO operations for THEAD C906/C910 during boot for performance
>> reason. But as pointed out by Arnd, "there is already a significant
>> cost in accessing the invalidated cache lines afterwards, which is
>> likely going to be much higher than the cost of an indirect branch".
>> And indeed, there's no performance difference with GMAC and EMMC per
>> my test on Sipeed Lichee Pi 4A board.
>> 
>> Use riscv_nonstd_cache_ops for THEAD C906/C910 CMO to simplify
>> the alternative code, and to acchieve Arnd's goal -- "I think
>> moving the THEAD ops at the same level as all nonstandard operations
>> makes sense, but I'd still leave CMO as an explicit fast path that
>> avoids the indirect branch. This seems like the right thing to do both
>> for readability and for platforms on which the indirect branch has a
>> noticeable overhead."
>> 
>> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
>> ---
>> arch/riscv/Kconfig.errata            |  1 +
>> arch/riscv/errata/thead/errata.c     | 76 +++++++++++++++++++++++++++-
>> arch/riscv/include/asm/errata_list.h | 50 +++---------------
>> 3 files changed, 81 insertions(+), 46 deletions(-)
>> 
>> diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
>> index 566bcefeab50..d7972914f9de 100644
>> --- a/arch/riscv/Kconfig.errata
>> +++ b/arch/riscv/Kconfig.errata
>> @@ -78,6 +78,7 @@ config ERRATA_THEAD_CMO
>> bool "Apply T-Head cache management errata"
>> depends on ERRATA_THEAD && MMU
>> select RISCV_DMA_NONCOHERENT
>> + select RISCV_NONSTANDARD_CACHE_OPS
>> default y
>> help
>>   This will apply the cache management errata to handle the
>> diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
>> index 0554ed4bf087..1c320abfe446 100644
>> --- a/arch/riscv/errata/thead/errata.c
>> +++ b/arch/riscv/errata/thead/errata.c
>> @@ -12,8 +12,10 @@
>> #include <asm/alternative.h>
>> #include <asm/cacheflush.h>
>> #include <asm/cpufeature.h>
>> +#include <asm/dma-noncoherent.h>
>> #include <asm/errata_list.h>
>> #include <asm/hwprobe.h>
>> +#include <asm/io.h>
>> #include <asm/patch.h>
>> #include <asm/vendorid_list.h>
>> 
>> @@ -33,6 +35,75 @@ static bool errata_probe_pbmt(unsigned int stage,
>> return false;
>> }
>> 
>> +/*
>> + * dcache.ipa rs1 (invalidate, physical address)
>> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> + *   0000001    01010      rs1       000      00000  0001011
>> + * dache.iva rs1 (invalida, virtual address)
>> + *   0000001    00110      rs1       000      00000  0001011
> Remove dache.iva rs1 ...
> 
>> + *
>> + * dcache.cpa rs1 (clean, physical address)
>> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> + *   0000001    01001      rs1       000      00000  0001011
> Remove dcache.cpa rs1 ...
> 
>> + * dcache.cva rs1 (clean, virtual address)
>> + *   0000001    00101      rs1       000      00000  0001011
> Remove dcache.cva rs1 ...
> 
>> + *
>> + * dcache.cipa rs1 (clean then invalidate, physical address)
>> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> + *   0000001    01011      rs1       000      00000  0001011
>> + * dcache.civa rs1 (... virtual address)
>> + *   0000001    00111      rs1       000      00000  0001011
> Remove dcache.civa rs1 ...
> 
>> + *
>> + * sync.s (make sure all cache operations finished)
>> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> + *   0000000    11001     00000      000      00000  0001011
>> + */
>> +#define THEAD_inval_A0 ".long 0x0265000b"
>> +#define THEAD_clean_A0 ".long 0x0255000b"
>> +#define THEAD_flush_A0 ".long 0x0275000b"
>> +#define THEAD_SYNC_S ".long 0x0190000b"
>> +
>> +#define THEAD_CMO_OP(_op, _start, _size, _cachesize) \
>> +asm volatile("mv a0, %1\n\t" \
>> +      "j 2f\n\t" \
>> +      "3:\n\t" \
>> +      THEAD_##_op##_A0 "\n\t" \
>> +      "add a0, a0, %0\n\t" \
>> +      "2:\n\t" \
>> +      "bltu a0, %2, 3b\n\t" \
>> +      THEAD_SYNC_S \
>> +      : : "r"(_cachesize), \
>> +  "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)), \
>> +  "r"((unsigned long)(_start) + (_size)) \
>> +      : "a0")
>> +
>> +static void thead_errata_cache_inv(phys_addr_t paddr, size_t size)
>> +{
>> + void *vaddr = phys_to_virt(paddr);
> No need to phys_to_virt, and we could use paddr directly (dcache.ipa
> rs1).
> 
>> +
>> + THEAD_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
>> +}
>> +
>> +static void thead_errata_cache_wback(phys_addr_t paddr, size_t size)
>> +{
>> + void *vaddr = phys_to_virt(paddr);
>> +
>> + THEAD_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
>> +}
> Please remove the thead_errata_cache_wback because T-HEAD processors
> would prioritize using an invalid cacheline instead of evicting an
> existing cacheline. When we do dcache clean, the following operations
> are to let other interconnect masters read. So, keeping wback_inv for
> T-HEAD processors is the best choice, and maybe some other processors'
> vendor has a different idea, but please use the wback_inv instead of
> wback_only for the T-HEAD processors.

Unless you can demonstrate that your cores have significantly worse
performance when using wback instead of wback_inv I do not think the
non-standard implementation should deviate from the semantics of the
standard one. There are efforts to unify the implemented semantics of
the operations across architectures and this would obstruct those.

Jess

>> +
>> +static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size)
>> +{
>> + void *vaddr = phys_to_virt(paddr);
> Ditto.
> 
>> +
>> + THEAD_CMO_OP(flush, vaddr, size, riscv_cbom_block_size);
>> +}
>> +
>> +static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
>> + .wback = &thead_errata_cache_wback,
> Please  .wback = &thead_errata_cache_wback_inv,
> 
> 
>> + .inv = &thead_errata_cache_inv,
>> + .wback_inv = &thead_errata_cache_wback_inv,
>> +};
>> +
>> static bool errata_probe_cmo(unsigned int stage,
>>      unsigned long arch_id, unsigned long impid)
>> {
>> @@ -48,6 +119,8 @@ static bool errata_probe_cmo(unsigned int stage,
>> if (stage == RISCV_ALTERNATIVES_BOOT) {
>> riscv_cbom_block_size = L1_CACHE_BYTES;
>> riscv_noncoherent_supported();
>> + if (IS_ENABLED(CONFIG_RISCV_NONSTANDARD_CACHE_OPS))
>> + riscv_noncoherent_register_cache_ops(&thead_errata_cmo_ops);
>> }
>> 
>> return true;
>> @@ -77,8 +150,7 @@ static u32 thead_errata_probe(unsigned int stage,
>> if (errata_probe_pbmt(stage, archid, impid))
>> cpu_req_errata |= BIT(ERRATA_THEAD_PBMT);
>> 
>> - if (errata_probe_cmo(stage, archid, impid))
>> - cpu_req_errata |= BIT(ERRATA_THEAD_CMO);
>> + errata_probe_cmo(stage, archid, impid);
>> 
>> if (errata_probe_pmu(stage, archid, impid))
>> cpu_req_errata |= BIT(ERRATA_THEAD_PMU);
>> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
>> index b55b434f0059..ea33288f8a25 100644
>> --- a/arch/riscv/include/asm/errata_list.h
>> +++ b/arch/riscv/include/asm/errata_list.h
>> @@ -24,9 +24,8 @@
>> 
>> #ifdef CONFIG_ERRATA_THEAD
>> #define ERRATA_THEAD_PBMT 0
>> -#define ERRATA_THEAD_CMO 1
>> -#define ERRATA_THEAD_PMU 2
>> -#define ERRATA_THEAD_NUMBER 3
>> +#define ERRATA_THEAD_PMU 1
>> +#define ERRATA_THEAD_NUMBER 2
>> #endif
>> 
>> #ifdef __ASSEMBLY__
>> @@ -94,54 +93,17 @@ asm volatile(ALTERNATIVE( \
>> #define ALT_THEAD_PMA(_val)
>> #endif
>> 
>> -/*
>> - * dcache.ipa rs1 (invalidate, physical address)
>> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> - *   0000001    01010      rs1       000      00000  0001011
>> - * dache.iva rs1 (invalida, virtual address)
>> - *   0000001    00110      rs1       000      00000  0001011
>> - *
>> - * dcache.cpa rs1 (clean, physical address)
>> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> - *   0000001    01001      rs1       000      00000  0001011
>> - * dcache.cva rs1 (clean, virtual address)
>> - *   0000001    00101      rs1       000      00000  0001011
>> - *
>> - * dcache.cipa rs1 (clean then invalidate, physical address)
>> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> - *   0000001    01011      rs1       000      00000  0001011
>> - * dcache.civa rs1 (... virtual address)
>> - *   0000001    00111      rs1       000      00000  0001011
>> - *
>> - * sync.s (make sure all cache operations finished)
>> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
>> - *   0000000    11001     00000      000      00000  0001011
>> - */
>> -#define THEAD_inval_A0 ".long 0x0265000b"
>> -#define THEAD_clean_A0 ".long 0x0255000b"
>> -#define THEAD_flush_A0 ".long 0x0275000b"
>> -#define THEAD_SYNC_S ".long 0x0190000b"
>> -
>> #define ALT_CMO_OP(_op, _start, _size, _cachesize) \
>> -asm volatile(ALTERNATIVE_2( \
>> - __nops(6), \
>> +asm volatile(ALTERNATIVE( \
>> + __nops(5), \
>> "mv a0, %1\n\t" \
>> "j 2f\n\t" \
>> "3:\n\t" \
>> CBO_##_op(a0) \
>> "add a0, a0, %0\n\t" \
>> "2:\n\t" \
>> - "bltu a0, %2, 3b\n\t" \
>> - "nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM, \
>> - "mv a0, %1\n\t" \
>> - "j 2f\n\t" \
>> - "3:\n\t" \
>> - THEAD_##_op##_A0 "\n\t" \
>> - "add a0, a0, %0\n\t" \
>> - "2:\n\t" \
>> - "bltu a0, %2, 3b\n\t" \
>> - THEAD_SYNC_S, THEAD_VENDOR_ID, \
>> - ERRATA_THEAD_CMO, CONFIG_ERRATA_THEAD_CMO) \
>> + "bltu a0, %2, 3b\n\t", \
>> + 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM) \
>> : : "r"(_cachesize), \
>>     "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)), \
>>     "r"((unsigned long)(_start) + (_size)) \
>> -- 
>> 2.40.1
>> 
>> 
>> _______________________________________________
>> linux-riscv mailing list
>> linux-riscv@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
>> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Guo Ren Sept. 13, 2023, 12:06 a.m. UTC | #4
On Wed, Sep 13, 2023 at 3:00 AM Jessica Clarke <jrtc27@jrtc27.com> wrote:
>
> On 12 Sep 2023, at 11:53, Guo Ren <guoren@kernel.org> wrote:
> >
> > On Tue, Sep 12, 2023 at 03:27:40PM +0800, Jisheng Zhang wrote:
> >> Previously, we use alternative mechanism to dynamically patch
> >> the CMO operations for THEAD C906/C910 during boot for performance
> >> reason. But as pointed out by Arnd, "there is already a significant
> >> cost in accessing the invalidated cache lines afterwards, which is
> >> likely going to be much higher than the cost of an indirect branch".
> >> And indeed, there's no performance difference with GMAC and EMMC per
> >> my test on Sipeed Lichee Pi 4A board.
> >>
> >> Use riscv_nonstd_cache_ops for THEAD C906/C910 CMO to simplify
> >> the alternative code, and to acchieve Arnd's goal -- "I think
> >> moving the THEAD ops at the same level as all nonstandard operations
> >> makes sense, but I'd still leave CMO as an explicit fast path that
> >> avoids the indirect branch. This seems like the right thing to do both
> >> for readability and for platforms on which the indirect branch has a
> >> noticeable overhead."
> >>
> >> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> >> ---
> >> arch/riscv/Kconfig.errata            |  1 +
> >> arch/riscv/errata/thead/errata.c     | 76 +++++++++++++++++++++++++++-
> >> arch/riscv/include/asm/errata_list.h | 50 +++---------------
> >> 3 files changed, 81 insertions(+), 46 deletions(-)
> >>
> >> diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
> >> index 566bcefeab50..d7972914f9de 100644
> >> --- a/arch/riscv/Kconfig.errata
> >> +++ b/arch/riscv/Kconfig.errata
> >> @@ -78,6 +78,7 @@ config ERRATA_THEAD_CMO
> >> bool "Apply T-Head cache management errata"
> >> depends on ERRATA_THEAD && MMU
> >> select RISCV_DMA_NONCOHERENT
> >> + select RISCV_NONSTANDARD_CACHE_OPS
> >> default y
> >> help
> >>   This will apply the cache management errata to handle the
> >> diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
> >> index 0554ed4bf087..1c320abfe446 100644
> >> --- a/arch/riscv/errata/thead/errata.c
> >> +++ b/arch/riscv/errata/thead/errata.c
> >> @@ -12,8 +12,10 @@
> >> #include <asm/alternative.h>
> >> #include <asm/cacheflush.h>
> >> #include <asm/cpufeature.h>
> >> +#include <asm/dma-noncoherent.h>
> >> #include <asm/errata_list.h>
> >> #include <asm/hwprobe.h>
> >> +#include <asm/io.h>
> >> #include <asm/patch.h>
> >> #include <asm/vendorid_list.h>
> >>
> >> @@ -33,6 +35,75 @@ static bool errata_probe_pbmt(unsigned int stage,
> >> return false;
> >> }
> >>
> >> +/*
> >> + * dcache.ipa rs1 (invalidate, physical address)
> >> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> >> + *   0000001    01010      rs1       000      00000  0001011
> >> + * dache.iva rs1 (invalida, virtual address)
> >> + *   0000001    00110      rs1       000      00000  0001011
> > Remove dache.iva rs1 ...
> >
> >> + *
> >> + * dcache.cpa rs1 (clean, physical address)
> >> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> >> + *   0000001    01001      rs1       000      00000  0001011
> > Remove dcache.cpa rs1 ...
> >
> >> + * dcache.cva rs1 (clean, virtual address)
> >> + *   0000001    00101      rs1       000      00000  0001011
> > Remove dcache.cva rs1 ...
> >
> >> + *
> >> + * dcache.cipa rs1 (clean then invalidate, physical address)
> >> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> >> + *   0000001    01011      rs1       000      00000  0001011
> >> + * dcache.civa rs1 (... virtual address)
> >> + *   0000001    00111      rs1       000      00000  0001011
> > Remove dcache.civa rs1 ...
> >
> >> + *
> >> + * sync.s (make sure all cache operations finished)
> >> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> >> + *   0000000    11001     00000      000      00000  0001011
> >> + */
> >> +#define THEAD_inval_A0 ".long 0x0265000b"
> >> +#define THEAD_clean_A0 ".long 0x0255000b"
> >> +#define THEAD_flush_A0 ".long 0x0275000b"
> >> +#define THEAD_SYNC_S ".long 0x0190000b"
> >> +
> >> +#define THEAD_CMO_OP(_op, _start, _size, _cachesize) \
> >> +asm volatile("mv a0, %1\n\t" \
> >> +      "j 2f\n\t" \
> >> +      "3:\n\t" \
> >> +      THEAD_##_op##_A0 "\n\t" \
> >> +      "add a0, a0, %0\n\t" \
> >> +      "2:\n\t" \
> >> +      "bltu a0, %2, 3b\n\t" \
> >> +      THEAD_SYNC_S \
> >> +      : : "r"(_cachesize), \
> >> +  "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)), \
> >> +  "r"((unsigned long)(_start) + (_size)) \
> >> +      : "a0")
> >> +
> >> +static void thead_errata_cache_inv(phys_addr_t paddr, size_t size)
> >> +{
> >> + void *vaddr = phys_to_virt(paddr);
> > No need to phys_to_virt, and we could use paddr directly (dcache.ipa
> > rs1).
> >
> >> +
> >> + THEAD_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
> >> +}
> >> +
> >> +static void thead_errata_cache_wback(phys_addr_t paddr, size_t size)
> >> +{
> >> + void *vaddr = phys_to_virt(paddr);
> >> +
> >> + THEAD_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
> >> +}
> > Please remove the thead_errata_cache_wback because T-HEAD processors
> > would prioritize using an invalid cacheline instead of evicting an
> > existing cacheline. When we do dcache clean, the following operations
> > are to let other interconnect masters read. So, keeping wback_inv for
> > T-HEAD processors is the best choice, and maybe some other processors'
> > vendor has a different idea, but please use the wback_inv instead of
> > wback_only for the T-HEAD processors.
>
> Unless you can demonstrate that your cores have significantly worse
> performance when using wback instead of wback_inv I do not think the
> non-standard implementation should deviate from the semantics of the
> standard one. There are efforts to unify the implemented semantics of
> the operations across architectures and this would obstruct those.
I'm afraid I have to disagree with the view that this obstructs
"unifying the implemented semantics of the operations across
architectures."

static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
-       .wback = &thead_errata_cache_wback,
+      .wback = &thead_errata_cache_wback_inv,
       .inv = &thead_errata_cache_inv,
       .wback_inv = &thead_errata_cache_wback_inv,

I don't see how the above patch obstructs unifying. On the contrary,
it decreases the custom function, which could help unify. Could you
give the least respect for the vendor's choice?

>
> Jess
>
> >> +
> >> +static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size)
> >> +{
> >> + void *vaddr = phys_to_virt(paddr);
> > Ditto.
> >
> >> +
> >> + THEAD_CMO_OP(flush, vaddr, size, riscv_cbom_block_size);
> >> +}
> >> +
> >> +static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
> >> + .wback = &thead_errata_cache_wback,
> > Please  .wback = &thead_errata_cache_wback_inv,
> >
> >
> >> + .inv = &thead_errata_cache_inv,
> >> + .wback_inv = &thead_errata_cache_wback_inv,
> >> +};
> >> +
> >> static bool errata_probe_cmo(unsigned int stage,
> >>      unsigned long arch_id, unsigned long impid)
> >> {
> >> @@ -48,6 +119,8 @@ static bool errata_probe_cmo(unsigned int stage,
> >> if (stage == RISCV_ALTERNATIVES_BOOT) {
> >> riscv_cbom_block_size = L1_CACHE_BYTES;
> >> riscv_noncoherent_supported();
> >> + if (IS_ENABLED(CONFIG_RISCV_NONSTANDARD_CACHE_OPS))
> >> + riscv_noncoherent_register_cache_ops(&thead_errata_cmo_ops);
> >> }
> >>
> >> return true;
> >> @@ -77,8 +150,7 @@ static u32 thead_errata_probe(unsigned int stage,
> >> if (errata_probe_pbmt(stage, archid, impid))
> >> cpu_req_errata |= BIT(ERRATA_THEAD_PBMT);
> >>
> >> - if (errata_probe_cmo(stage, archid, impid))
> >> - cpu_req_errata |= BIT(ERRATA_THEAD_CMO);
> >> + errata_probe_cmo(stage, archid, impid);
> >>
> >> if (errata_probe_pmu(stage, archid, impid))
> >> cpu_req_errata |= BIT(ERRATA_THEAD_PMU);
> >> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> >> index b55b434f0059..ea33288f8a25 100644
> >> --- a/arch/riscv/include/asm/errata_list.h
> >> +++ b/arch/riscv/include/asm/errata_list.h
> >> @@ -24,9 +24,8 @@
> >>
> >> #ifdef CONFIG_ERRATA_THEAD
> >> #define ERRATA_THEAD_PBMT 0
> >> -#define ERRATA_THEAD_CMO 1
> >> -#define ERRATA_THEAD_PMU 2
> >> -#define ERRATA_THEAD_NUMBER 3
> >> +#define ERRATA_THEAD_PMU 1
> >> +#define ERRATA_THEAD_NUMBER 2
> >> #endif
> >>
> >> #ifdef __ASSEMBLY__
> >> @@ -94,54 +93,17 @@ asm volatile(ALTERNATIVE( \
> >> #define ALT_THEAD_PMA(_val)
> >> #endif
> >>
> >> -/*
> >> - * dcache.ipa rs1 (invalidate, physical address)
> >> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> >> - *   0000001    01010      rs1       000      00000  0001011
> >> - * dache.iva rs1 (invalida, virtual address)
> >> - *   0000001    00110      rs1       000      00000  0001011
> >> - *
> >> - * dcache.cpa rs1 (clean, physical address)
> >> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> >> - *   0000001    01001      rs1       000      00000  0001011
> >> - * dcache.cva rs1 (clean, virtual address)
> >> - *   0000001    00101      rs1       000      00000  0001011
> >> - *
> >> - * dcache.cipa rs1 (clean then invalidate, physical address)
> >> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> >> - *   0000001    01011      rs1       000      00000  0001011
> >> - * dcache.civa rs1 (... virtual address)
> >> - *   0000001    00111      rs1       000      00000  0001011
> >> - *
> >> - * sync.s (make sure all cache operations finished)
> >> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> >> - *   0000000    11001     00000      000      00000  0001011
> >> - */
> >> -#define THEAD_inval_A0 ".long 0x0265000b"
> >> -#define THEAD_clean_A0 ".long 0x0255000b"
> >> -#define THEAD_flush_A0 ".long 0x0275000b"
> >> -#define THEAD_SYNC_S ".long 0x0190000b"
> >> -
> >> #define ALT_CMO_OP(_op, _start, _size, _cachesize) \
> >> -asm volatile(ALTERNATIVE_2( \
> >> - __nops(6), \
> >> +asm volatile(ALTERNATIVE( \
> >> + __nops(5), \
> >> "mv a0, %1\n\t" \
> >> "j 2f\n\t" \
> >> "3:\n\t" \
> >> CBO_##_op(a0) \
> >> "add a0, a0, %0\n\t" \
> >> "2:\n\t" \
> >> - "bltu a0, %2, 3b\n\t" \
> >> - "nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM, \
> >> - "mv a0, %1\n\t" \
> >> - "j 2f\n\t" \
> >> - "3:\n\t" \
> >> - THEAD_##_op##_A0 "\n\t" \
> >> - "add a0, a0, %0\n\t" \
> >> - "2:\n\t" \
> >> - "bltu a0, %2, 3b\n\t" \
> >> - THEAD_SYNC_S, THEAD_VENDOR_ID, \
> >> - ERRATA_THEAD_CMO, CONFIG_ERRATA_THEAD_CMO) \
> >> + "bltu a0, %2, 3b\n\t", \
> >> + 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM) \
> >> : : "r"(_cachesize), \
> >>     "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)), \
> >>     "r"((unsigned long)(_start) + (_size)) \
> >> --
> >> 2.40.1
> >>
> >>
> >> _______________________________________________
> >> linux-riscv mailing list
> >> linux-riscv@lists.infradead.org
> >> http://lists.infradead.org/mailman/listinfo/linux-riscv
> >>
> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
>
>
Emil Renner Berthing Sept. 29, 2023, 10:12 p.m. UTC | #5
Jisheng Zhang wrote:
> Previously, we use alternative mechanism to dynamically patch
> the CMO operations for THEAD C906/C910 during boot for performance
> reason. But as pointed out by Arnd, "there is already a significant
> cost in accessing the invalidated cache lines afterwards, which is
> likely going to be much higher than the cost of an indirect branch".
> And indeed, there's no performance difference with GMAC and EMMC per
> my test on Sipeed Lichee Pi 4A board.
>
> Use riscv_nonstd_cache_ops for THEAD C906/C910 CMO to simplify
> the alternative code, and to acchieve Arnd's goal -- "I think
> moving the THEAD ops at the same level as all nonstandard operations
> makes sense, but I'd still leave CMO as an explicit fast path that
> avoids the indirect branch. This seems like the right thing to do both
> for readability and for platforms on which the indirect branch has a
> noticeable overhead."

Hi Jisheng,

Thank you for doing this! This boots fine on my Nezha board (Allwinner D1)
running btrfs on the sd-card which was quick to find bugs when this code
first went upstream.

Tested-by: Emil Renner Berthing <emil.renner.berthing@canonical.com>

> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> ---
>  arch/riscv/Kconfig.errata            |  1 +
>  arch/riscv/errata/thead/errata.c     | 76 +++++++++++++++++++++++++++-
>  arch/riscv/include/asm/errata_list.h | 50 +++---------------
>  3 files changed, 81 insertions(+), 46 deletions(-)
>
> diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
> index 566bcefeab50..d7972914f9de 100644
> --- a/arch/riscv/Kconfig.errata
> +++ b/arch/riscv/Kconfig.errata
> @@ -78,6 +78,7 @@ config ERRATA_THEAD_CMO
>  	bool "Apply T-Head cache management errata"
>  	depends on ERRATA_THEAD && MMU
>  	select RISCV_DMA_NONCOHERENT
> +	select RISCV_NONSTANDARD_CACHE_OPS
>  	default y
>  	help
>  	  This will apply the cache management errata to handle the
> diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
> index 0554ed4bf087..1c320abfe446 100644
> --- a/arch/riscv/errata/thead/errata.c
> +++ b/arch/riscv/errata/thead/errata.c
> @@ -12,8 +12,10 @@
>  #include <asm/alternative.h>
>  #include <asm/cacheflush.h>
>  #include <asm/cpufeature.h>
> +#include <asm/dma-noncoherent.h>
>  #include <asm/errata_list.h>
>  #include <asm/hwprobe.h>
> +#include <asm/io.h>
>  #include <asm/patch.h>
>  #include <asm/vendorid_list.h>
>
> @@ -33,6 +35,75 @@ static bool errata_probe_pbmt(unsigned int stage,
>  	return false;
>  }
>
> +/*
> + * dcache.ipa rs1 (invalidate, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01010      rs1       000      00000  0001011
> + * dache.iva rs1 (invalida, virtual address)
> + *   0000001    00110      rs1       000      00000  0001011
> + *
> + * dcache.cpa rs1 (clean, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01001      rs1       000      00000  0001011
> + * dcache.cva rs1 (clean, virtual address)
> + *   0000001    00101      rs1       000      00000  0001011
> + *
> + * dcache.cipa rs1 (clean then invalidate, physical address)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000001    01011      rs1       000      00000  0001011
> + * dcache.civa rs1 (... virtual address)
> + *   0000001    00111      rs1       000      00000  0001011
> + *
> + * sync.s (make sure all cache operations finished)
> + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> + *   0000000    11001     00000      000      00000  0001011
> + */
> +#define THEAD_inval_A0	".long 0x0265000b"
> +#define THEAD_clean_A0	".long 0x0255000b"
> +#define THEAD_flush_A0	".long 0x0275000b"
> +#define THEAD_SYNC_S	".long 0x0190000b"
> +
> +#define THEAD_CMO_OP(_op, _start, _size, _cachesize)			\
> +asm volatile("mv a0, %1\n\t"						\
> +	     "j 2f\n\t"							\
> +	     "3:\n\t"							\
> +	     THEAD_##_op##_A0 "\n\t"					\
> +	     "add a0, a0, %0\n\t"					\
> +	     "2:\n\t"							\
> +	     "bltu a0, %2, 3b\n\t"					\
> +	     THEAD_SYNC_S						\
> +	     : : "r"(_cachesize),					\
> +		 "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)),	\
> +		 "r"((unsigned long)(_start) + (_size))			\
> +	     : "a0")
> +
> +static void thead_errata_cache_inv(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
> +
> +	THEAD_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
> +}
> +
> +static void thead_errata_cache_wback(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
> +
> +	THEAD_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
> +}
> +
> +static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size)
> +{
> +	void *vaddr = phys_to_virt(paddr);
> +
> +	THEAD_CMO_OP(flush, vaddr, size, riscv_cbom_block_size);
> +}
> +
> +static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
> +	.wback = &thead_errata_cache_wback,
> +	.inv = &thead_errata_cache_inv,
> +	.wback_inv = &thead_errata_cache_wback_inv,
> +};
> +
>  static bool errata_probe_cmo(unsigned int stage,
>  			     unsigned long arch_id, unsigned long impid)
>  {
> @@ -48,6 +119,8 @@ static bool errata_probe_cmo(unsigned int stage,
>  	if (stage == RISCV_ALTERNATIVES_BOOT) {
>  		riscv_cbom_block_size = L1_CACHE_BYTES;
>  		riscv_noncoherent_supported();
> +		if (IS_ENABLED(CONFIG_RISCV_NONSTANDARD_CACHE_OPS))
> +			riscv_noncoherent_register_cache_ops(&thead_errata_cmo_ops);
>  	}
>
>  	return true;
> @@ -77,8 +150,7 @@ static u32 thead_errata_probe(unsigned int stage,
>  	if (errata_probe_pbmt(stage, archid, impid))
>  		cpu_req_errata |= BIT(ERRATA_THEAD_PBMT);
>
> -	if (errata_probe_cmo(stage, archid, impid))
> -		cpu_req_errata |= BIT(ERRATA_THEAD_CMO);
> +	errata_probe_cmo(stage, archid, impid);
>
>  	if (errata_probe_pmu(stage, archid, impid))
>  		cpu_req_errata |= BIT(ERRATA_THEAD_PMU);
> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> index b55b434f0059..ea33288f8a25 100644
> --- a/arch/riscv/include/asm/errata_list.h
> +++ b/arch/riscv/include/asm/errata_list.h
> @@ -24,9 +24,8 @@
>
>  #ifdef CONFIG_ERRATA_THEAD
>  #define	ERRATA_THEAD_PBMT 0
> -#define	ERRATA_THEAD_CMO 1
> -#define	ERRATA_THEAD_PMU 2
> -#define	ERRATA_THEAD_NUMBER 3
> +#define	ERRATA_THEAD_PMU 1
> +#define	ERRATA_THEAD_NUMBER 2
>  #endif
>
>  #ifdef __ASSEMBLY__
> @@ -94,54 +93,17 @@ asm volatile(ALTERNATIVE(						\
>  #define ALT_THEAD_PMA(_val)
>  #endif
>
> -/*
> - * dcache.ipa rs1 (invalidate, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01010      rs1       000      00000  0001011
> - * dache.iva rs1 (invalida, virtual address)
> - *   0000001    00110      rs1       000      00000  0001011
> - *
> - * dcache.cpa rs1 (clean, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01001      rs1       000      00000  0001011
> - * dcache.cva rs1 (clean, virtual address)
> - *   0000001    00101      rs1       000      00000  0001011
> - *
> - * dcache.cipa rs1 (clean then invalidate, physical address)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000001    01011      rs1       000      00000  0001011
> - * dcache.civa rs1 (... virtual address)
> - *   0000001    00111      rs1       000      00000  0001011
> - *
> - * sync.s (make sure all cache operations finished)
> - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
> - *   0000000    11001     00000      000      00000  0001011
> - */
> -#define THEAD_inval_A0	".long 0x0265000b"
> -#define THEAD_clean_A0	".long 0x0255000b"
> -#define THEAD_flush_A0	".long 0x0275000b"
> -#define THEAD_SYNC_S	".long 0x0190000b"
> -
>  #define ALT_CMO_OP(_op, _start, _size, _cachesize)			\
> -asm volatile(ALTERNATIVE_2(						\
> -	__nops(6),							\
> +asm volatile(ALTERNATIVE(						\
> +	__nops(5),							\
>  	"mv a0, %1\n\t"							\
>  	"j 2f\n\t"							\
>  	"3:\n\t"							\
>  	CBO_##_op(a0)							\
>  	"add a0, a0, %0\n\t"						\
>  	"2:\n\t"							\
> -	"bltu a0, %2, 3b\n\t"						\
> -	"nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM,	\
> -	"mv a0, %1\n\t"							\
> -	"j 2f\n\t"							\
> -	"3:\n\t"							\
> -	THEAD_##_op##_A0 "\n\t"						\
> -	"add a0, a0, %0\n\t"						\
> -	"2:\n\t"							\
> -	"bltu a0, %2, 3b\n\t"						\
> -	THEAD_SYNC_S, THEAD_VENDOR_ID,					\
> -			ERRATA_THEAD_CMO, CONFIG_ERRATA_THEAD_CMO)	\
> +	"bltu a0, %2, 3b\n\t",						\
> +	0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM)		\
>  	: : "r"(_cachesize),						\
>  	    "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)),	\
>  	    "r"((unsigned long)(_start) + (_size))			\
> --
> 2.40.1
>
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Arnd Bergmann Oct. 2, 2023, 12:15 p.m. UTC | #6
On Wed, Sep 13, 2023, at 02:06, Guo Ren wrote:
> On Wed, Sep 13, 2023 at 3:00 AM Jessica Clarke <jrtc27@jrtc27.com> wrote:
>>
>> On 12 Sep 2023, at 11:53, Guo Ren <guoren@kernel.org> wrote:
>> > Please remove the thead_errata_cache_wback because T-HEAD processors
>> > would prioritize using an invalid cacheline instead of evicting an
>> > existing cacheline. When we do dcache clean, the following operations
>> > are to let other interconnect masters read. So, keeping wback_inv for
>> > T-HEAD processors is the best choice, and maybe some other processors'
>> > vendor has a different idea, but please use the wback_inv instead of
>> > wback_only for the T-HEAD processors.
>>
>> Unless you can demonstrate that your cores have significantly worse
>> performance when using wback instead of wback_inv I do not think the
>> non-standard implementation should deviate from the semantics of the
>> standard one. There are efforts to unify the implemented semantics of
>> the operations across architectures and this would obstruct those.
>
> I'm afraid I have to disagree with the view that this obstructs
> "unifying the implemented semantics of the operations across
> architectures."
>
> static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
> -       .wback = &thead_errata_cache_wback,
> +      .wback = &thead_errata_cache_wback_inv,
>        .inv = &thead_errata_cache_inv,
>        .wback_inv = &thead_errata_cache_wback_inv,
>
> I don't see how the above patch obstructs unifying. On the contrary,
> it decreases the custom function, which could help unify. Could you
> give the least respect for the vendor's choice?

Since the email thread popped up after the latest replies, I saw
that I had not replied yet. I agree with Jessica here: we need to
ensure that the callback functions do what the interface requires,
across all architectures and CPU implementations. The choice to
call wback or wback_inv is a matter of optimization in the caller,
and we may well end up calling .wback_inv() for cases that currently
rely on ,wback(), if we can show this to be faster on certain
CPUs, and other cases (e.g. repeatedly writing and flushing
individual bytes to single DMA buffer) clearly rely on keeping
the cache line.

Other policy questions like whether to use .wback or .inv before
a DMA from device also still need a better answer, which we must
decide globally rather than per CPU implementation.

      Arnd
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
index 566bcefeab50..d7972914f9de 100644
--- a/arch/riscv/Kconfig.errata
+++ b/arch/riscv/Kconfig.errata
@@ -78,6 +78,7 @@  config ERRATA_THEAD_CMO
 	bool "Apply T-Head cache management errata"
 	depends on ERRATA_THEAD && MMU
 	select RISCV_DMA_NONCOHERENT
+	select RISCV_NONSTANDARD_CACHE_OPS
 	default y
 	help
 	  This will apply the cache management errata to handle the
diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
index 0554ed4bf087..1c320abfe446 100644
--- a/arch/riscv/errata/thead/errata.c
+++ b/arch/riscv/errata/thead/errata.c
@@ -12,8 +12,10 @@ 
 #include <asm/alternative.h>
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
+#include <asm/dma-noncoherent.h>
 #include <asm/errata_list.h>
 #include <asm/hwprobe.h>
+#include <asm/io.h>
 #include <asm/patch.h>
 #include <asm/vendorid_list.h>
 
@@ -33,6 +35,75 @@  static bool errata_probe_pbmt(unsigned int stage,
 	return false;
 }
 
+/*
+ * dcache.ipa rs1 (invalidate, physical address)
+ * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
+ *   0000001    01010      rs1       000      00000  0001011
+ * dache.iva rs1 (invalida, virtual address)
+ *   0000001    00110      rs1       000      00000  0001011
+ *
+ * dcache.cpa rs1 (clean, physical address)
+ * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
+ *   0000001    01001      rs1       000      00000  0001011
+ * dcache.cva rs1 (clean, virtual address)
+ *   0000001    00101      rs1       000      00000  0001011
+ *
+ * dcache.cipa rs1 (clean then invalidate, physical address)
+ * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
+ *   0000001    01011      rs1       000      00000  0001011
+ * dcache.civa rs1 (... virtual address)
+ *   0000001    00111      rs1       000      00000  0001011
+ *
+ * sync.s (make sure all cache operations finished)
+ * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
+ *   0000000    11001     00000      000      00000  0001011
+ */
+#define THEAD_inval_A0	".long 0x0265000b"
+#define THEAD_clean_A0	".long 0x0255000b"
+#define THEAD_flush_A0	".long 0x0275000b"
+#define THEAD_SYNC_S	".long 0x0190000b"
+
+#define THEAD_CMO_OP(_op, _start, _size, _cachesize)			\
+asm volatile("mv a0, %1\n\t"						\
+	     "j 2f\n\t"							\
+	     "3:\n\t"							\
+	     THEAD_##_op##_A0 "\n\t"					\
+	     "add a0, a0, %0\n\t"					\
+	     "2:\n\t"							\
+	     "bltu a0, %2, 3b\n\t"					\
+	     THEAD_SYNC_S						\
+	     : : "r"(_cachesize),					\
+		 "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)),	\
+		 "r"((unsigned long)(_start) + (_size))			\
+	     : "a0")
+
+static void thead_errata_cache_inv(phys_addr_t paddr, size_t size)
+{
+	void *vaddr = phys_to_virt(paddr);
+
+	THEAD_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
+}
+
+static void thead_errata_cache_wback(phys_addr_t paddr, size_t size)
+{
+	void *vaddr = phys_to_virt(paddr);
+
+	THEAD_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
+}
+
+static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size)
+{
+	void *vaddr = phys_to_virt(paddr);
+
+	THEAD_CMO_OP(flush, vaddr, size, riscv_cbom_block_size);
+}
+
+static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = {
+	.wback = &thead_errata_cache_wback,
+	.inv = &thead_errata_cache_inv,
+	.wback_inv = &thead_errata_cache_wback_inv,
+};
+
 static bool errata_probe_cmo(unsigned int stage,
 			     unsigned long arch_id, unsigned long impid)
 {
@@ -48,6 +119,8 @@  static bool errata_probe_cmo(unsigned int stage,
 	if (stage == RISCV_ALTERNATIVES_BOOT) {
 		riscv_cbom_block_size = L1_CACHE_BYTES;
 		riscv_noncoherent_supported();
+		if (IS_ENABLED(CONFIG_RISCV_NONSTANDARD_CACHE_OPS))
+			riscv_noncoherent_register_cache_ops(&thead_errata_cmo_ops);
 	}
 
 	return true;
@@ -77,8 +150,7 @@  static u32 thead_errata_probe(unsigned int stage,
 	if (errata_probe_pbmt(stage, archid, impid))
 		cpu_req_errata |= BIT(ERRATA_THEAD_PBMT);
 
-	if (errata_probe_cmo(stage, archid, impid))
-		cpu_req_errata |= BIT(ERRATA_THEAD_CMO);
+	errata_probe_cmo(stage, archid, impid);
 
 	if (errata_probe_pmu(stage, archid, impid))
 		cpu_req_errata |= BIT(ERRATA_THEAD_PMU);
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
index b55b434f0059..ea33288f8a25 100644
--- a/arch/riscv/include/asm/errata_list.h
+++ b/arch/riscv/include/asm/errata_list.h
@@ -24,9 +24,8 @@ 
 
 #ifdef CONFIG_ERRATA_THEAD
 #define	ERRATA_THEAD_PBMT 0
-#define	ERRATA_THEAD_CMO 1
-#define	ERRATA_THEAD_PMU 2
-#define	ERRATA_THEAD_NUMBER 3
+#define	ERRATA_THEAD_PMU 1
+#define	ERRATA_THEAD_NUMBER 2
 #endif
 
 #ifdef __ASSEMBLY__
@@ -94,54 +93,17 @@  asm volatile(ALTERNATIVE(						\
 #define ALT_THEAD_PMA(_val)
 #endif
 
-/*
- * dcache.ipa rs1 (invalidate, physical address)
- * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
- *   0000001    01010      rs1       000      00000  0001011
- * dache.iva rs1 (invalida, virtual address)
- *   0000001    00110      rs1       000      00000  0001011
- *
- * dcache.cpa rs1 (clean, physical address)
- * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
- *   0000001    01001      rs1       000      00000  0001011
- * dcache.cva rs1 (clean, virtual address)
- *   0000001    00101      rs1       000      00000  0001011
- *
- * dcache.cipa rs1 (clean then invalidate, physical address)
- * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
- *   0000001    01011      rs1       000      00000  0001011
- * dcache.civa rs1 (... virtual address)
- *   0000001    00111      rs1       000      00000  0001011
- *
- * sync.s (make sure all cache operations finished)
- * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 |
- *   0000000    11001     00000      000      00000  0001011
- */
-#define THEAD_inval_A0	".long 0x0265000b"
-#define THEAD_clean_A0	".long 0x0255000b"
-#define THEAD_flush_A0	".long 0x0275000b"
-#define THEAD_SYNC_S	".long 0x0190000b"
-
 #define ALT_CMO_OP(_op, _start, _size, _cachesize)			\
-asm volatile(ALTERNATIVE_2(						\
-	__nops(6),							\
+asm volatile(ALTERNATIVE(						\
+	__nops(5),							\
 	"mv a0, %1\n\t"							\
 	"j 2f\n\t"							\
 	"3:\n\t"							\
 	CBO_##_op(a0)							\
 	"add a0, a0, %0\n\t"						\
 	"2:\n\t"							\
-	"bltu a0, %2, 3b\n\t"						\
-	"nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM,	\
-	"mv a0, %1\n\t"							\
-	"j 2f\n\t"							\
-	"3:\n\t"							\
-	THEAD_##_op##_A0 "\n\t"						\
-	"add a0, a0, %0\n\t"						\
-	"2:\n\t"							\
-	"bltu a0, %2, 3b\n\t"						\
-	THEAD_SYNC_S, THEAD_VENDOR_ID,					\
-			ERRATA_THEAD_CMO, CONFIG_ERRATA_THEAD_CMO)	\
+	"bltu a0, %2, 3b\n\t",						\
+	0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM)		\
 	: : "r"(_cachesize),						\
 	    "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)),	\
 	    "r"((unsigned long)(_start) + (_size))			\