Message ID | 1518829066-3558-1-git-send-email-shankerd@codeaurora.org (mailing list archive)
---|---
State | New, archived
On Fri, Feb 16, 2018 at 06:57:46PM -0600, Shanker Donthineni wrote:
> Two point of unification cache maintenance operations 'DC CVAU' and
> 'IC IVAU' are optional for implementors as per ARMv8 specification.
> This patch parses the updated CTR_EL0 register definition and adds
> the required changes to skip POU operations if the hardware reports
> CTR_EL0.IDC and/or CTR_EL0.DIC.
>
> CTR_EL0.DIC: Instruction cache invalidation requirements for
> instruction to data coherence. The meaning of this bit [29]:
>   0: Instruction cache invalidation to the point of unification
>      is required for instruction to data coherence.
>   1: Instruction cache invalidation to the point of unification is
>      not required for instruction to data coherence.
>
> CTR_EL0.IDC: Data cache clean requirements for instruction to data
> coherence. The meaning of this bit [28]:
>   0: Data cache clean to the point of unification is required for
>      instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
>      or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
>   1: Data cache clean to the point of unification is not required
>      for instruction to data coherence.

There is a difference between cache maintenance to PoU "is not required"
and the actual instructions being optional (i.e. undef when executed).
If your caches are transparent and DC CVAU/IC IVAU is not required,
these instructions should behave as NOPs. So, are you trying to improve
the performance of the cache maintenance routines in the kernel? If yes,
please show some (relative) numbers and a better description in the
commit log.

On the patch, I'd rather have an alternative framework entry for no VAU
cache maint required and some ret instruction at the beginning of the
cache maint function rather than jumping out of the loop somewhere
inside the cache maintenance code, penalising the CPUs that do require
it.
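For reference, the "early return via the alternatives framework" approach described above might look roughly like the sketch below. It assumes a hypothetical ARM64_HAS_CACHE_IDC capability (no such capability exists in the posted patch) and is only an illustration, not the actual implementation:

	/*
	 * Illustrative sketch only: patch in an early return when the
	 * (assumed) ARM64_HAS_CACHE_IDC capability is set, so CPUs that
	 * do need D-cache cleaning to PoU see nothing here but NOPs.
	 */
ENTRY(__clean_dcache_area_pou)
alternative_if ARM64_HAS_CACHE_IDC
	dsb	ishst			// still order the prior stores
	ret
alternative_else_nop_endif
	dcache_by_line_op cvau, ish, x0, x1, x2, x3
	ret
ENDPROC(__clean_dcache_area_pou)

Something along these lines leaves the line-by-line loop untouched, so CPUs that do require the maintenance only ever execute a couple of NOPs for the check.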
Hi Shanker,

On Fri, Feb 16, 2018 at 06:57:46PM -0600, Shanker Donthineni wrote:
> Two point of unification cache maintenance operations 'DC CVAU' and
> 'IC IVAU' are optional for implementors as per ARMv8 specification.
> This patch parses the updated CTR_EL0 register definition and adds
> the required changes to skip POU operations if the hardware reports
> CTR_EL0.IDC and/or CTR_EL0.DIC.
>
> CTR_EL0.DIC: Instruction cache invalidation requirements for
> instruction to data coherence. The meaning of this bit [29]:
>   0: Instruction cache invalidation to the point of unification
>      is required for instruction to data coherence.
>   1: Instruction cache invalidation to the point of unification is
>      not required for instruction to data coherence.
>
> CTR_EL0.IDC: Data cache clean requirements for instruction to data
> coherence. The meaning of this bit [28]:
>   0: Data cache clean to the point of unification is required for
>      instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
>      or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
>   1: Data cache clean to the point of unification is not required
>      for instruction to data coherence.
>
> Signed-off-by: Philip Elcan <pelcan@codeaurora.org>
> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
> ---
>  arch/arm64/include/asm/assembler.h | 48 ++++++++++++++++++++++++--------------
>  arch/arm64/include/asm/cache.h     |  2 ++
>  arch/arm64/kernel/cpufeature.c     |  2 ++
>  arch/arm64/mm/cache.S              | 26 ++++++++++++++-------
>  4 files changed, 51 insertions(+), 27 deletions(-)

I was looking at our CTR_EL0 code last week but forgot to post the patch I
wrote fixing up some of the fields. I just sent it now, so please can
you rebase on top of:

http://lists.infradead.org/pipermail/linux-arm-kernel/2018-February/560488.html

Also:

> diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> index ea9bb4e..aea533b 100644
> --- a/arch/arm64/include/asm/cache.h
> +++ b/arch/arm64/include/asm/cache.h
> @@ -22,6 +22,8 @@
>  #define CTR_L1IP_MASK		3
>  #define CTR_CWG_SHIFT		24
>  #define CTR_CWG_MASK		15
> +#define CTR_IDC_SHIFT		28
> +#define CTR_DIC_SHIFT		29
>
>  #define CTR_L1IP(ctr)		(((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)
>
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 29b1f87..f42bb5a 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -200,6 +200,8 @@ static int __init register_cpu_hwcaps_dumper(void)
>
>  static const struct arm64_ftr_bits ftr_ctr[] = {
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1),		/* RAO */
> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 0),	/* DIC */
> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 0),	/* IDC */
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, 24, 4, 0),	/* CWG */
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0),	/* ERG */
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 1),	/* DminLine */

Could you update the other table entries here to use the CTR_*_SHIFT values
as well?

Thanks,

Will
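For illustration, the request above might end up looking roughly like the sketch below. CTR_ERG_SHIFT and CTR_DMINLINE_SHIFT are assumed new defines here (only CTR_CWG_SHIFT already exists in cache.h), and the final names and values are of course up to the rebased v2:

/* Assumed additions to arch/arm64/include/asm/cache.h */
#define CTR_ERG_SHIFT		20
#define CTR_DMINLINE_SHIFT	16

/* ftr_ctr[] rewritten with named shifts instead of magic numbers */
static const struct arm64_ftr_bits ftr_ctr[] = {
	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1),				/* RAO */
	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 0),		/* DIC */
	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 0),		/* IDC */
	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, CTR_CWG_SHIFT, 4, 0),		/* CWG */
	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_ERG_SHIFT, 4, 0),		/* ERG */
	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DMINLINE_SHIFT, 4, 1),	/* DminLine */
	/* remaining entries unchanged */
};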
Hi Catalin,

On 02/19/2018 08:38 AM, Catalin Marinas wrote:
> On Fri, Feb 16, 2018 at 06:57:46PM -0600, Shanker Donthineni wrote:
>> Two point of unification cache maintenance operations 'DC CVAU' and
>> 'IC IVAU' are optional for implementors as per ARMv8 specification.
>> This patch parses the updated CTR_EL0 register definition and adds
>> the required changes to skip POU operations if the hardware reports
>> CTR_EL0.IDC and/or CTR_EL0.DIC.
>>
>> CTR_EL0.DIC: Instruction cache invalidation requirements for
>> instruction to data coherence. The meaning of this bit [29]:
>>   0: Instruction cache invalidation to the point of unification
>>      is required for instruction to data coherence.
>>   1: Instruction cache invalidation to the point of unification is
>>      not required for instruction to data coherence.
>>
>> CTR_EL0.IDC: Data cache clean requirements for instruction to data
>> coherence. The meaning of this bit [28]:
>>   0: Data cache clean to the point of unification is required for
>>      instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
>>      or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
>>   1: Data cache clean to the point of unification is not required
>>      for instruction to data coherence.
>
> There is a difference between cache maintenance to PoU "is not required"
> and the actual instructions being optional (i.e. undef when executed).
> If your caches are transparent and DC CVAU/IC IVAU is not required,
> these instructions should behave as NOPs. So, are you trying to improve
> the performance of the cache maintenance routines in the kernel? If yes,
> please show some (relative) numbers and a better description in the
> commit log.
>

Yes, I agree with you: the PoU instructions are NOPs if the caches are
transparent, so there is no issue from a correctness point of view. But
they cause unnecessary overhead in the asm routines, which walk the VA
range in cache-line-size increments. This overhead is noticeable with
64K pages, especially with section mappings. I'll reword the commit text
to reflect your comments in the v2 patch.

e.g. for a 512M section with a 64K PAGE_SIZE kernel, assuming a 64-byte
cache line size, flush_icache_range() consumes around 256M CPU cycles:

  I-cache loop overhead: 512MB / 64 bytes * 4 instructions per iteration
  D-cache loop overhead: 512MB / 64 bytes * 4 instructions per iteration

With this patch it takes less than ~1K cycles.

> On the patch, I'd rather have an alternative framework entry for no VAU
> cache maint required and some ret instruction at the beginning of the
> cache maint function rather than jumping out of the loop somewhere
> inside the cache maintenance code, penalising the CPUs that do require
> it.
>

The alternative framework might break things in the case of CPU hotplug.
I need one more confirmation from you on incorporating the alternative
framework.
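As a standalone sanity check of the arithmetic above (the 512M region, 64-byte line size and 4-instructions-per-iteration figures are taken from the example; everything else is illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long region_bytes = 512UL << 20;	/* 512 MiB section */
	unsigned long line_bytes   = 64;		/* assumed cache line size */
	unsigned long insns_per_it = 4;			/* dc/ic, add, cmp, b.lo */

	unsigned long lines    = region_bytes / line_bytes;	/* 8388608 lines */
	unsigned long per_loop = lines * insns_per_it;		/* ~33.5M per loop */

	/* one D-cache clean loop plus one I-cache invalidate loop */
	printf("%lu lines, ~%lu loop instructions in total\n", lines, 2 * per_loop);
	return 0;
}

That works out to roughly 67 million instructions of pure loop overhead for a single 512 MiB flush, which is in the same ballpark as the ~256M-cycle figure above once the cost of the DC/IC operations themselves is added on top.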
Hi Will,

On 02/19/2018 08:43 AM, Will Deacon wrote:
> Hi Shanker,
>
> On Fri, Feb 16, 2018 at 06:57:46PM -0600, Shanker Donthineni wrote:
>> Two point of unification cache maintenance operations 'DC CVAU' and
>> 'IC IVAU' are optional for implementors as per ARMv8 specification.
>> This patch parses the updated CTR_EL0 register definition and adds
>> the required changes to skip POU operations if the hardware reports
>> CTR_EL0.IDC and/or CTR_EL0.DIC.
>>
>> CTR_EL0.DIC: Instruction cache invalidation requirements for
>> instruction to data coherence. The meaning of this bit [29]:
>>   0: Instruction cache invalidation to the point of unification
>>      is required for instruction to data coherence.
>>   1: Instruction cache invalidation to the point of unification is
>>      not required for instruction to data coherence.
>>
>> CTR_EL0.IDC: Data cache clean requirements for instruction to data
>> coherence. The meaning of this bit [28]:
>>   0: Data cache clean to the point of unification is required for
>>      instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
>>      or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
>>   1: Data cache clean to the point of unification is not required
>>      for instruction to data coherence.
>>
>> Signed-off-by: Philip Elcan <pelcan@codeaurora.org>
>> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
>> ---
>>  arch/arm64/include/asm/assembler.h | 48 ++++++++++++++++++++++++--------------
>>  arch/arm64/include/asm/cache.h     |  2 ++
>>  arch/arm64/kernel/cpufeature.c     |  2 ++
>>  arch/arm64/mm/cache.S              | 26 ++++++++++++++-------
>>  4 files changed, 51 insertions(+), 27 deletions(-)
>
> I was looking at our CTR_EL0 code last week but forgot to post the patch I
> wrote fixing up some of the fields. I just sent it now, so please can
> you rebase on top of:
>
> http://lists.infradead.org/pipermail/linux-arm-kernel/2018-February/560488.html
>
> Also:
>
>> diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
>> index ea9bb4e..aea533b 100644
>> --- a/arch/arm64/include/asm/cache.h
>> +++ b/arch/arm64/include/asm/cache.h
>> @@ -22,6 +22,8 @@
>>  #define CTR_L1IP_MASK		3
>>  #define CTR_CWG_SHIFT		24
>>  #define CTR_CWG_MASK		15
>> +#define CTR_IDC_SHIFT		28
>> +#define CTR_DIC_SHIFT		29
>>
>>  #define CTR_L1IP(ctr)		(((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)
>>
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index 29b1f87..f42bb5a 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -200,6 +200,8 @@ static int __init register_cpu_hwcaps_dumper(void)
>>
>>  static const struct arm64_ftr_bits ftr_ctr[] = {
>>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1),		/* RAO */
>> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 0),	/* DIC */
>> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 0),	/* IDC */
>>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, 24, 4, 0),	/* CWG */
>>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0),	/* ERG */
>>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 1),	/* DminLine */
>
> Could you update the other table entries here to use the CTR_*_SHIFT values
> as well?
>

I'll do that.

> Thanks,
>
> Will
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
On Mon, Feb 19, 2018 at 10:35:30AM -0600, Shanker Donthineni wrote:
> On 02/19/2018 08:38 AM, Catalin Marinas wrote:
> > On the patch, I'd rather have an alternative framework entry for no VAU
> > cache maint required and some ret instruction at the beginning of the
> > cache maint function rather than jumping out of the loop somewhere
> > inside the cache maintenance code, penalising the CPUs that do require
> > it.
>
> The alternative framework might break things in the case of CPU hotplug.
> I need one more confirmation from you on incorporating the alternative
> framework.

CPU hotplug can be an issue but it should be handled like other similar
cases: if a CPU comes online late and its features are incompatible, it
should not be brought online. The cpufeature code handles this.

With Will's patch for CTR_EL0, we handle different CPU features during
boot, defaulting to the lowest value for the IDC/DIC bits.

I suggest you add new ARM64_HAS_* feature bits and enable them based on
CTR_EL0.IDC and DIC. You could check for both being 1 with a single
feature bit, but I guess an implementation is allowed to have these
different (e.g. DIC == 0 and IDC == 1).
Thanks Catalin for your comments.

On 02/19/2018 11:18 AM, Catalin Marinas wrote:
> On Mon, Feb 19, 2018 at 10:35:30AM -0600, Shanker Donthineni wrote:
>> On 02/19/2018 08:38 AM, Catalin Marinas wrote:
>>> On the patch, I'd rather have an alternative framework entry for no VAU
>>> cache maint required and some ret instruction at the beginning of the
>>> cache maint function rather than jumping out of the loop somewhere
>>> inside the cache maintenance code, penalising the CPUs that do require
>>> it.
>>
>> The alternative framework might break things in the case of CPU hotplug.
>> I need one more confirmation from you on incorporating the alternative
>> framework.
>
> CPU hotplug can be an issue but it should be handled like other similar
> cases: if a CPU comes online late and its features are incompatible, it
> should not be brought online. The cpufeature code handles this.
>
> With Will's patch for CTR_EL0, we handle different CPU features during
> boot, defaulting to the lowest value for the IDC/DIC bits.
>
> I suggest you add new ARM64_HAS_* feature bits and enable them based on
> CTR_EL0.IDC and DIC. You could check for both being 1 with a single
> feature bit, but I guess an implementation is allowed to have these
> different (e.g. DIC == 0 and IDC == 1).
>

I'll add two new features, ARM64_HAS_DIC and ARM64_HAS_IDC, to support
all implementations. Unfortunately, QCOM server chips support IDC but
not DIC.
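For illustration only, such cpufeature entries might look roughly like the sketch below; the ARM64_HAS_IDC/ARM64_HAS_DIC capability numbers, the matcher names and the .desc strings are all assumptions at this point, not something taken from the posted patch:

static bool has_cache_idc(const struct arm64_cpu_capabilities *entry,
			  int scope)
{
	/* IDC: D-cache clean to PoU not needed for I/D coherence */
	return read_sanitised_ftr_reg(SYS_CTR_EL0) & BIT(CTR_IDC_SHIFT);
}

static bool has_cache_dic(const struct arm64_cpu_capabilities *entry,
			  int scope)
{
	/* DIC: I-cache invalidation to PoU not needed for I/D coherence */
	return read_sanitised_ftr_reg(SYS_CTR_EL0) & BIT(CTR_DIC_SHIFT);
}

static const struct arm64_cpu_capabilities arm64_features[] = {
	/* ... existing entries ... */
	{
		.desc = "D-cache clean to PoU not required for I/D coherence",
		.capability = ARM64_HAS_IDC,		/* assumed new capability */
		.def_scope = SCOPE_SYSTEM,
		.matches = has_cache_idc,
	},
	{
		.desc = "I-cache invalidation not required for I/D coherence",
		.capability = ARM64_HAS_DIC,		/* assumed new capability */
		.def_scope = SCOPE_SYSTEM,
		.matches = has_cache_dic,
	},
	{},
};

Using the sanitised CTR_EL0 value means late-onlined CPUs that disagree with the boot-time decision are caught by the existing cpufeature machinery, which addresses the hotplug concern raised earlier in the thread.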
Hi Shanker,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc2 next-20180219]
[cannot apply to arm64/for-next/core]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Shanker-Donthineni/arm64-Add-support-for-new-control-bits-CTR_EL0-IDC-and-CTR_EL0-IDC/20180219-031155
config: arm64-defconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=arm64

All errors (new ones prefixed by >>):

   arch/arm64/kernel/hibernate-asm.S: Assembler messages:
>> arch/arm64/kernel/hibernate-asm.S:101: Error: unexpected comma after the mnemonic name `mrs' -- `mrs ,ctr_el0'
>> arch/arm64/kernel/hibernate-asm.S:101: Error: operand 2 must be an integer register -- `ubfm x3,,#16,#19'
--
   arch/arm64/kernel/relocate_kernel.S: Assembler messages:
>> arch/arm64/kernel/relocate_kernel.S:37: Error: unexpected comma after the mnemonic name `mrs' -- `mrs ,ctr_el0'
>> arch/arm64/kernel/relocate_kernel.S:37: Error: operand 2 must be an integer register -- `ubfm x0,,#16,#19'

vim +101 arch/arm64/kernel/hibernate-asm.S

82869ac57 James Morse         2016-04-27   46  
82869ac57 James Morse         2016-04-27   47  
82869ac57 James Morse         2016-04-27   48  /*
82869ac57 James Morse         2016-04-27   49   * Resume from hibernate
82869ac57 James Morse         2016-04-27   50   *
82869ac57 James Morse         2016-04-27   51   * Loads temporary page tables then restores the memory image.
82869ac57 James Morse         2016-04-27   52   * Finally branches to cpu_resume() to restore the state saved by
82869ac57 James Morse         2016-04-27   53   * swsusp_arch_suspend().
82869ac57 James Morse         2016-04-27   54   *
82869ac57 James Morse         2016-04-27   55   * Because this code has to be copied to a 'safe' page, it can't call out to
82869ac57 James Morse         2016-04-27   56   * other functions by PC-relative address. Also remember that it may be
82869ac57 James Morse         2016-04-27   57   * mid-way through over-writing other functions. For this reason it contains
82869ac57 James Morse         2016-04-27   58   * code from flush_icache_range() and uses the copy_page() macro.
82869ac57 James Morse         2016-04-27   59   *
82869ac57 James Morse         2016-04-27   60   * This 'safe' page is mapped via ttbr0, and executed from there. This function
82869ac57 James Morse         2016-04-27   61   * switches to a copy of the linear map in ttbr1, performs the restore, then
82869ac57 James Morse         2016-04-27   62   * switches ttbr1 to the original kernel's swapper_pg_dir.
82869ac57 James Morse         2016-04-27   63   *
82869ac57 James Morse         2016-04-27   64   * All of memory gets written to, including code. We need to clean the kernel
82869ac57 James Morse         2016-04-27   65   * text to the Point of Coherence (PoC) before secondary cores can be booted.
82869ac57 James Morse         2016-04-27   66   * Because the kernel modules and executable pages mapped to user space are
82869ac57 James Morse         2016-04-27   67   * also written as data, we clean all pages we touch to the Point of
82869ac57 James Morse         2016-04-27   68   * Unification (PoU).
82869ac57 James Morse         2016-04-27   69   *
82869ac57 James Morse         2016-04-27   70   * x0: physical address of temporary page tables
82869ac57 James Morse         2016-04-27   71   * x1: physical address of swapper page tables
82869ac57 James Morse         2016-04-27   72   * x2: address of cpu_resume
82869ac57 James Morse         2016-04-27   73   * x3: linear map address of restore_pblist in the current kernel
82869ac57 James Morse         2016-04-27   74   * x4: physical address of __hyp_stub_vectors, or 0
82869ac57 James Morse         2016-04-27   75   * x5: physical address of a zero page that remains zero after resume
82869ac57 James Morse         2016-04-27   76   */
82869ac57 James Morse         2016-04-27   77  .pushsection    ".hibernate_exit.text", "ax"
82869ac57 James Morse         2016-04-27   78  ENTRY(swsusp_arch_suspend_exit)
82869ac57 James Morse         2016-04-27   79  	/*
82869ac57 James Morse         2016-04-27   80  	 * We execute from ttbr0, change ttbr1 to our copied linear map tables
82869ac57 James Morse         2016-04-27   81  	 * with a break-before-make via the zero page
82869ac57 James Morse         2016-04-27   82  	 */
529c4b05a Kristina Martsenko  2017-12-13   83  	break_before_make_ttbr_switch	x5, x0, x6
82869ac57 James Morse         2016-04-27   84  
82869ac57 James Morse         2016-04-27   85  	mov	x21, x1
82869ac57 James Morse         2016-04-27   86  	mov	x30, x2
82869ac57 James Morse         2016-04-27   87  	mov	x24, x4
82869ac57 James Morse         2016-04-27   88  	mov	x25, x5
82869ac57 James Morse         2016-04-27   89  
82869ac57 James Morse         2016-04-27   90  	/* walk the restore_pblist and use copy_page() to over-write memory */
82869ac57 James Morse         2016-04-27   91  	mov	x19, x3
82869ac57 James Morse         2016-04-27   92  
82869ac57 James Morse         2016-04-27   93  1:	ldr	x10, [x19, #HIBERN_PBE_ORIG]
82869ac57 James Morse         2016-04-27   94  	mov	x0, x10
82869ac57 James Morse         2016-04-27   95  	ldr	x1, [x19, #HIBERN_PBE_ADDR]
82869ac57 James Morse         2016-04-27   96  
82869ac57 James Morse         2016-04-27   97  	copy_page	x0, x1, x2, x3, x4, x5, x6, x7, x8, x9
82869ac57 James Morse         2016-04-27   98  
82869ac57 James Morse         2016-04-27   99  	add	x1, x10, #PAGE_SIZE
82869ac57 James Morse         2016-04-27  100  	/* Clean the copied page to PoU - based on flush_icache_range() */
072f0a633 Suzuki K Poulose    2016-09-09  @101  	raw_dcache_line_size x2, x3
82869ac57 James Morse         2016-04-27  102  	sub	x3, x2, #1
82869ac57 James Morse         2016-04-27  103  	bic	x4, x10, x3
82869ac57 James Morse         2016-04-27  104  2:	dc	cvau, x4	/* clean D line / unified line */
82869ac57 James Morse         2016-04-27  105  	add	x4, x4, x2
82869ac57 James Morse         2016-04-27  106  	cmp	x4, x1
82869ac57 James Morse         2016-04-27  107  	b.lo	2b
82869ac57 James Morse         2016-04-27  108  
82869ac57 James Morse         2016-04-27  109  	ldr	x19, [x19, #HIBERN_PBE_NEXT]
82869ac57 James Morse         2016-04-27  110  	cbnz	x19, 1b
82869ac57 James Morse         2016-04-27  111  	dsb	ish	/* wait for PoU cleaning to finish */
82869ac57 James Morse         2016-04-27  112  
82869ac57 James Morse         2016-04-27  113  	/* switch to the restored kernels page tables */
529c4b05a Kristina Martsenko  2017-12-13  114  	break_before_make_ttbr_switch	x25, x21, x6
82869ac57 James Morse         2016-04-27  115  
82869ac57 James Morse         2016-04-27  116  	ic	ialluis
82869ac57 James Morse         2016-04-27  117  	dsb	ish
82869ac57 James Morse         2016-04-27  118  	isb
82869ac57 James Morse         2016-04-27  119  
82869ac57 James Morse         2016-04-27  120  	cbz	x24, 3f		/* Do we need to re-initialise EL2? */
82869ac57 James Morse         2016-04-27  121  	hvc	#0
82869ac57 James Morse         2016-04-27  122  3:	ret
82869ac57 James Morse         2016-04-27  123  
82869ac57 James Morse         2016-04-27  124  	.ltorg
82869ac57 James Morse         2016-04-27  125  ENDPROC(swsusp_arch_suspend_exit)

:::::: The code at line 101 was first introduced by commit
:::::: 072f0a633838aca13b5a8b211eb64f5c445cfd7c arm64: Introduce raw_{d,i}cache_line_size

:::::: TO: Suzuki K Poulose <suzuki.poulose@arm.com>
:::::: CC: Will Deacon <will.deacon@arm.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
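The failure is independent of the IDC/DIC logic itself: raw_dcache_line_size now takes a third argument, but the existing callers in hibernate-asm.S and relocate_kernel.S still pass only two, so \ctr expands to nothing and the assembler sees `mrs ,ctr_el0'. A sketch of the kind of call-site update needed (the choice of x4 as the extra scratch register is illustrative; the raw_dcache_line_size user at relocate_kernel.S:37 needs the same treatment):

--- a/arch/arm64/kernel/hibernate-asm.S
+++ b/arch/arm64/kernel/hibernate-asm.S
@@ -100,4 +100,4 @@
 	/* Clean the copied page to PoU - based on flush_icache_range() */
-	raw_dcache_line_size x2, x3
+	raw_dcache_line_size x2, x3, x4		// extra scratch register for the new \ctr argument
 	sub	x3, x2, #1
 	bic	x4, x10, x3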
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 3c78835..9eaa948 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -30,6 +30,7 @@
 #include <asm/pgtable-hwdef.h>
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
+#include <asm/cache.h>
 
 	.macro save_and_disable_daif, flags
 	mrs	\flags, daif
@@ -334,9 +335,9 @@
  * raw_dcache_line_size - get the minimum D-cache line size on this CPU
  * from the CTR register.
  */
-	.macro	raw_dcache_line_size, reg, tmp
-	mrs	\tmp, ctr_el0			// read CTR
-	ubfm	\tmp, \tmp, #16, #19		// cache line size encoding
+	.macro	raw_dcache_line_size, reg, tmp, ctr
+	mrs	\ctr, ctr_el0			// read CTR
+	ubfm	\tmp, \ctr, #16, #19		// cache line size encoding
 	mov	\reg, #4			// bytes per word
 	lsl	\reg, \reg, \tmp		// actual cache line size
 	.endm
@@ -344,9 +345,9 @@
 /*
  * dcache_line_size - get the safe D-cache line size across all CPUs
  */
-	.macro	dcache_line_size, reg, tmp
-	read_ctr	\tmp
-	ubfm	\tmp, \tmp, #16, #19		// cache line size encoding
+	.macro	dcache_line_size, reg, tmp, ctr
+	read_ctr	\ctr
+	ubfm	\tmp, \ctr, #16, #19		// cache line size encoding
 	mov	\reg, #4			// bytes per word
 	lsl	\reg, \reg, \tmp		// actual cache line size
 	.endm
@@ -355,9 +356,9 @@
  * raw_icache_line_size - get the minimum I-cache line size on this CPU
  * from the CTR register.
  */
-	.macro	raw_icache_line_size, reg, tmp
-	mrs	\tmp, ctr_el0			// read CTR
-	and	\tmp, \tmp, #0xf		// cache line size encoding
+	.macro	raw_icache_line_size, reg, tmp, ctr
+	mrs	\ctr, ctr_el0			// read CTR
+	and	\tmp, \ctr, #0xf		// cache line size encoding
 	mov	\reg, #4			// bytes per word
 	lsl	\reg, \reg, \tmp		// actual cache line size
 	.endm
@@ -365,9 +366,9 @@
 /*
  * icache_line_size - get the safe I-cache line size across all CPUs
  */
-	.macro	icache_line_size, reg, tmp
-	read_ctr	\tmp
-	and	\tmp, \tmp, #0xf		// cache line size encoding
+	.macro	icache_line_size, reg, tmp, ctr
+	read_ctr	\ctr
+	and	\tmp, \ctr, #0xf		// cache line size encoding
 	mov	\reg, #4			// bytes per word
 	lsl	\reg, \reg, \tmp		// actual cache line size
 	.endm
@@ -408,13 +409,21 @@
  * 	size:		size of the region
  * 	Corrupts:	kaddr, size, tmp1, tmp2
  */
-	.macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
-	dcache_line_size \tmp1, \tmp2
+	.macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2, tmp3
+	dcache_line_size \tmp1, \tmp2, \tmp3
 	add	\size, \kaddr, \size
 	sub	\tmp2, \tmp1, #1
 	bic	\kaddr, \kaddr, \tmp2
 9998:
-	.if	(\op == cvau || \op == cvac)
+	.if	(\op == cvau)
+alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
+	tbnz	\tmp3, #CTR_IDC_SHIFT, 9997f
+	dc	cvau, \kaddr
+alternative_else
+	dc	civac, \kaddr
+	nop
+alternative_endif
+	.elseif	(\op == cvac)
 alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
 	dc	\op, \kaddr
 alternative_else
@@ -433,6 +442,7 @@
 	cmp	\kaddr, \size
 	b.lo	9998b
 	dsb	\domain
+9997:
 	.endm
 
 /*
@@ -441,10 +451,11 @@
  *
  * 	start, end:	virtual addresses describing the region
  * 	label:		A label to branch to on user fault.
- * 	Corrupts:	tmp1, tmp2
+ * 	Corrupts:	tmp1, tmp2, tmp3
  */
-	.macro invalidate_icache_by_line start, end, tmp1, tmp2, label
-	icache_line_size \tmp1, \tmp2
+	.macro invalidate_icache_by_line start, end, tmp1, tmp2, tmp3, label
+	icache_line_size \tmp1, \tmp2, \tmp3
+	tbnz	\tmp3, #CTR_DIC_SHIFT, 9996f
 	sub	\tmp2, \tmp1, #1
 	bic	\tmp2, \start, \tmp2
 9997:
@@ -454,6 +465,7 @@
 	b.lo	9997b
 	dsb	ish
 	isb
+9996:
 	.endm
 
 /*
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index ea9bb4e..aea533b 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -22,6 +22,8 @@
 #define CTR_L1IP_MASK		3
 #define CTR_CWG_SHIFT		24
 #define CTR_CWG_MASK		15
+#define CTR_IDC_SHIFT		28
+#define CTR_DIC_SHIFT		29
 
 #define CTR_L1IP(ctr)		(((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 29b1f87..f42bb5a 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -200,6 +200,8 @@ static int __init register_cpu_hwcaps_dumper(void)
 
 static const struct arm64_ftr_bits ftr_ctr[] = {
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1),		/* RAO */
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 0),	/* DIC */
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 0),	/* IDC */
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, 24, 4, 0),	/* CWG */
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0),	/* ERG */
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 1),	/* DminLine */
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 758bde7..5764af8 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -24,6 +24,7 @@
 #include <asm/cpufeature.h>
 #include <asm/alternative.h>
 #include <asm/asm-uaccess.h>
+#include <asm/cache.h>
 
 /*
  *	flush_icache_range(start,end)
@@ -50,7 +51,12 @@ ENTRY(flush_icache_range)
  */
 ENTRY(__flush_cache_user_range)
 	uaccess_ttbr0_enable x2, x3, x4
-	dcache_line_size x2, x3
+	dcache_line_size x2, x3, x4
+alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
+	tbnz	x4, #CTR_IDC_SHIFT, 8f
+alternative_else
+	nop
+alternative_endif
 	sub	x3, x2, #1
 	bic	x4, x0, x3
 1:
@@ -60,7 +66,9 @@ user_alt 9f, "dc cvau, x4",  "dc civac, x4",  ARM64_WORKAROUND_CLEAN_CACHE
 	b.lo	1b
 	dsb	ish
 
-	invalidate_icache_by_line x0, x1, x2, x3, 9f
+8:
+	invalidate_icache_by_line x0, x1, x2, x3, x4, 9f
+
 	mov	x0, #0
 1:
 	uaccess_ttbr0_disable x1, x2
@@ -82,7 +90,7 @@ ENDPROC(__flush_cache_user_range)
 ENTRY(invalidate_icache_range)
 	uaccess_ttbr0_enable x2, x3, x4
 
-	invalidate_icache_by_line x0, x1, x2, x3, 2f
+	invalidate_icache_by_line x0, x1, x2, x3, x4, 2f
 	mov	x0, xzr
 1:
 	uaccess_ttbr0_disable x1, x2
@@ -102,7 +110,7 @@ ENDPROC(invalidate_icache_range)
  *	- size    - size in question
  */
 ENTRY(__flush_dcache_area)
-	dcache_by_line_op civac, sy, x0, x1, x2, x3
+	dcache_by_line_op civac, sy, x0, x1, x2, x3, x4
 	ret
 ENDPIPROC(__flush_dcache_area)
 
@@ -116,7 +124,7 @@ ENDPIPROC(__flush_dcache_area)
  *	- size    - size in question
  */
 ENTRY(__clean_dcache_area_pou)
-	dcache_by_line_op cvau, ish, x0, x1, x2, x3
+	dcache_by_line_op cvau, ish, x0, x1, x2, x3, x4
 	ret
 ENDPROC(__clean_dcache_area_pou)
 
@@ -140,7 +148,7 @@ ENTRY(__inval_dcache_area)
  */
 __dma_inv_area:
 	add	x1, x1, x0
-	dcache_line_size x2, x3
+	dcache_line_size x2, x3, x4
 	sub	x3, x2, #1
 	tst	x1, x3				// end cache line aligned?
 	bic	x1, x1, x3
@@ -178,7 +186,7 @@ ENTRY(__clean_dcache_area_poc)
  *	- size    - size in question
  */
 __dma_clean_area:
-	dcache_by_line_op cvac, sy, x0, x1, x2, x3
+	dcache_by_line_op cvac, sy, x0, x1, x2, x3, x4
 	ret
 ENDPIPROC(__clean_dcache_area_poc)
 ENDPROC(__dma_clean_area)
@@ -193,7 +201,7 @@ ENDPROC(__dma_clean_area)
  *	- size    - size in question
  */
 ENTRY(__clean_dcache_area_pop)
-	dcache_by_line_op cvap, sy, x0, x1, x2, x3
+	dcache_by_line_op cvap, sy, x0, x1, x2, x3, x4
 	ret
 ENDPIPROC(__clean_dcache_area_pop)
 
@@ -206,7 +214,7 @@ ENDPIPROC(__clean_dcache_area_pop)
  *	- size    - size in question
  */
 ENTRY(__dma_flush_area)
-	dcache_by_line_op civac, sy, x0, x1, x2, x3
+	dcache_by_line_op civac, sy, x0, x1, x2, x3, x4
 	ret
 ENDPIPROC(__dma_flush_area)