
[RFC] mixture of cleanups to cache-v7.S

Message ID 20150402225759.GY24899@n2100.arm.linux.org.uk (mailing list archive)
State New, archived

Commit Message

Russell King - ARM Linux April 2, 2015, 10:57 p.m. UTC
On Thu, Apr 02, 2015 at 11:49:47PM +0100, Russell King - ARM Linux wrote:
> Several cleanups are in the patch below... I'll separate them out, but
> I'd like to hear comments on them.  Basically:
> 
> 1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
>    use movw and movt when loading large constants, rather than using
>    "ldr rd,=constant"
> 
> 2. we can do a much more efficient check for the erratum in
>    v7_flush_dcache_louis than we were doing - rather than putting the
>    workaround code in the fast path, we can re-organise this such
>    that we only try to run the workaround code if the LoU field is
>    zero.
> 
> 3. shift the bitfield we want to extract from the CLIDR to the
>    appropriate bit position prior to masking; this reduces the
>    complexity of the code, particularly with the SMP differences in
>    v7_flush_dcache_louis.
> 
> 4. pre-shift the Cortex-A9 MIDR value to be checked, and shift the
>    actual MIDR to lose the bottom four revision bits.
> 
> 5. as the v7_flush_dcache_louis code is now more efficient, I see no
>    reason not to enable this workaround by default - if people really
>    want it disabled, they can still choose that option.  This is in
>    addition to Versatile Express enabling it.  Given that running
>    without the workaround can corrupt memory, I think it's only sane
>    to encourage enabling it, even though it only affects r0pX CPUs.
> 
> One obvious issue comes up here though - in the case that the LoU
> bits are validly zero, we merely return from v7_flush_dcache_louis
> with no DSB or ISB.  However, v7_flush_dcache_all always ends with a
> DSB and ISB, even if LoC is zero.  Is this an intentional difference,
> or should v7_flush_dcache_louis always end with a DSB+ISB?

I should point out that if the DSB+ISB is needed, then the code can
instead become as below - basically, we just move the CLIDR field into
the appropriate position and branch to start_flush_levels, which issues
the DMB, applies the mask to extract the appropriate field, and then
decides whether it has any levels to process.
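
As an aside on point 1, here is a minimal before/after sketch of the
constant loading (a standalone fragment for illustration, not part of
the patch itself): the literal-pool form costs a PC-relative load from
memory, whereas the ARMv7 movw/movt forms encode the constant directly
in the instruction stream.

	@ before: the assembler places 0x7fff in a nearby literal pool
	@ and emits a PC-relative load to fetch it
	ldr	r1, =0x7fff

	@ after: a 16-bit constant encoded directly in one instruction
	movw	r1, #0x7fff

	@ constants wider than 16 bits take a movw/movt pair
	movw	r1, #:lower16:0x410fc090
	movt	r1, #:upper16:0x410fc090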

Comments

Russell King - ARM Linux April 3, 2015, 10:08 a.m. UTC | #1
On Thu, Apr 02, 2015 at 11:57:59PM +0100, Russell King - ARM Linux wrote:
> On Thu, Apr 02, 2015 at 11:49:47PM +0100, Russell King - ARM Linux wrote:
> > Several cleanups are in the patch below... I'll separate them out, but
> > I'd like to hear comments on them.  Basically:
> > 
> > 1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
> >    use movw and movt when loading large constants, rather than using
> >    "ldr rd,=constant"
> > 
> > 2. we can do a much more efficient check for the erratum in
> >    v7_flush_dcache_louis than we were doing - rather than putting the
> >    workaround code in the fast path, we can re-organise this such
> >    that we only try to run the workaround code if the LoU field is
> >    zero.
> > 
> > 3. shift the bitfield we want to extract from the CLIDR to the
> >    appropriate bit position prior to masking; this reduces the
> >    complexity of the code, particularly with the SMP differences in
> >    v7_flush_dcache_louis.
> > 
> > 4. pre-shift the Cortex-A9 MIDR value to be checked, and shift the
> >    actual MIDR to lose the bottom four revision bits.
> > 
> > 5. as the v7_flush_dcache_louis code is now more efficient, I see no
> >    reason not to enable this workaround by default - if people really
> >    want it disabled, they can still choose that option.  This is in
> >    addition to Versatile Express enabling it.  Given that running
> >    without the workaround can corrupt memory, I think it's only sane
> >    to encourage enabling it, even though it only affects r0pX CPUs.
> > 
> > One obvious issue comes up here though - in the case that the LoU
> > bits are validly zero, we merely return from v7_flush_dcache_louis
> > with no DSB or ISB.  However, v7_flush_dcache_all always ends with a
> > DSB and ISB, even if LoC is zero.  Is this an intentional difference,
> > or should v7_flush_dcache_louis always end with a DSB+ISB?
> 
> I should point out that if the DSB+ISB is needed, then the code can
> instead become as below - basically, we just move the CLIDR field into
> the appropriate position and branch to start_flush_levels, which issues
> the DMB, applies the mask to extract the appropriate field, and then
> decides whether it has any levels to process.

I've now tested this on the Versatile Express and SDP4430; both work
fine with the patch below.

> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 2eb6de9465bf..c26dfef393cd 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1139,6 +1139,7 @@ config ARM_ERRATA_742231
>  config ARM_ERRATA_643719
>  	bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
>  	depends on CPU_V7 && SMP
> +	default y
>  	help
>  	  This option enables the workaround for the 643719 Cortex-A9 (prior to
>  	  r1p0) erratum. On affected cores the LoUIS bit field of the CLIDR
> diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
> index b966656d2c2d..14bfdd584385 100644
> --- a/arch/arm/mm/cache-v7.S
> +++ b/arch/arm/mm/cache-v7.S
> @@ -36,10 +36,10 @@ ENTRY(v7_invalidate_l1)
>         mcr     p15, 2, r0, c0, c0, 0
>         mrc     p15, 1, r0, c0, c0, 0
>  
> -       ldr     r1, =0x7fff
> +       movw    r1, #0x7fff
>         and     r2, r1, r0, lsr #13
>  
> -       ldr     r1, =0x3ff
> +       movw    r1, #0x3ff
>  
>         and     r3, r1, r0, lsr #3      @ NumWays - 1
>         add     r2, r2, #1              @ NumSets
> @@ -88,23 +88,20 @@ ENDPROC(v7_flush_icache_all)
>   */
>  
>  ENTRY(v7_flush_dcache_louis)
> -	dmb					@ ensure ordering with previous memory accesses
>  	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
> -	ALT_SMP(ands	r3, r0, #(7 << 21))	@ extract LoUIS from clidr
> -	ALT_UP(ands	r3, r0, #(7 << 27))	@ extract LoUU from clidr
> +ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
> +ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
>  #ifdef CONFIG_ARM_ERRATA_643719
> -	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
> -	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
> -	ldreq	r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
> -	biceq	r2, r2, #0x0000000f             @ clear minor revision number
> -	teqeq	r2, r1                          @ test for errata affected core and if so...
> -	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
> +ALT_SMP(ands	r3, r3, #7 << 1)		@ extract LoU field from clidr
> +ALT_UP(	b	start_flush_levels)
> +	bne	start_flush_levels		@ LoU != 0, start flushing
> +	mrc	p15, 0, r2, c0, c0, 0		@ read main ID register
> +	movw	r1, #:lower16:(0x410fc090 >> 4)	@ ID of ARM Cortex A9 r0p?
> +	movt	r1, #:upper16:(0x410fc090 >> 4)
> +	teq	r1, r2, lsr #4			@ test for errata affected core and if so...
> +	moveq	r3, #1 << 1			@ fix LoUIS value for an affected core
>  #endif
> -	ALT_SMP(mov	r3, r3, lsr #20)	@ r3 = LoUIS * 2
> -	ALT_UP(mov	r3, r3, lsr #26)	@ r3 = LoUU * 2
> -	reteq	lr				@ return if level == 0
> -	mov	r10, #0				@ r10 (starting level) = 0
> -	b	flush_levels			@ start flushing cache levels
> +	b	start_flush_levels		@ start flushing cache levels
>  ENDPROC(v7_flush_dcache_louis)
>  
>  /*
> @@ -117,10 +114,11 @@ ENDPROC(v7_flush_dcache_louis)
>   *	- mm    - mm_struct describing address space
>   */
>  ENTRY(v7_flush_dcache_all)
> -	dmb					@ ensure ordering with previous memory accesses
>  	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
> -	ands	r3, r0, #0x7000000		@ extract loc from clidr
> -	mov	r3, r3, lsr #23			@ left align loc bit field
> +	mov	r3, r0, lsr #23			@ align LoC
> +start_flush_levels:
> +	dmb					@ ensure ordering with previous memory accesses
> +	ands	r3, r3, #7 << 1			@ extract LoC (or LoU) field
>  	beq	finished			@ if loc is 0, then no need to clean
>  	mov	r10, #0				@ start clean at cache level 0
>  flush_levels:
> @@ -140,10 +138,10 @@ flush_levels:
>  #endif
>  	and	r2, r1, #7			@ extract the length of the cache lines
>  	add	r2, r2, #4			@ add 4 (line length offset)
> -	ldr	r4, =0x3ff
> +	movw	r4, #0x3ff
>  	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
>  	clz	r5, r4				@ find bit position of way size increment
> -	ldr	r7, =0x7fff
> +	movw	r7, #0x7fff
>  	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
>  loop1:
>  	mov	r9, r7				@ create working copy of max index
> 
> -- 
> FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
> according to speedtest.net.
> 
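
To make point 4 above concrete, here is the MIDR check before and
after, pulled out of the patch as a side-by-side sketch: pre-shifting
the expected Cortex-A9 ID by four bits at assembly time lets the
revision field be discarded by a shifted operand in the comparison, so
the separate bic and the literal-pool load both disappear.

	@ before: load the full match value, clear the revision nibble
	@ at run time, then compare
	ldr	r1, =0x410fc090			@ ARM Cortex-A9 r0p?
	bic	r2, r2, #0x0000000f		@ clear minor revision
	teq	r2, r1

	@ after: pre-shift the match value at assembly time, and fold
	@ the run-time shift into the comparison itself
	movw	r1, #:lower16:(0x410fc090 >> 4)
	movt	r1, #:upper16:(0x410fc090 >> 4)
	teq	r1, r2, lsr #4			@ compare, ignoring revision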
Russell King - ARM Linux April 3, 2015, 10:53 a.m. UTC | #2
On Fri, Apr 03, 2015 at 11:08:48AM +0100, Russell King - ARM Linux wrote:
> On Thu, Apr 02, 2015 at 11:57:59PM +0100, Russell King - ARM Linux wrote:
> > On Thu, Apr 02, 2015 at 11:49:47PM +0100, Russell King - ARM Linux wrote:
> > > Several cleanups are in the patch below... I'll separate them out, but
> > > I'd like to hear comments on them.  Basically:
> > > 
> > > 1. cache-v7.S is built for ARMv7 CPUs, so there's no reason not to
> > >    use movw and movt when loading large constants, rather than using
> > >    "ldr rd,=constant"
> > > 
> > > 2. we can do a much more efficient check for the erratum in
> > >    v7_flush_dcache_louis than we were doing - rather than putting the
> > >    workaround code in the fast path, we can re-organise this such
> > >    that we only try to run the workaround code if the LoU field is
> > >    zero.
> > > 
> > > 3. shift the bitfield we want to extract from the CLIDR to the
> > >    appropriate bit position prior to masking; this reduces the
> > >    complexity of the code, particularly with the SMP differences in
> > >    v7_flush_dcache_louis.
> > > 
> > > 4. pre-shift the Cortex-A9 MIDR value to be checked, and shift the
> > >    actual MIDR to lose the bottom four revision bits.
> > > 
> > > 5. as the v7_flush_dcache_louis code is now more efficient, I see no
> > >    reason not to enable this workaround by default - if people really
> > >    want it disabled, they can still choose that option.  This is in
> > >    addition to Versatile Express enabling it.  Given that running
> > >    without the workaround can corrupt memory, I think it's only sane
> > >    to encourage enabling it, even though it only affects r0pX CPUs.
> > > 
> > > One obvious issue comes up here though - in the case that the LoU
> > > bits are validly zero, we merely return from v7_flush_dcache_louis
> > > with no DSB or ISB.  However, v7_flush_dcache_all always ends with a
> > > DSB and ISB, even if LoC is zero.  Is this an intentional difference,
> > > or should v7_flush_dcache_louis always end with a DSB+ISB?
> > 
> > I should point out that if the DSB+ISB is needed, then the code can
> > instead become as below - basically, we just move the CLIDR field into
> > the appropriate position and branch to start_flush_levels, which issues
> > the DMB, applies the mask to extract the appropriate field, and then
> > decides whether it has any levels to process.
> 
> I've now tested this on the Versatile Express and SDP4430; both work
> fine with the patch below.

Here are the patches, broken out.  My intention is to put the first
five, which should be entirely non-controversial, into my for-next
branch.  For the last two, I'll wait until I hear back from you after
the Easter break.
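
As background for point 3 in the list above, the CLIDR handling
amounts to this before/after (fragments distilled from the patch, not
a literal hunk): shifting first means the SMP and UP paths differ only
in the shift amount, and a single #7 << 1 mask then leaves the level
count already multiplied by two, which is how the flush loop wants it.

	@ before: mask the LoUIS field in place (bits 23:21), then
	@ shift it down to bits 3:1 for the level loop
	ands	r3, r0, #(7 << 21)
	mov	r3, r3, lsr #20

	@ after: shift first, then mask; the same #7 << 1 mask now
	@ serves both the SMP (LoUIS) and UP (LoUU) variants
	mov	r3, r0, lsr #20			@ LoUIS to bits 3:1
	ands	r3, r3, #7 << 1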

Patch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2eb6de9465bf..c26dfef393cd 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1139,6 +1139,7 @@ config ARM_ERRATA_742231
 config ARM_ERRATA_643719
 	bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
 	depends on CPU_V7 && SMP
+	default y
 	help
 	  This option enables the workaround for the 643719 Cortex-A9 (prior to
 	  r1p0) erratum. On affected cores the LoUIS bit field of the CLIDR
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index b966656d2c2d..14bfdd584385 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -36,10 +36,10 @@ ENTRY(v7_invalidate_l1)
        mcr     p15, 2, r0, c0, c0, 0
        mrc     p15, 1, r0, c0, c0, 0
 
-       ldr     r1, =0x7fff
+       movw    r1, #0x7fff
        and     r2, r1, r0, lsr #13
 
-       ldr     r1, =0x3ff
+       movw    r1, #0x3ff
 
        and     r3, r1, r0, lsr #3      @ NumWays - 1
        add     r2, r2, #1              @ NumSets
@@ -88,23 +88,20 @@ ENDPROC(v7_flush_icache_all)
  */
 
 ENTRY(v7_flush_dcache_louis)
-	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr, r0 = clidr
-	ALT_SMP(ands	r3, r0, #(7 << 21))	@ extract LoUIS from clidr
-	ALT_UP(ands	r3, r0, #(7 << 27))	@ extract LoUU from clidr
+ALT_SMP(mov	r3, r0, lsr #20)		@ move LoUIS into position
+ALT_UP(	mov	r3, r0, lsr #26)		@ move LoUU into position
 #ifdef CONFIG_ARM_ERRATA_643719
-	ALT_SMP(mrceq	p15, 0, r2, c0, c0, 0)	@ read main ID register
-	ALT_UP(reteq	lr)			@ LoUU is zero, so nothing to do
-	ldreq	r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
-	biceq	r2, r2, #0x0000000f             @ clear minor revision number
-	teqeq	r2, r1                          @ test for errata affected core and if so...
-	orreqs	r3, #(1 << 21)			@   fix LoUIS value (and set flags state to 'ne')
+ALT_SMP(ands	r3, r3, #7 << 1)		@ extract LoU field from clidr
+ALT_UP(	b	start_flush_levels)
+	bne	start_flush_levels		@ LoU != 0, start flushing
+	mrc	p15, 0, r2, c0, c0, 0		@ read main ID register
+	movw	r1, #:lower16:(0x410fc090 >> 4)	@ ID of ARM Cortex A9 r0p?
+	movt	r1, #:upper16:(0x410fc090 >> 4)
+	teq	r1, r2, lsr #4			@ test for errata affected core and if so...
+	moveq	r3, #1 << 1			@ fix LoUIS value for an affected core
 #endif
-	ALT_SMP(mov	r3, r3, lsr #20)	@ r3 = LoUIS * 2
-	ALT_UP(mov	r3, r3, lsr #26)	@ r3 = LoUU * 2
-	reteq	lr				@ return if level == 0
-	mov	r10, #0				@ r10 (starting level) = 0
-	b	flush_levels			@ start flushing cache levels
+	b	start_flush_levels		@ start flushing cache levels
 ENDPROC(v7_flush_dcache_louis)
 
 /*
@@ -117,10 +114,11 @@ ENDPROC(v7_flush_dcache_louis)
  *	- mm    - mm_struct describing address space
  */
 ENTRY(v7_flush_dcache_all)
-	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
-	ands	r3, r0, #0x7000000		@ extract loc from clidr
-	mov	r3, r3, lsr #23			@ left align loc bit field
+	mov	r3, r0, lsr #23			@ align LoC
+start_flush_levels:
+	dmb					@ ensure ordering with previous memory accesses
+	ands	r3, r3, #7 << 1			@ extract LoC (or LoU) field
 	beq	finished			@ if loc is 0, then no need to clean
 	mov	r10, #0				@ start clean at cache level 0
 flush_levels:
@@ -140,10 +138,10 @@ flush_levels:
 #endif
 	and	r2, r1, #7			@ extract the length of the cache lines
 	add	r2, r2, #4			@ add 4 (line length offset)
-	ldr	r4, =0x3ff
+	movw	r4, #0x3ff
 	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
 	clz	r5, r4				@ find bit position of way size increment
-	ldr	r7, =0x7fff
+	movw	r7, #0x7fff
 	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
 loop1:
 	mov	r9, r7				@ create working copy of max index
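
Finally, a condensed view of the control flow after points 2 and 3 (a
sketch of the shape of the code above, not a literal quote of it):
both entry points now position their CLIDR field at bits 3:1 and fall
into start_flush_levels, which issues the DMB, masks out the level
count, and either proceeds to the per-level flush loop or exits; the
erratum check in v7_flush_dcache_louis runs only when the LoU field
reads as zero.

	@ v7_flush_dcache_louis: shift LoUIS/LoUU down to bits 3:1,
	@ run the erratum check only if the field is zero, then join
	@ v7_flush_dcache_all: shift LoC down to bits 3:1, fall through
start_flush_levels:
	dmb					@ order prior memory accesses
	ands	r3, r3, #7 << 1			@ level count * 2, sets Z
	beq	finished			@ nothing to clean
	mov	r10, #0				@ start at cache level 0
flush_levels:
	@ ... per-level set/way clean/invalidate loop ...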