
ARM: decompressor: cover BSS in cache clean and reorder with MMU disable on v7

Message ID 20210122152012.30075-1-ardb@kernel.org (mailing list archive)
State New, archived
Series ARM: decompressor: cover BSS in cache clean and reorder with MMU disable on v7

Commit Message

Ard Biesheuvel Jan. 22, 2021, 3:20 p.m. UTC
To ensure that no cache lines cover any of the data that is accessed by
the booting kernel with the MMU off, cover the uncompressed kernel's BSS
region in the cache clean operation.

Also, to ensure that no cachelines are allocated while the cache is being
cleaned, perform the cache clean operation *after* disabling the MMU and
caches when running on v7 or later, by making a tail call to the clean
routine from the cache_off routine. This requires passing the VA range
to cache_off(), which means some care needs to be taken to preserve
R0 and R1 across the call to cache_off().

Since this makes the first cache clean redundant, call it with the
range reduced to zero. This only affects v7, as all other versions
ignore R0/R1 entirely.
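
In pseudo-C, the resulting flow looks roughly like this (an illustrative
sketch only - cache_clean_flush and cache_off are the assembly routines in
head.S, not C functions, and the argument names are made up):

	/* old flow on v7: clean while the MMU and caches are still on,
	 * so clean lines could still be allocated afterwards */
	cache_clean_flush(start, end_of_image);
	cache_off();

	/* new flow: the effective clean happens after MMU/caches are off */
	cache_clean_flush(0, 0);		/* zero range on v7; R0/R1 ignored elsewhere */
	cache_off(start, end_including_bss);	/* v7: disable MMU+caches, then
						 * tail-call the by-VA clean */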

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm/boot/compressed/head.S | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

Comments

Russell King (Oracle) Jan. 22, 2021, 4:13 p.m. UTC | #1
On Fri, Jan 22, 2021 at 04:20:12PM +0100, Ard Biesheuvel wrote:
> To ensure that no cache lines cover any of the data that is accessed by
> the booting kernel with the MMU off, cover the uncompressed kernel's BSS
> region in the cache clean operation.
> 
> Also, to ensure that no cachelines are allocated while the cache is being
> cleaned, perform the cache clean operation *after* disabling the MMU and
> caches when running on v7 or later, by making a tail call to the clean
> routine from the cache_off routine. This requires passing the VA range
> to cache_off(), which means some care needs to be taken to preserve
> R0 and R1 across the call to cache_off().
> 
> Since this makes the first cache clean redundant, call it with the
> range reduced to zero. This only affects v7, as all other versions
> ignore R0/R1 entirely.
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

Seems to work, thanks! I'd suggest we follow up with this patch which
gets rid of all the register shuffling:

8<===
From: Russell King <rmk+kernel@armlinux.org.uk>
Subject: [PATCH] ARM: decompressor: tidy up register usage

Tidy up the registers so we don't end up having to shuffle values
between registers to work around other register usages.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/boot/compressed/head.S | 41 +++++++++++++++------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index b44738110095..c0a13004c5d4 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -930,6 +930,7 @@ ENDPROC(__setup_mmu)
  *  r2  = corrupted
  *  r3  = block offset
  *  r9  = corrupted
+ *  r10 = corrupted
  *  r12 = corrupted
  */
 
@@ -949,10 +950,10 @@ call_cache_fn:	adr	r12, proc_types
 #else
 		ldr	r9, =CONFIG_PROCESSOR_ID
 #endif
-1:		ldr	r1, [r12, #0]		@ get value
+1:		ldr	r10, [r12, #0]		@ get value
 		ldr	r2, [r12, #4]		@ get mask
-		eor	r1, r1, r9		@ (real ^ match)
-		tst	r1, r2			@       & mask
+		eor	r10, r10, r9		@ (real ^ match)
+		tst	r10, r2			@       & mask
  ARM(		addeq	pc, r12, r3		) @ call cache function
  THUMB(		addeq	r12, r3			)
  THUMB(		moveq	pc, r12			) @ call cache function
@@ -1139,8 +1140,6 @@ call_cache_fn:	adr	r12, proc_types
  */
 		.align	5
 cache_off:	mov	r3, #12			@ cache_off function
-		mov	r10, r0
-		mov	r11, r1
 		b	call_cache_fn
 
 __armv4_mpu_cache_off:
@@ -1173,22 +1172,21 @@ cache_off:	mov	r3, #12			@ cache_off function
 		mov	pc, lr
 
 __armv7_mmu_cache_off:
-		mrc	p15, 0, r0, c1, c0
+		mrc	p15, 0, r3, c1, c0
 #ifdef CONFIG_MMU
-		bic	r0, r0, #0x000d
+		bic	r3, r3, #0x000d
 #else
-		bic	r0, r0, #0x000c
+		bic	r3, r3, #0x000c
 #endif
-		mcr	p15, 0, r0, c1, c0	@ turn MMU and cache off
-		mov	r0, #0
+		mcr	p15, 0, r3, c1, c0	@ turn MMU and cache off
+		mov	r3, #0
 #ifdef CONFIG_MMU
-		mcr	p15, 0, r0, c8, c7, 0	@ invalidate whole TLB
+		mcr	p15, 0, r3, c8, c7, 0	@ invalidate whole TLB
 #endif
-		mcr	p15, 0, r0, c7, c5, 6	@ invalidate BTC
-		mcr	p15, 0, r0, c7, c10, 4	@ DSB
-		mcr	p15, 0, r0, c7, c5, 4	@ ISB
+		mcr	p15, 0, r3, c7, c5, 6	@ invalidate BTC
+		mcr	p15, 0, r3, c7, c10, 4	@ DSB
+		mcr	p15, 0, r3, c7, c5, 4	@ ISB
 
-		mov	r0, r10
 		b	__armv7_mmu_cache_flush
 
 /*
@@ -1205,7 +1203,6 @@ cache_off:	mov	r3, #12			@ cache_off function
 		.align	5
 cache_clean_flush:
 		mov	r3, #16
-		mov	r11, r1
 		b	call_cache_fn
 
 __armv4_mpu_cache_flush:
@@ -1256,15 +1253,15 @@ cache_off:	mov	r3, #12			@ cache_off function
 		mcr	p15, 0, r10, c7, c14, 0	@ clean+invalidate D
 		b	iflush
 hierarchical:
-		dcache_line_size r1, r2		@ r1 := dcache min line size
-		sub	r2, r1, #1		@ r2 := line size mask
+		dcache_line_size r11, r2	@ r11 := dcache min line size
+		sub	r2, r11, #1		@ r2 := line size mask
 		bic	r0, r0, r2		@ round down start to line size
-		sub	r11, r11, #1		@ end address is exclusive
-		bic	r11, r11, r2		@ round down end to line size
-0:		cmp	r0, r11			@ finished?
+		sub	r1, r1, #1		@ end address is exclusive
+		bic	r1, r1, r2		@ round down end to line size
+0:		cmp	r0, r1			@ finished?
 		bgt	iflush
 		mcr	p15, 0, r0, c7, c14, 1	@ Dcache clean/invalidate by VA
-		add	r0, r0, r1
+		add	r0, r0, r11
 		b	0b
 iflush:
 		mcr	p15, 0, r10, c7, c10, 4	@ DSB
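
In C terms, the hierarchical by-VA loop above does roughly the following
(a sketch only; clean_inv_dcache_mva() stands in for the
"mcr p15, 0, rX, c7, c14, 1" instruction and dcache_min_line_size() for
the dcache_line_size macro):

	static void clean_inv_range(unsigned long start, unsigned long end)
	{
		unsigned long linesz = dcache_min_line_size();
		unsigned long mask = linesz - 1;

		start &= ~mask;			/* round start down to line size */
		end = (end - 1) & ~mask;	/* end is exclusive; round down */

		while (start <= end) {		/* "bgt iflush" exits the loop */
			clean_inv_dcache_mva(start);
			start += linesz;
		}
	}
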
Ard Biesheuvel Jan. 22, 2021, 4:32 p.m. UTC | #2
On Fri, 22 Jan 2021 at 17:13, Russell King - ARM Linux admin
<linux@armlinux.org.uk> wrote:
>
> On Fri, Jan 22, 2021 at 04:20:12PM +0100, Ard Biesheuvel wrote:
> > To ensure that no cache lines cover any of the data that is accessed by
> > the booting kernel with the MMU off, cover the uncompressed kernel's BSS
> > region in the cache clean operation.
> >
> > Also, to ensure that no cachelines are allocated while the cache is being
> > cleaned, perform the cache clean operation *after* disabling the MMU and
> > caches when running on v7 or later, by making a tail call to the clean
> > routine from the cache_off routine. This requires passing the VA range
> > to cache_off(), which means some care needs to be taken to preserve
> > R0 and R1 across the call to cache_off().
> >
> > Since this makes the first cache clean redundant, call it with the
> > range reduced to zero. This only affects v7, as all other versions
> > ignore R0/R1 entirely.
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
>
> Seems to work, thanks! I'd suggest we follow up with this patch which
> gets rid of all the register shuffling:
>

Agreed

> 8<===
> From: Russell King <rmk+kernel@armlinux.org.uk>
> Subject: [PATCH] ARM: decompressor: tidy up register usage
>
> Tidy up the registers so we don't end up having to shuffle values
> between registers to work around other register usages.
>
> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>

Acked-by: Ard Biesheuvel <ardb@kernel.org>

> ---
>  arch/arm/boot/compressed/head.S | 41 +++++++++++++++------------------
>  1 file changed, 19 insertions(+), 22 deletions(-)
>
> diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
> index b44738110095..c0a13004c5d4 100644
> --- a/arch/arm/boot/compressed/head.S
> +++ b/arch/arm/boot/compressed/head.S
> @@ -930,6 +930,7 @@ ENDPROC(__setup_mmu)

Can we drop the r1 = corrupted here now?

>   *  r2  = corrupted
>   *  r3  = block offset
>   *  r9  = corrupted
> + *  r10 = corrupted
>   *  r12 = corrupted
>   */
>
> @@ -949,10 +950,10 @@ call_cache_fn:    adr     r12, proc_types
>  #else
>                 ldr     r9, =CONFIG_PROCESSOR_ID
>  #endif
> -1:             ldr     r1, [r12, #0]           @ get value
> +1:             ldr     r10, [r12, #0]          @ get value
>                 ldr     r2, [r12, #4]           @ get mask
> -               eor     r1, r1, r9              @ (real ^ match)
> -               tst     r1, r2                  @       & mask
> +               eor     r10, r10, r9            @ (real ^ match)
> +               tst     r10, r2                 @       & mask
>   ARM(          addeq   pc, r12, r3             ) @ call cache function
>   THUMB(                addeq   r12, r3                 )
>   THUMB(                moveq   pc, r12                 ) @ call cache function
> @@ -1139,8 +1140,6 @@ call_cache_fn:    adr     r12, proc_types
>   */
>                 .align  5
>  cache_off:     mov     r3, #12                 @ cache_off function
> -               mov     r10, r0
> -               mov     r11, r1
>                 b       call_cache_fn
>
>  __armv4_mpu_cache_off:
> @@ -1173,22 +1172,21 @@ cache_off:      mov     r3, #12                 @ cache_off function
>                 mov     pc, lr
>
>  __armv7_mmu_cache_off:
> -               mrc     p15, 0, r0, c1, c0
> +               mrc     p15, 0, r3, c1, c0
>  #ifdef CONFIG_MMU
> -               bic     r0, r0, #0x000d
> +               bic     r3, r3, #0x000d
>  #else
> -               bic     r0, r0, #0x000c
> +               bic     r3, r3, #0x000c
>  #endif
> -               mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
> -               mov     r0, #0
> +               mcr     p15, 0, r3, c1, c0      @ turn MMU and cache off
> +               mov     r3, #0
>  #ifdef CONFIG_MMU
> -               mcr     p15, 0, r0, c8, c7, 0   @ invalidate whole TLB
> +               mcr     p15, 0, r3, c8, c7, 0   @ invalidate whole TLB
>  #endif
> -               mcr     p15, 0, r0, c7, c5, 6   @ invalidate BTC
> -               mcr     p15, 0, r0, c7, c10, 4  @ DSB
> -               mcr     p15, 0, r0, c7, c5, 4   @ ISB
> +               mcr     p15, 0, r3, c7, c5, 6   @ invalidate BTC
> +               mcr     p15, 0, r3, c7, c10, 4  @ DSB
> +               mcr     p15, 0, r3, c7, c5, 4   @ ISB
>
> -               mov     r0, r10
>                 b       __armv7_mmu_cache_flush
>
>  /*
> @@ -1205,7 +1203,6 @@ cache_off:        mov     r3, #12                 @ cache_off function
>                 .align  5
>  cache_clean_flush:
>                 mov     r3, #16
> -               mov     r11, r1
>                 b       call_cache_fn
>
>  __armv4_mpu_cache_flush:
> @@ -1256,15 +1253,15 @@ cache_off:      mov     r3, #12                 @ cache_off function
>                 mcr     p15, 0, r10, c7, c14, 0 @ clean+invalidate D
>                 b       iflush
>  hierarchical:
> -               dcache_line_size r1, r2         @ r1 := dcache min line size
> -               sub     r2, r1, #1              @ r2 := line size mask
> +               dcache_line_size r11, r2        @ r11 := dcache min line size
> +               sub     r2, r11, #1             @ r2 := line size mask
>                 bic     r0, r0, r2              @ round down start to line size
> -               sub     r11, r11, #1            @ end address is exclusive
> -               bic     r11, r11, r2            @ round down end to line size
> -0:             cmp     r0, r11                 @ finished?
> +               sub     r1, r1, #1              @ end address is exclusive
> +               bic     r1, r1, r2              @ round down end to line size
> +0:             cmp     r0, r1                  @ finished?
>                 bgt     iflush
>                 mcr     p15, 0, r0, c7, c14, 1  @ Dcache clean/invalidate by VA
> -               add     r0, r0, r1
> +               add     r0, r0, r11
>                 b       0b
>  iflush:
>                 mcr     p15, 0, r10, c7, c10, 4 @ DSB
> --
> 2.20.1
>
> --
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!
Ard Biesheuvel Jan. 24, 2021, 1:35 p.m. UTC | #3
On Fri, 22 Jan 2021 at 17:32, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Fri, 22 Jan 2021 at 17:13, Russell King - ARM Linux admin
> <linux@armlinux.org.uk> wrote:
> >
> > On Fri, Jan 22, 2021 at 04:20:12PM +0100, Ard Biesheuvel wrote:
> > > To ensure that no cache lines cover any of the data that is accessed by
> > > the booting kernel with the MMU off, cover the uncompressed kernel's BSS
> > > region in the cache clean operation.
> > >
> > > Also, to ensure that no cachelines are allocated while the cache is being
> > > cleaned, perform the cache clean operation *after* disabling the MMU and
> > > caches when running on v7 or later, by making a tail call to the clean
> > > routine from the cache_off routine. This requires passing the VA range
> > > to cache_off(), which means some care needs to be taken to preserve
> > > R0 and R1 across the call to cache_off().
> > >
> > > Since this makes the first cache clean redundant, call it with the
> > > range reduced to zero. This only affects v7, as all other versions
> > > ignore R0/R1 entirely.
> > >
> > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> >
> > Seems to work, thanks! I'd suggest we follow up with this patch which
> > gets rid of all the register shuffling:
> >
>
> Agreed
>

So what I think is happening is the following:

In v5.7 and before, the set/way operations trap into KVM, which sets
another trap bit to ensure that a second trap occurs the next time the
MMU is disabled. So if any cachelines are allocated after the call to
cache_clean_flush(), they will be invalidated again when KVM
invalidates the VM's entire IPA space.

According to DDI0406C.d paragraph B3.2.1, it is implementation defined
whether non-cacheable accesses that occur with MMU/caches disabled may
hit in the data cache.

So after v5.7, without set/way instructions being issued, the second
trap is never set, and so the only cache clean+invalidate that occurs
is the one that the decompressor performs itself, and the one that KVM
does on the guest's behalf at cache_off() time is omitted. This
results in clean cachelines being allocated that shadow the
mini-stack, which are hit by the non-cacheable accesses that occur
before the kernel proper enables the MMU again.

Reordering the clean+invalidate with the MMU/cache disabling prevents
the issue, as disabling the MMU and caches first disables any mappings
that the cache could perform speculative linefills from, and so the
mini-stack memory access cannot be served from the cache.
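
For reference, the KVM behaviour described above amounts to roughly the
following (a paraphrase, not the actual kernel code - the helper names
here are placeholders):

	/* First set/way op from the guest (HCR_TVM not yet set): clean and
	 * invalidate its whole IPA space and start trapping VM register
	 * writes, so the later SCTLR write is seen as well. */
	static void handle_set_way_op(struct kvm_vcpu *vcpu)
	{
		if (!vm_ops_trapped(vcpu)) {
			flush_guest_ipa_space(vcpu->kvm);
			trap_vm_ops(vcpu);		/* set HCR_TVM */
		}
	}

	/* Trapped SCTLR write: if the guest toggled its MMU/caches (e.g.
	 * cache_off() in the decompressor), clean+invalidate again on its
	 * behalf. Without a prior set/way op, this path is never armed. */
	static void handle_sctlr_write(struct kvm_vcpu *vcpu, bool caches_were_on)
	{
		if (guest_caches_enabled(vcpu) != caches_were_on)
			flush_guest_ipa_space(vcpu->kvm);
	}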


> > 8<===
> > From: Russell King <rmk+kernel@armlinux.org.uk>
> > Subject: [PATCH] ARM: decompressor: tidy up register usage
> >
> > Tidy up the registers so we don't end up having to shuffle values
> > between registers to work around other register usages.
> >
> > Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
>
> Acked-by: Ard Biesheuvel <ardb@kernel.org>
>
> > ---
> >  arch/arm/boot/compressed/head.S | 41 +++++++++++++++------------------
> >  1 file changed, 19 insertions(+), 22 deletions(-)
> >
> > diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
> > index b44738110095..c0a13004c5d4 100644
> > --- a/arch/arm/boot/compressed/head.S
> > +++ b/arch/arm/boot/compressed/head.S
> > @@ -930,6 +930,7 @@ ENDPROC(__setup_mmu)
>
> Can we drop the r1 = corrupted here now?
>
> >   *  r2  = corrupted
> >   *  r3  = block offset
> >   *  r9  = corrupted
> > + *  r10 = corrupted
> >   *  r12 = corrupted
> >   */
> >
> > @@ -949,10 +950,10 @@ call_cache_fn:    adr     r12, proc_types
> >  #else
> >                 ldr     r9, =CONFIG_PROCESSOR_ID
> >  #endif
> > -1:             ldr     r1, [r12, #0]           @ get value
> > +1:             ldr     r10, [r12, #0]          @ get value
> >                 ldr     r2, [r12, #4]           @ get mask
> > -               eor     r1, r1, r9              @ (real ^ match)
> > -               tst     r1, r2                  @       & mask
> > +               eor     r10, r10, r9            @ (real ^ match)
> > +               tst     r10, r2                 @       & mask
> >   ARM(          addeq   pc, r12, r3             ) @ call cache function
> >   THUMB(                addeq   r12, r3                 )
> >   THUMB(                moveq   pc, r12                 ) @ call cache function
> > @@ -1139,8 +1140,6 @@ call_cache_fn:    adr     r12, proc_types
> >   */
> >                 .align  5
> >  cache_off:     mov     r3, #12                 @ cache_off function
> > -               mov     r10, r0
> > -               mov     r11, r1
> >                 b       call_cache_fn
> >
> >  __armv4_mpu_cache_off:
> > @@ -1173,22 +1172,21 @@ cache_off:      mov     r3, #12                 @ cache_off function
> >                 mov     pc, lr
> >
> >  __armv7_mmu_cache_off:
> > -               mrc     p15, 0, r0, c1, c0
> > +               mrc     p15, 0, r3, c1, c0
> >  #ifdef CONFIG_MMU
> > -               bic     r0, r0, #0x000d
> > +               bic     r3, r3, #0x000d
> >  #else
> > -               bic     r0, r0, #0x000c
> > +               bic     r3, r3, #0x000c
> >  #endif
> > -               mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
> > -               mov     r0, #0
> > +               mcr     p15, 0, r3, c1, c0      @ turn MMU and cache off
> > +               mov     r3, #0
> >  #ifdef CONFIG_MMU
> > -               mcr     p15, 0, r0, c8, c7, 0   @ invalidate whole TLB
> > +               mcr     p15, 0, r3, c8, c7, 0   @ invalidate whole TLB
> >  #endif
> > -               mcr     p15, 0, r0, c7, c5, 6   @ invalidate BTC
> > -               mcr     p15, 0, r0, c7, c10, 4  @ DSB
> > -               mcr     p15, 0, r0, c7, c5, 4   @ ISB
> > +               mcr     p15, 0, r3, c7, c5, 6   @ invalidate BTC
> > +               mcr     p15, 0, r3, c7, c10, 4  @ DSB
> > +               mcr     p15, 0, r3, c7, c5, 4   @ ISB
> >
> > -               mov     r0, r10
> >                 b       __armv7_mmu_cache_flush
> >
> >  /*
> > @@ -1205,7 +1203,6 @@ cache_off:        mov     r3, #12                 @ cache_off function
> >                 .align  5
> >  cache_clean_flush:
> >                 mov     r3, #16
> > -               mov     r11, r1
> >                 b       call_cache_fn
> >
> >  __armv4_mpu_cache_flush:
> > @@ -1256,15 +1253,15 @@ cache_off:      mov     r3, #12                 @ cache_off function
> >                 mcr     p15, 0, r10, c7, c14, 0 @ clean+invalidate D
> >                 b       iflush
> >  hierarchical:
> > -               dcache_line_size r1, r2         @ r1 := dcache min line size
> > -               sub     r2, r1, #1              @ r2 := line size mask
> > +               dcache_line_size r11, r2        @ r11 := dcache min line size
> > +               sub     r2, r11, #1             @ r2 := line size mask
> >                 bic     r0, r0, r2              @ round down start to line size
> > -               sub     r11, r11, #1            @ end address is exclusive
> > -               bic     r11, r11, r2            @ round down end to line size
> > -0:             cmp     r0, r11                 @ finished?
> > +               sub     r1, r1, #1              @ end address is exclusive
> > +               bic     r1, r1, r2              @ round down end to line size
> > +0:             cmp     r0, r1                  @ finished?
> >                 bgt     iflush
> >                 mcr     p15, 0, r0, c7, c14, 1  @ Dcache clean/invalidate by VA
> > -               add     r0, r0, r1
> > +               add     r0, r0, r11
> >                 b       0b
> >  iflush:
> >                 mcr     p15, 0, r10, c7, c10, 4 @ DSB
> > --
> > 2.20.1
> >
> > --
> > RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> > FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!
Russell King (Oracle) Jan. 24, 2021, 3:21 p.m. UTC | #4
On Sun, Jan 24, 2021 at 02:35:31PM +0100, Ard Biesheuvel wrote:
> So what I think is happening is the following:
> 
> In v5.7 and before, the set/way operations trap into KVM, which sets
> another trap bit to ensure that second trap occurs the next time the
> MMU is disabled. So if any cachelines are allocated after the call to
> cache_clean_flush(), they will be invalidated again when KVM
> invalidates the VM's entire IPA space.
> 
> According to DDI0406C.d paragraph B3.2.1, it is implementation defined
> whether non-cacheable accesses that occur with MMU/caches disabled may
> hit in the data cache.
> 
> So after v5.7, without set/way instructions being issued, the second
> trap is never set, and so the only cache clean+invalidate that occurs
> is the one that the decompressor performs itself, and the one that KVM
> does on the guest's behalf at cache_off() time is omitted. This
> results in clean cachelines being allocated that shadow the
> mini-stack, which are hit by the non-cacheable accesses that occur
> before the kernel proper enables the MMU again.
> 
> Reordering the clean+invalidate with the MMU/cache disabling prevents
> the issue, as disabling the MMU and caches first disables any mappings
> that the cache could perform speculative linefills from, and so the
> mini-stack memory access cannot be served from the cache.

This may be part of the story, but it doesn't explain all of the
observed behaviour.

First, some background...

We have three levels of cache on the Armada 8040 - there are the two
levels inside the A72 clusters, as designed by Arm Ltd. There is a
third level designed by Marvell which is common to all CPUs, which is
an exclusive cache. This means that if the higher levels of cache
contain a cache line, the L3 cache will not.

Next, consider the state leading up to this point inside the guest:

- the decompressor code has been copied, overlapping the BSS and the
  mini-stack.
- the decompressor code and data has been C+I using the by-MVA
  instructions. This should push the data out to DDR.
- the decompressor has run, writing a large amount of data (that being
  the decompressed kernel image.)

At this precise point where we write to the mini-stack, the data cache
and MMU are both turned off, but the instruction cache is left enabled.

The action around the mini-stack involves writing the following hex
values to the mini-stack, located at 0x40e69420 - note its alignment:

   ffffffff 48000000 09000401 40003000 00000000 4820071d 40008090

Immediately after the write, the values read back have been observed to
be the following (when incorrect, these are a couple of examples):

   ffffffff 48000000 09000401 ee020f30 ee030f10 e3a00903 ee050f30 (1)
   ffffffff 48000000 09000401 ee020f30 00000000 4820071d 40008090 (2)

and after v1_invalidate_l1, it always seems to be:

   ee060f37 e3a00080 ee020f10 ee020f30 ee030f10 e3a00903 ee050f30

v1_invalidate_l1 operates by issuing set/way instructions that target
only the L1 cache - its purpose is to initialise the at-reset undefined
state of the L1 cache. These invalidates must not target lower level
caches, since these may contain valid data from other CPUs already
brought up in the system.

To be absolutely clear about these two observed cases:

case 1:
write: ffffffff 48000000 09000401 40003000 00000000 4820071d 40008090
read : ffffffff 48000000 09000401 ee020f30 ee030f10 e3a00903 ee050f30
read : ee060f37 e3a00080 ee020f10 ee020f30 ee030f10 e3a00903 ee050f30

case 2:
write: ffffffff 48000000 09000401 40003000 00000000 4820071d 40008090
read : ffffffff 48000000 09000401 ee020f30 00000000 4820071d 40008090
read : ee060f37 e3a00080 ee020f10 ee020f30 ee030f10 e3a00903 ee050f30

If we look at the captured data above, there are a few things to note:
1) the point at which we read back wrong data is part way through
   a cache line.
2) case 2 shows only one value is wrong initially, mid-way through the
   stack.
3) after v1_invalidate_l1, it seems that all data is incorrect. This
   could be a result of the actions of v1_invalidate_l1, or merely
   due to time passing and there being pressure from other system
   activity to evict lines from the various levels of caches.

Considering your theory that there are clean cache lines overlapping
the mini-stack, and that non-cacheable accesses hit those cache lines,
then the stmia write should hit those cache lines and mark them dirty.
The subsequent read-back should also hit those cache lines, and return
consistent data. If the cache lines are evicted back to RAM, then a
read will not hit any cache lines, and should still return the data
that was written. Therefore, we should not be seeing any effects at
all, and the data should be consistent. This does not fit with the
observations.

Now consider an alternative theory - that there are clean cache lines
overlapping the mini-stack, and non-cacheable accesses do not hit the
cache lines. This means that the stmia write bypasses the caches and
hits the RAM directly, and reads would also fetch from the RAM. The
only way in this case that we would see data change is if the cache
line were in fact dirty, and it gets written back to RAM between our
non-cacheable write and a subsequent non-cacheable read. This also does
not fit the observations, particularly case (2) that I highlight above
where only _one_ value was seen to be incorrect.

There is another theory along these lines, though - the L1 and L2 have
differing behaviour to non-cacheable accesses from L3, and when a
clean cache line is discarded from L1/L2, it is placed in L3. For
example, if non-cacheable accesses bypass L1 and L2 but not L3. Now we
have a _possibility_ to explain this behaviour. Initially, L1/L2
contain a clean cache line overlapping this area. Accesses initially
bypass the clean cache line, until it gets evicted into L3, where
accesses hit it instead. When it gets evicted from L3, as it was clean,
it doesn't get written back, and we see the in-DDR data. The reverse
could also be true - L1/L2 could be hit by an uncached access but not
L3, and I'd suggest similar effects would be possible. However, this
does not fully explain case (2).

So, I don't think we have a full and proper idea of what is really
behind this.
Ard Biesheuvel Jan. 24, 2021, 3:45 p.m. UTC | #5
On Sun, 24 Jan 2021 at 16:21, Russell King - ARM Linux admin
<linux@armlinux.org.uk> wrote:
>
> On Sun, Jan 24, 2021 at 02:35:31PM +0100, Ard Biesheuvel wrote:
> > So what I think is happening is the following:
> >
> > In v5.7 and before, the set/way operations trap into KVM, which sets
> > another trap bit to ensure that second trap occurs the next time the
> > MMU is disabled. So if any cachelines are allocated after the call to
> > cache_clean_flush(), they will be invalidated again when KVM
> > invalidates the VM's entire IPA space.
> >
> > According to DDI0406C.d paragraph B3.2.1, it is implementation defined
> > whether non-cacheable accesses that occur with MMU/caches disabled may
> > hit in the data cache.
> >
> > So after v5.7, without set/way instructions being issued, the second
> > trap is never set, and so the only cache clean+invalidate that occurs
> > is the one that the decompressor performs itself, and the one that KVM
> > does on the guest's behalf at cache_off() time is omitted. This
> > results in clean cachelines being allocated that shadow the
> > mini-stack, which are hit by the non-cacheable accesses that occur
> > before the kernel proper enables the MMU again.
> >
> > Reordering the clean+invalidate with the MMU/cache disabling prevents
> > the issue, as disabling the MMU and caches first disables any mappings
> > that the cache could perform speculative linefills from, and so the
> > mini-stack memory access cannot be served from the cache.
>
> This may be part of the story, but it doesn't explain all of the
> observed behaviour.
>
> First, some background...
>
> We have three levels of cache on the Armada 8040 - there are the two
> levels inside the A72 clusters, as designed by Arm Ltd. There is a
> third level designed by Marvell which is common to all CPUs, which is
> an exclusive cache. This means that if the higher levels of cache
> contain a cache line, the L3 cache will not.
>
> Next, consider the state leading up to this point inside the guest:
>
> - the decompressor code has been copied, overlapping the BSS and the
>   mini-stack.
> - the decompressor code and data has been C+I using the by-MVA
>   instructions. This should push the data out to DDR.
> - the decompressor has run, writing a large amount of data (that being
>   the decompressed kernel image.)
>
> At this precise point where we write to the mini-stack, the data cache
> and MMU are both turned off, but the instruction cache is left enabled.
>
> The action around the mini-stack involves writing the following hex
>    values to the mini-stack, located at 0x40e69420 - note its alignment:
>
>    ffffffff 48000000 09000401 40003000 00000000 4820071d 40008090
>
> It has been observed that immediately after writing, reading the values
> read back have been observed to be (when incorrect, these are a couple
> of examples):
>
>    ffffffff 48000000 09000401 ee020f30 ee030f10 e3a00903 ee050f30 (1)
>    ffffffff 48000000 09000401 ee020f30 00000000 4820071d 40008090 (2)
>
> and after v1_invalidate_l1, it always seems to be:
>
>    ee060f37 e3a00080 ee020f10 ee020f30 ee030f10 e3a00903 ee050f30
>
> v1_invalidate_l1 operates by issuing set/way instructions that target
> only the L1 cache - its purpose is to initialise the at-reset undefined
> state of the L1 cache. These invalidates must not target lower level
> caches, since these may contain valid data from other CPUs already
> brought up in the system.
>
> To be absolutely clear about these two observed cases:
>
> case 1:
> write: ffffffff 48000000 09000401 40003000 00000000 4820071d 40008090
> read : ffffffff 48000000 09000401 ee020f30 ee030f10 e3a00903 ee050f30
> read : ee060f37 e3a00080 ee020f10 ee020f30 ee030f10 e3a00903 ee050f30
>
> case 2:
> write: ffffffff 48000000 09000401 40003000 00000000 4820071d 40008090
> read : ffffffff 48000000 09000401 ee020f30 00000000 4820071d 40008090
> read : ee060f37 e3a00080 ee020f10 ee020f30 ee030f10 e3a00903 ee050f30
>
> If we look at the captured data above, there are a few things to note:
> 1) the point at which we read-back wrong data is part way through
>    a cache line.
> 2) case 2 shows only one value is wrong initially, mid-way through the
>    stack.
> 3) after v1_invalidate_l1, it seems that all data is incorrect. This
>    could be a result of the actions of v1_invalidate_l1, or merely
>    due to time passing and there being pressure from other system
>    activity to evict lines from the various levels of caches.
>
> Considering your theory that there are clean cache lines overlapping
> the mini-stack, and that non-cacheable accesses hit those cache lines,
> then the stmia write should hit those cache lines and mark them dirty.
> The subsequent read-back should also hit those cache lines, and return
> consistent data. If the cache lines are evicted back to RAM, then a
> read will not hit any cache lines, and should still return the data
> that was written. Therefore, we should not be seeing any effects at
> all, and the data should be consistent. This does not fit with the
> observations.
>
> If we consider an alternative theory - that there are clean cache lines
> overlapping the mini-stack, and non-cacheable accesses do not hit the
> cache lines. This means that the stmia write bypasses the caches and
> hits the RAM directly, and reads would also fetch from the RAM. The
> only way in this case that we would see data change is if the cache
> line were in fact dirty, and it gets written back to RAM between our
> non-cacheable write and a subsequent non-cacheable read. This also does
> not fit the observations, particularly case (2) that I highlight above
> where only _one_ value was seen to be incorrect.
>
> There is another theory along this though - the L1 and L2 have
> differing behaviour to non-cacheable accesses from L3, and when a
> clean cache line is discarded from L1/L2, it is placed in L3. For
> example, if non-cacheable accesses bypass L1 and L2 but not L3. Now we
> have a _possibility_ to explain this behaviour. Initially, L1/L2
> contain a clean cache line overlapping this area. Accesses initially
> bypass the clean cache line, until it gets evicted into L3, where
> accesses hit it instead. When it gets evicted from L3, as it was clean,
> it doesn't get written back, and we see the in-DDR data. The reverse
> could also be true - L1/L2 could be hit by an uncached access but not
> L3, and I'd suggest similar effects would be possible. However, this
> does not fully explain case (2).
>
> So, I don't think we have a full and proper idea of what is really
> behind this.
>

Fair enough. I don't think we will ever be able to get to the bottom of this.

But I do stand by my conclusion that the likely reason that this patch
fixes the issue is that it reinstates the additional cache
clean+invalidate occurring *after* cache_off(), which was carried out
on the guest's behalf before, due to the way KVM handles set/way
operations and the subsequent disabling of the MMU. I.e.,

kvm_set_way_flush() in arch/arm64/kvm/mmu.c, which is called every
time the guest performs a cache op by set/way

/*
 * If this is the first time we do a S/W operation
 * (i.e. HCR_TVM not set) flush the whole memory, and set the
 * VM trapping.
 *
 * Otherwise, rely on the VM trapping to wait for the MMU +
 * Caches to be turned off. At that point, we'll be able to
 * clean the caches again.
 */

(so the significance here is *not* that it flushes the whole memory
but that it sets HCR_TVM - this is why the experiment we did with
adding a single set/way op fixed the issue but changing it to another
mcr instruction that is also known to trap to KVM did not make a
difference)

Then, in kvm_toggle_cache(),

/*
 * If switching the MMU+caches on, need to invalidate the caches.
 * If switching it off, need to clean the caches.
 * Clean + invalidate does the trick always.
 */

which is why cache_off() used to result in a full clean+invalidate
under KVM, but not after my by-MVA patch was applied.

So on the basis that this patch adds back a full cache
clean+invalidate that was inadvertently removed by my previous patch,
I think we should apply this patch. Given that this change can be
traced back to commit 401b368caaecdce1cf8f05bab448172752230cb0, we
should probably include

Fixes: 401b368caaec ("ARM: decompressor: switch to by-VA cache
maintenance for v7 cores")

Are you on board with that?
Russell King (Oracle) Jan. 24, 2021, 4:18 p.m. UTC | #6
On Sun, Jan 24, 2021 at 04:45:14PM +0100, Ard Biesheuvel wrote:
> Fair enough. I don't think we will ever be able to get to the bottom of this.

DDI0406 does say that the data accesses will have a "strongly ordered"
memory type, and that the unexpected cache hit behaviour is
implementation defined. That leaves the door open for different
behaviours at different cache levels in a SoC, especially when each
cache level is designed by a different entity.

So, without a greater understanding of how each level of cache operates
and how they interact with each other, I don't think there's much hope
of knowing definitely what is causing the observed behaviour. Hence, we
agree.

> So on the basis that this patch adds back a full cache
> clean+invalidate that was inadvertently removed by my previous patch,
> I think we should apply this patch. Given that this change can be
> traced back to commit 401b368caaecdce1cf8f05bab448172752230cb0, we
> should probably include
> 
> Fixes: 401b368caaec ("ARM: decompressor: switch to by-VA cache
> maintenance for v7 cores")
> 
> Are you on board with that?

Yes, I think that's definitely the commit which introduced this bug.
Russell King (Oracle) Jan. 24, 2021, 11:08 p.m. UTC | #7
On Sun, Jan 24, 2021 at 04:18:31PM +0000, Russell King - ARM Linux admin wrote:
> On Sun, Jan 24, 2021 at 04:45:14PM +0100, Ard Biesheuvel wrote:
> > So on the basis that this patch adds back a full cache
> > clean+invalidate that was inadvertently removed by my previous patch,
> > I think we should apply this patch. Given that this change can be
> > traced back to commit 401b368caaecdce1cf8f05bab448172752230cb0, we
> > should probably include
> > 
> > Fixes: 401b368caaec ("ARM: decompressor: switch to by-VA cache
> > maintenance for v7 cores")
> > 
> > Are you on board with that?
> 
> Yes, I think that's definitely the commit which introduced this bug.

BTW, there's no need to submit your patch to the patch system, I
obviously already have it in my tree. I'll move it over to the usual
"fixes" branch in due course.

Thanks.
Ard Biesheuvel Jan. 25, 2021, 7:51 a.m. UTC | #8
On Mon, 25 Jan 2021 at 00:09, Russell King - ARM Linux admin
<linux@armlinux.org.uk> wrote:
>
> On Sun, Jan 24, 2021 at 04:18:31PM +0000, Russell King - ARM Linux admin wrote:
> > On Sun, Jan 24, 2021 at 04:45:14PM +0100, Ard Biesheuvel wrote:
> > > So on the basis that this patch adds back a full cache
> > > clean+invalidate that was inadvertently removed by my previous patch,
> > > I think we should apply this patch. Given that this change can be
> > > traced back to commit 401b368caaecdce1cf8f05bab448172752230cb0, we
> > > should probably include
> > >
> > > Fixes: 401b368caaec ("ARM: decompressor: switch to by-VA cache
> > > maintenance for v7 cores")
> > >
> > > Are you on board with that?
> >
> > Yes, I think that's definitely the commit which introduced this bug.
>
> BTW, there's no need to submit your patch to the patch system, I
> obviously already have it in my tree. I'll move it over to the usual
> "fixes" branch in due course.
>
> Thanks.
>

Oops, I already did so yesterday. I did expand a bit on the commit
log, and added the fixes header, but either version is fine with me.

Patch

diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index caa27322a0ab..b0e5c41cefc5 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -614,11 +614,24 @@  not_relocated:	mov	r0, #0
 		mov	r3, r7
 		bl	decompress_kernel
 
+		@
+		@ Perform a cache clean before disabling the MMU entirely.
+		@ In cases where the MMU needs to be disabled first (v7+),
+		@ the clean is performed again by cache_off(), using by-VA
+		@ operations on the range [R0, R1], making this prior call to
+		@ cache_clean_flush() redundant. In other cases, the clean is
+		@ performed by set/way and R0/R1 are ignored.
+		@
+		mov	r0, #0
+		mov	r1, #0
+		bl	cache_clean_flush
+
 		get_inflated_image_size	r1, r2, r3
+		ldr	r2, =_kernel_bss_size
+		add	r1, r1, r2
 
-		mov	r0, r4			@ start of inflated image
-		add	r1, r1, r0		@ end of inflated image
-		bl	cache_clean_flush
+		mov	r0, r4			@ start of decompressed kernel
+		add	r1, r1, r0		@ end of kernel BSS
 		bl	cache_off
 
 #ifdef CONFIG_ARM_VIRT_EXT
@@ -1135,12 +1148,14 @@  proc_types:
  * reading the control register, but ARMv4 does.
  *
  * On exit,
- *  r0, r1, r2, r3, r9, r12 corrupted
+ *  r0, r1, r2, r3, r9, r10, r11, r12 corrupted
  * This routine must preserve:
  *  r4, r7, r8
  */
 		.align	5
 cache_off:	mov	r3, #12			@ cache_off function
+		mov	r10, r0
+		mov	r11, r1
 		b	call_cache_fn
 
 __armv4_mpu_cache_off:
@@ -1187,7 +1202,9 @@  __armv7_mmu_cache_off:
 		mcr	p15, 0, r0, c7, c5, 6	@ invalidate BTC
 		mcr	p15, 0, r0, c7, c10, 4	@ DSB
 		mcr	p15, 0, r0, c7, c5, 4	@ ISB
-		mov	pc, lr
+
+		mov	r0, r10
+		b	__armv7_mmu_cache_flush
 
 /*
  * Clean and flush the cache to maintain consistency.