[RFC,v2,8/8] arm64, kexec: enable MMU during kexec relocation

Message ID 20190731153857.4045-9-pasha.tatashin@soleen.com (mailing list archive)
State New, archived
Series: arm64: MMU enabled kexec relocation

Commit Message

Pasha Tatashin July 31, 2019, 3:38 p.m. UTC
Now that we have transitional page tables configured, temporarily enable
the MMU to allow faster relocation of segments to their final destination.

Performance data: for a moderate-size kernel + initramfs (25M), the
relocation took 0.382s; with the MMU enabled it now takes only 0.019s,
a 20x improvement.

The relocation time is proportional to the amount of data copied, so a
larger initramfs (for example 100M) could take over a second.
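
A rough extrapolation from the 25M measurement, assuming the copy rate
stays approximately constant with size:

    without MMU: 25M / 0.382s ~ 65 MB/s   -> 100M in ~1.5s
    with MMU:    25M / 0.019s ~ 1300 MB/s -> 100M in ~0.08s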

Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
---
 arch/arm64/kernel/relocate_kernel.S | 192 ++++++++++++++++++++++------
 1 file changed, 154 insertions(+), 38 deletions(-)

Comments

Mark Rutland July 31, 2019, 3:50 p.m. UTC | #1
On Wed, Jul 31, 2019 at 11:38:57AM -0400, Pavel Tatashin wrote:
> +/*
> + * The following code is adopted from "Bare-metal Boot Code for ARMv8-A
> + * Processors Version 1.0, 5.3.1 Cleaning and invalidating the caches".
> + * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dai0527a
> + */
> +.macro dcache_invalidate tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
> +	mov	\tmp0, #0x0			/* tmp0 = Cache level */
> +	msr	CSSELR_EL1, \tmp0		/* 0x0 for L1, 0x2 for L2 */
> +	mrs	\tmp4, CCSIDR_EL1		/* Read Cache Size ID */
> +	and	\tmp1, \tmp4, #0x7
> +	add	\tmp1, \tmp1, #0x4		/* tmp1 Cache Line Size */
> +	ldr	\tmp3, =0x7fff
> +	and	\tmp2, \tmp3, \tmp4, lsr #13	/* tmp2 Cache Set num - 1 */
> +	ldr	\tmp3, =0x3ff
> +	and	\tmp3, \tmp3, \tmp4, lsr #3	/* tmp3 Cache Assoc. num - 1 */
> +	clz	\tmp4, \tmp3			/* tmp4 way pos. in the CISW */
> +	mov	\tmp5, #0			/* tmp5 way counter way_loop */
> +1: /* way_loop */
> +	mov	\tmp6, #0			/* tmp6 set counter set_loop */
> +2: /* set_loop */
> +	lsl	\tmp7, \tmp5, \tmp4
> +	orr	\tmp7, \tmp0, \tmp7		/* Set way */
> +	lsl	\tmp8, \tmp6, \tmp1
> +	orr	\tmp7, \tmp7, \tmp8		/* Set set */
> +	dc	cisw, \tmp7			/* Clean & Inval. cache line */
> +	add	\tmp6, \tmp6, #1		/* Increment set counter */
> +	cmp	\tmp6, \tmp2			/* Last set reached yet? */
> +	ble	2b				/* If not, iterate set_loop, */
> +	add	\tmp5, \tmp5, #1		/* else, next way. */
> +	cmp	\tmp5, \tmp3			/* Last way reached yet? */
> +	ble	1b				/* If not, iterate way_loop. */
> +.endm
> +

For various reasons, one cannot safely use Set/Way operations in
portable code. They only make sense for low-level platform-specific
firmware performing power management operations.

If you need to perform D-cache maintenance, you must use the VA
operations to do so.

Thanks,
Mark.
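
For reference, a by-VA clean+invalidate over a [start, start + size)
range (a minimal sketch in the style of the "dc ivac" loop this patch
removes; the choice of x0/x1 for start/size and of the scratch registers
is illustrative only, not part of the patch):

	/* x0 = start of region, x1 = size of region */
	raw_dcache_line_size x2, x3	/* x2 = D-cache line size in bytes */
	add	x1, x0, x1		/* x1 = end of region */
	sub	x3, x2, #1
	bic	x0, x0, x3		/* align start down to a cache line */
1:	dc	civac, x0		/* clean & invalidate line by VA to PoC */
	add	x0, x0, x2		/* advance to the next line */
	cmp	x0, x1
	b.lo	1b			/* loop until the end of the region */
	dsb	sy			/* complete the maintenance ops */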
Pasha Tatashin July 31, 2019, 4:01 p.m. UTC | #2
> For various reasons, one cannot safely use Set/Way operations in
> portable code. They only make sense for low-level platform-specific
> firmware performing power management operations.
>
> If you need to perform D-cache maintenance, you must use the VA
> operations to do so.

Hi Mark,

I see, thank you for letting me know. I will do D-cache flushing by VA
in the next iteration. First, I need to root-cause and fix the bug
described in the cover letter.

Thank you,
Pasha
Patch

diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
index d352faf7cbe6..88fc69adb90d 100644
--- a/arch/arm64/kernel/relocate_kernel.S
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -4,6 +4,8 @@ 
  *
  * Copyright (C) Linaro.
  * Copyright (C) Huawei Futurewei Technologies.
+ * Copyright (c) 2019, Microsoft Corporation.
+ * Pavel Tatashin <patatash@linux.microsoft.com>
  */
 
 #include <linux/kexec.h>
@@ -13,6 +15,130 @@ 
 #include <asm/kexec.h>
 #include <asm/page.h>
 #include <asm/sysreg.h>
+#include <asm/kvm_arm.h>
+
+/*
+ * The following code is adopted from "Bare-metal Boot Code for ARMv8-A
+ * Processors Version 1.0, 5.3.1 Cleaning and invalidating the caches".
+ * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dai0527a
+ */
+.macro dcache_invalidate tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+	mov	\tmp0, #0x0			/* tmp0 = Cache level */
+	msr	CSSELR_EL1, \tmp0		/* 0x0 for L1, 0x2 for L2 */
+	mrs	\tmp4, CCSIDR_EL1		/* Read Cache Size ID */
+	and	\tmp1, \tmp4, #0x7
+	add	\tmp1, \tmp1, #0x4		/* tmp1 Cache Line Size */
+	ldr	\tmp3, =0x7fff
+	and	\tmp2, \tmp3, \tmp4, lsr #13	/* tmp2 Cache Set num - 1 */
+	ldr	\tmp3, =0x3ff
+	and	\tmp3, \tmp3, \tmp4, lsr #3	/* tmp3 Cache Assoc. num - 1 */
+	clz	\tmp4, \tmp3			/* tmp4 way pos. in the CISW */
+	mov	\tmp5, #0			/* tmp5 way counter way_loop */
+1: /* way_loop */
+	mov	\tmp6, #0			/* tmp6 set counter set_loop */
+2: /* set_loop */
+	lsl	\tmp7, \tmp5, \tmp4
+	orr	\tmp7, \tmp0, \tmp7		/* Set way */
+	lsl	\tmp8, \tmp6, \tmp1
+	orr	\tmp7, \tmp7, \tmp8		/* Set set */
+	dc	cisw, \tmp7			/* Clean & Inval. cache line */
+	add	\tmp6, \tmp6, #1		/* Increment set counter */
+	cmp	\tmp6, \tmp2			/* Last set reached yet? */
+	ble	2b				/* If not, iterate set_loop, */
+	add	\tmp5, \tmp5, #1		/* else, next way. */
+	cmp	\tmp5, \tmp3			/* Last way reached yet? */
+	ble	1b				/* If not, iterate way_loop. */
+.endm
+
+/*
+ * Invalidate the TLBs: if running at EL2, invalidate all TLB entries at EL1 &
+ * EL2; if running at EL1, invalidate all current-VMID TLB entries at EL1.
+ */
+.macro tlb_invalidate tmp
+	mrs	\tmp, CurrentEL
+	cmp	\tmp, #CurrentEL_EL2
+	isb
+	b.ne	1f
+	dsb	sy
+	tlbi	alle2
+	tlbi	alle1
+	dsb	ish
+	isb
+	b	2f
+1:
+	dsb	sy
+	tlbi	vmalle1
+	dsb	ish
+	isb
+2:
+.endm
+
+.macro turn_off_mmu_el	sctlr, tmp1, tmp2
+	mrs	\tmp1, \sctlr
+	ldr	\tmp2, =SCTLR_ELx_FLAGS
+	bic	\tmp1, \tmp1, \tmp2
+	pre_disable_mmu_workaround
+	msr	\sctlr, \tmp1
+	isb
+.endm
+
+.macro turn_off_mmu tmp1, tmp2
+	turn_off_mmu_el	sctlr_el1, \tmp1, \tmp2	/* Turn off MMU at EL1 */
+	mrs	\tmp1, CurrentEL
+	cmp	\tmp1, #CurrentEL_EL2
+	b.ne	1f
+	turn_off_mmu_el	sctlr_el2, \tmp1, \tmp2	/* Turn off MMU at EL2 */
+1:
+.endm
+
+/* Configure TCR_EL2 and MAIR_EL2 */
+.macro tcr_mair_mmu_el2 tmp1, tmp2, tmp3
+	mrs	\tmp1, tcr_el1
+	ldr	\tmp2, =TCR_EL2_MASK
+	and	\tmp1, \tmp1, \tmp2
+	mov	\tmp2, #TCR_EL2_RES1
+	orr	\tmp1, \tmp1, \tmp2
+	ldr	\tmp2, =TCR_T0SZ(VA_BITS)
+	orr	\tmp1, \tmp1, \tmp2
+	tcr_compute_pa_size \tmp1, #TCR_EL2_PS_SHIFT, \tmp2, \tmp3
+	msr	tcr_el2, \tmp1
+	mrs	\tmp1, mair_el1
+	msr	mair_el2, \tmp1
+.endm
+
+.macro turn_on_mmu tmp1, tmp2, tmp3
+	mrs	\tmp1, CurrentEL
+	cmp	\tmp1, #CurrentEL_EL2
+	b.ne	1f
+	tcr_mair_mmu_el2 \tmp1, \tmp2, \tmp3
+	ldr	\tmp1, =(SCTLR_EL2_RES1 | SCTLR_ELx_FLAGS | ENDIAN_SET_EL2)
+	msr	sctlr_el2, \tmp1
+	b	2f
+1:	mrs	\tmp1, sctlr_el1
+	ldr	\tmp2, =SCTLR_ELx_FLAGS
+	orr	\tmp1, \tmp1, \tmp2
+	msr	sctlr_el1, \tmp1
+2:	ic	iallu
+	dsb	nsh
+	isb
+.endm
+
+.macro set_ttbr_el ttbr_reg, trans_table
+	phys_to_ttbr \trans_table, \trans_table
+	msr	\ttbr_reg, \trans_table
+	isb
+.endm
+
+.macro set_ttbr trans_table, tmp
+	mrs	\tmp, CurrentEL
+	cmp	\tmp, #CurrentEL_EL2
+	b.ne	1f
+	set_ttbr_el	ttbr0_el2 \trans_table
+	b	2f
+1:
+	set_ttbr_el	ttbr0_el1 \trans_table
+2:
+.endm
 
 /*
  * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it.
@@ -24,59 +150,49 @@ 
  * symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end.  The
  * machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec
  * safe memory that has been set up to be preserved during the copy operation.
+ *
+ * This function temporarily enables the MMU if kernel relocation is needed.
+ * This is done for performance reasons: with the MMU enabled, arm64 is much
+ * quicker at copying pages because caching is enabled as well.
  */
 ENTRY(arm64_relocate_new_kernel)
-	/* Clear the sctlr_el2 flags. */
-	mrs	x2, CurrentEL
-	cmp	x2, #CurrentEL_EL2
-	b.ne	1f
-	mrs	x2, sctlr_el2
-	ldr	x1, =SCTLR_ELx_FLAGS
-	bic	x2, x2, x1
-	pre_disable_mmu_workaround
-	msr	sctlr_el2, x2
-	isb
-1:	/* Check if the new image needs relocation. */
-	ldr	x16, [x0, #KRELOC_HEAD]		/* x16 = kimage_head */
-	tbnz	x16, IND_DONE_BIT, .Ldone
-	raw_dcache_line_size x15, x1		/* x15 = dcache line size */
+	/* MMU on EL2 might still be on, turn it off for now */
+	turn_off_mmu	x1, x2
+	dcache_invalidate x1, x2, x3, x4, x5, x6, x7, x8, x9
+	tlb_invalidate x1
+
+	/* Check if the new image needs relocation. */
+	ldr	x12, [x0, #KRELOC_HEAD]		/* x12 = kimage_head */
+	tbnz	x12, IND_DONE_BIT, .Ldone
+	ldr	x1, [x0, #KRELOC_TRANS_TABLE]
+	set_ttbr x1, x2
+	turn_on_mmu x1, x2, x3
 .Lloop:
-	and	x12, x16, PAGE_MASK		/* x12 = addr */
+	and	x2, x12, PAGE_MASK		/* x2 = addr */
 	/* Test the entry flags. */
 .Ltest_source:
-	tbz	x16, IND_SOURCE_BIT, .Ltest_indirection
-
-	/* Invalidate dest page to PoC. */
-	mov     x2, x13
-	add     x20, x2, #PAGE_SIZE
-	sub     x1, x15, #1
-	bic     x2, x2, x1
-2:	dc      ivac, x2
-	add     x2, x2, x15
-	cmp     x2, x20
-	b.lo    2b
-	dsb     sy
-
-	copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8
+	tbz	x12, IND_SOURCE_BIT, .Ltest_indirection
+	copy_page x1, x2, x3, x4, x5, x6, x7, x8, x9, x10
 	b	.Lnext
 .Ltest_indirection:
-	tbz	x16, IND_INDIRECTION_BIT, .Ltest_destination
-	mov	x14, x12			/* ptr = addr */
+	tbz	x12, IND_INDIRECTION_BIT, .Ltest_destination
+	mov	x11, x2				/* x11 = ptr */
 	b	.Lnext
 .Ltest_destination:
-	tbz	x16, IND_DESTINATION_BIT, .Lnext
-	mov	x13, x12			/* dest = addr */
+	tbz	x12, IND_DESTINATION_BIT, .Lnext
+	mov	x1, x2				/* x1 = dest */
 .Lnext:
-	ldr	x16, [x14], #8			/* entry = *ptr++ */
-	tbz	x16, IND_DONE_BIT, .Lloop	/* while (!(entry & DONE)) */
-.Ldone:
+	ldr	x12, [x11], #8			/* x12 = entry = *ptr++ */
+	tbz	x12, IND_DONE_BIT, .Lloop	/* while (!(entry & DONE)) */
 	/* wait for writes from copy_page to finish */
 	dsb	nsh
 	ic	iallu
 	dsb	nsh
 	isb
-
-	/* Start new image. */
+	turn_off_mmu	x1, x2
+	dcache_invalidate x1, x2, x3, x4, x5, x6, x7, x8, x9
+	tlb_invalidate x1
+.Ldone:	/* Start new image. */
 	ldr	x4, [x0, #KRELOC_ENTRY_ADDR]	/* x4 = kimage_start */
 	ldr	x3, [x0, #KRELOC_KERN_ARG3]
 	ldr	x2, [x0, #KRELOC_KERN_ARG2]