arm64: mm: install KPTI nG mappings with MMU enabled

Message ID 20220412092928.411581-1-ardb@kernel.org (mailing list archive)
State New, archived
Series arm64: mm: install KPTI nG mappings with MMU enabled

Commit Message

Ard Biesheuvel April 12, 2022, 9:29 a.m. UTC
In cases where we unmap the kernel while running in user space, we rely
on ASIDs to distinguish the minimal trampoline from the full kernel
mapping, and this means we must use non-global attributes for those
mappings, to ensure they are scoped by ASID and will not hit in the TLB
inadvertently.

We only do this when needed, as non-global mappings are generally more
costly in terms of TLB pressure. So we boot without these non-global
attributes, and apply them to all existing kernel mappings once all CPUs
are up and we know whether or not they are needed. At this point,
we cannot simply unmap and remap the entire address space, so we have to
update all existing block and page descriptors in place.

Currently, we go to great lengths to perform these updates with the MMU
and caches off, to avoid violating the break-before-make (BBM) rules
imposed by the architecture. Since we make changes to page tables that
are not covered by the ID map, we gain access to those descriptors by
disabling translations altogether. This means that the stores to memory
are issued with device attributes, and require extra care in terms of
coherency, which is costly. We also rely on the ID map to access a
shared flag, which requires the ID map to be executable and writable at
the same time, which is another thing we'd prefer to avoid.

So let's switch to an approach where we replace the kernel mapping with
a minimal mapping of a few pages that can be used for the shared flag,
as well as a minimal, ad-hoc fixmap that we can use to map each page
table in turn as we traverse the hierarchy. This requires one PTE per
level, plus one page's worth of VA space per level in the temporary
mapping.
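
As an illustration, for a 4-level configuration the temporary mapping
ends up laid out as follows (derived from the code below; the fixmap
slot for each level sits one page above the previous one):

    KPTI_NG_TEMP_VA + 0 * PAGE_SIZE    shared flag page
    KPTI_NG_TEMP_VA + 1 * PAGE_SIZE    level 3 table covering this region
    KPTI_NG_TEMP_VA + 2 * PAGE_SIZE    fixmap slot for the PGD level
    KPTI_NG_TEMP_VA + 3 * PAGE_SIZE    fixmap slot for the PUD level
    KPTI_NG_TEMP_VA + 4 * PAGE_SIZE    fixmap slot for the PMD level
    KPTI_NG_TEMP_VA + 5 * PAGE_SIZE    fixmap slot for the PTE level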

Note that table descriptors do not have a non-global attribute, so we
now avoid setting this bit for them as well.
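
All CPUs are caught in stop_machine(), and the first one to arrive sets
up the temporary page tables while the others wait. This boils down to
the pattern below (a simplified C sketch of the logic added to
kpti_install_ng_mappings(); kpti_rendezvous() is a hypothetical wrapper
used only for illustration):

#include <linux/atomic.h>
#include <linux/processor.h>

static atomic_t flag = ATOMIC_INIT(0);

/* Called on every CPU from stop_machine() context. */
static void kpti_rendezvous(int num_cpus)
{
	if (atomic_inc_return(&flag) == 1) {
		/* First CPU to arrive: set up the temporary page tables. */

		/* Wait for all other CPUs to arrive ... */
		while (atomic_read(&flag) < num_cpus)
			cpu_relax();

		/* ... then clear the flag to release everyone at once. */
		atomic_set_release(&flag, 0);
	} else {
		/* Late arrivals spin until the first CPU drops the flag. */
		while (atomic_read(&flag))
			cpu_relax();
	}
}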

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/include/asm/mmu.h   |  4 ++
 arch/arm64/kernel/cpufeature.c | 51 ++++++++++++++++--
 arch/arm64/mm/mmu.c            |  8 ++-
 arch/arm64/mm/proc.S           | 98 +++++++++++++++++++---------------
 4 files changed, 109 insertions(+), 52 deletions(-)

Comments

Ard Biesheuvel April 12, 2022, 10:07 a.m. UTC | #1
On Tue, 12 Apr 2022 at 11:29, Ard Biesheuvel <ardb@kernel.org> wrote:
...
> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> index abc3696bd601..33b1517f2e37 100644
> --- a/arch/arm64/mm/proc.S
> +++ b/arch/arm64/mm/proc.S
> @@ -14,6 +14,7 @@
>  #include <asm/asm-offsets.h>
>  #include <asm/asm_pointer_auth.h>
>  #include <asm/hwcap.h>
> +#include <asm/kernel-pgtable.h>
>  #include <asm/pgtable-hwdef.h>
>  #include <asm/cpufeature.h>
>  #include <asm/alternative.h>
> @@ -167,8 +168,7 @@ SYM_FUNC_END(cpu_do_resume)
>
>         .pushsection ".idmap.text", "awx"
>
> -.macro __idmap_cpu_set_reserved_ttbr1, tmp1, tmp2
> -       adrp    \tmp1, reserved_pg_dir
> +.macro __idmap_cpu_set_ttbr1, tmp1, tmp2
>         phys_to_ttbr \tmp2, \tmp1
>         offset_ttbr1 \tmp2, \tmp1
>         msr     ttbr1_el1, \tmp2
> @@ -187,7 +187,8 @@ SYM_FUNC_END(cpu_do_resume)
>  SYM_FUNC_START(idmap_cpu_replace_ttbr1)
>         save_and_disable_daif flags=x2
>
> -       __idmap_cpu_set_reserved_ttbr1 x1, x3
> +       adrp    x1, reserved_pg_dir
> +       __idmap_cpu_set_ttbr1 x1, x3
>
>         offset_ttbr1 x0, x3
>         msr     ttbr1_el1, x0
> @@ -200,36 +201,52 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
>         .popsection
>
>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> +
> +#define KPTI_NG_PTE_FLAGS      (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
> +
>         .pushsection ".idmap.text", "awx"
>
>         .macro  __idmap_kpti_get_pgtable_ent, type
> -       dc      cvac, cur_\()\type\()p          // Ensure any existing dirty
> -       dmb     sy                              // lines are written back before
> -       ldr     \type, [cur_\()\type\()p]       // loading the entry
> -       tbz     \type, #0, skip_\()\type        // Skip invalid and
> -       tbnz    \type, #11, skip_\()\type       // non-global entries
> +       ldr     \type, [cur_\()\type\()p]       // Load the entry
> +       tbz     \type, #0, next_\()\type        // Skip invalid and
> +       tbnz    \type, #11, next_\()\type       // non-global entries
>         .endm
>
>         .macro __idmap_kpti_put_pgtable_ent_ng, type
>         orr     \type, \type, #PTE_NG           // Same bit for blocks and pages
> -       str     \type, [cur_\()\type\()p]       // Update the entry and ensure
> -       dmb     sy                              // that it is visible to all
> -       dc      civac, cur_\()\type\()p         // CPUs.
> +       str     \type, [cur_\()\type\()p]       // Update the entry
> +       .endm
> +
> +       /*
> +        * Dereference the current table entry and map it into the temporary
> +        * page table slot associated with the current level. The ad-hoc fixmap
> +        * is a set of PTEs that is located above the PTEs that cover the level 3
> +        * page table and the scratch page that precedes it.
> +        */
> +       .macro  __idmap_kpti_map_pgtable, type, level
> +       phys_to_pte cur_\type\()p, cur_\type\()p
> +       orr     cur_\type\()p, cur_\type\()p, pte_flags
> +       str     cur_\type\()p, [temp_pte, #8 * (\level + 2)]

Just realised that this needs a break before make, and that the first
DSB probably needs ISHST scope as well.

> +       add     cur_\type\()p, flag_ptr, #PAGE_SIZE * (\level + 2)
> +       dsb     nshst
> +       tlbi    vaae1is, cur_\type\()p
> +       dsb     nsh
>         .endm
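
I.e. the fixmap slot needs to be cleared and invalidated before the new
entry is installed, something along the lines of the below (untested
sketch, reusing the register aliases from the patch):

	.macro	__idmap_kpti_map_pgtable, type, level
	// Break: clear the slot and invalidate any stale TLB entry for it
	str	xzr, [temp_pte, #8 * (\level + 2)]
	dsb	ishst
	add	\type, flag_ptr, #PAGE_SIZE * (\level + 2)
	lsr	\type, \type, #12		// TLBI VAAE1IS takes VA[55:12]
	tlbi	vaae1is, \type
	dsb	ish
	isb

	// Make: install the new entry and return its VA in cur_\type\()p
	phys_to_pte \type, cur_\type\()p
	orr	\type, \type, pte_flags
	str	\type, [temp_pte, #8 * (\level + 2)]
	add	cur_\type\()p, flag_ptr, #PAGE_SIZE * (\level + 2)
	dsb	nshst
	.endm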
>

Patch

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 48f8466a4be9..b896f0ac4985 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -68,6 +68,10 @@  extern void init_mem_pgprot(void);
 extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       unsigned long virt, phys_addr_t size,
 			       pgprot_t prot, bool page_mappings_only);
+extern void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+				 unsigned long virt, phys_addr_t size,
+				 pgprot_t prot,
+				 phys_addr_t (*pgtable_alloc)(int), int flags);
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
 extern void mark_linear_text_alias_ro(void);
 extern bool kaslr_requires_kpti(void);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 1661766f50f3..0489d767e7b7 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1596,14 +1596,31 @@  static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
 }
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define KPTI_NG_TEMP_VA		(-(1UL << PMD_SHIFT))
+
+static phys_addr_t kpti_ng_temp_alloc;
+
+static phys_addr_t kpti_ng_pgd_alloc(int shift)
+{
+	kpti_ng_temp_alloc -= PAGE_SIZE;
+	return kpti_ng_temp_alloc;
+}
+
 static void __nocfi
 kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
 {
-	typedef void (kpti_remap_fn)(int, int, phys_addr_t);
+	static atomic_t flag = ATOMIC_INIT(0);
+	static pgd_t *kpti_ng_temp_pgd;
+
+	typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
 	extern kpti_remap_fn idmap_kpti_install_ng_mappings;
 	kpti_remap_fn *remap_fn;
 
-	int cpu = smp_processor_id();
+	int levels = CONFIG_PGTABLE_LEVELS;
+	int order = order_base_2(levels + 1);
+	int num_cpus = num_online_cpus();
+	int primary = 0;
+	u64 alloc;
 
 	if (__this_cpu_read(this_cpu_vector) == vectors) {
 		const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
@@ -1619,14 +1636,40 @@  kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
 	if (arm64_use_ng_mappings)
 		return;
 
+	// First CPU to arrive here gets the job
+	if (atomic_inc_return(&flag) == 1) {
+		alloc =  __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
+		kpti_ng_temp_pgd = (pgd_t *)(alloc + levels * PAGE_SIZE);
+		kpti_ng_temp_alloc = __pa(kpti_ng_temp_pgd);
+		primary = 1;
+
+		// Map the first two pages into the temporary page tables so we
+		// have a place to store a shared variable, and so that we can
+		// manipulate the PTE level while the mapping is active
+		__create_pgd_mapping(kpti_ng_temp_pgd, __pa(alloc),
+				     KPTI_NG_TEMP_VA, 2 * PAGE_SIZE,
+				     PAGE_KERNEL, kpti_ng_pgd_alloc, 0);
+
+		// Wait for other CPUs to catch up
+		while (atomic_read(&flag) < num_cpus)
+			cpu_relax();
+
+		// Clear flag so all CPUs can proceed
+		atomic_set_release(&flag, 0);
+	} else {
+		while (atomic_read(&flag))
+			cpu_relax();
+	}
 	remap_fn = (void *)__pa_symbol(function_nocfi(idmap_kpti_install_ng_mappings));
 
 	cpu_install_idmap();
-	remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir));
+	remap_fn(!primary, num_cpus - 1, __pa(kpti_ng_temp_pgd), KPTI_NG_TEMP_VA);
 	cpu_uninstall_idmap();
 
-	if (!cpu)
+	if (primary) {
+		free_pages(alloc, order);
 		arm64_use_ng_mappings = true;
+	}
 }
 #else
 static void
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ef1f01da387d..ee4661c0352f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -372,11 +372,9 @@  static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 		mutex_unlock(&fixmap_lock);
 }
 
-static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
-				 unsigned long virt, phys_addr_t size,
-				 pgprot_t prot,
-				 phys_addr_t (*pgtable_alloc)(int),
-				 int flags)
+void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
+			  phys_addr_t size, pgprot_t prot,
+			  phys_addr_t (*pgtable_alloc)(int), int flags)
 {
 	unsigned long addr, end, next;
 	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index abc3696bd601..33b1517f2e37 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -14,6 +14,7 @@ 
 #include <asm/asm-offsets.h>
 #include <asm/asm_pointer_auth.h>
 #include <asm/hwcap.h>
+#include <asm/kernel-pgtable.h>
 #include <asm/pgtable-hwdef.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative.h>
@@ -167,8 +168,7 @@  SYM_FUNC_END(cpu_do_resume)
 
 	.pushsection ".idmap.text", "awx"
 
-.macro	__idmap_cpu_set_reserved_ttbr1, tmp1, tmp2
-	adrp	\tmp1, reserved_pg_dir
+.macro	__idmap_cpu_set_ttbr1, tmp1, tmp2
 	phys_to_ttbr \tmp2, \tmp1
 	offset_ttbr1 \tmp2, \tmp1
 	msr	ttbr1_el1, \tmp2
@@ -187,7 +187,8 @@  SYM_FUNC_END(cpu_do_resume)
 SYM_FUNC_START(idmap_cpu_replace_ttbr1)
 	save_and_disable_daif flags=x2
 
-	__idmap_cpu_set_reserved_ttbr1 x1, x3
+	adrp	x1, reserved_pg_dir
+	__idmap_cpu_set_ttbr1 x1, x3
 
 	offset_ttbr1 x0, x3
 	msr	ttbr1_el1, x0
@@ -200,36 +201,52 @@  SYM_FUNC_END(idmap_cpu_replace_ttbr1)
 	.popsection
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+
+#define KPTI_NG_PTE_FLAGS	(PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
+
 	.pushsection ".idmap.text", "awx"
 
 	.macro	__idmap_kpti_get_pgtable_ent, type
-	dc	cvac, cur_\()\type\()p		// Ensure any existing dirty
-	dmb	sy				// lines are written back before
-	ldr	\type, [cur_\()\type\()p]	// loading the entry
-	tbz	\type, #0, skip_\()\type	// Skip invalid and
-	tbnz	\type, #11, skip_\()\type	// non-global entries
+	ldr	\type, [cur_\()\type\()p]	// Load the entry
+	tbz	\type, #0, next_\()\type	// Skip invalid and
+	tbnz	\type, #11, next_\()\type	// non-global entries
 	.endm
 
 	.macro __idmap_kpti_put_pgtable_ent_ng, type
 	orr	\type, \type, #PTE_NG		// Same bit for blocks and pages
-	str	\type, [cur_\()\type\()p]	// Update the entry and ensure
-	dmb	sy				// that it is visible to all
-	dc	civac, cur_\()\type\()p		// CPUs.
+	str	\type, [cur_\()\type\()p]	// Update the entry
+	.endm
+
+	/*
+	 * Dereference the current table entry and map it into the temporary
+	 * page table slot associated with the current level. The ad-hoc fixmap
+	 * is a set of PTEs that is located above the PTEs that cover the level 3
+	 * page table and the scratch page that precedes it.
+	 */
+	.macro	__idmap_kpti_map_pgtable, type, level
+	phys_to_pte cur_\type\()p, cur_\type\()p
+	orr	cur_\type\()p, cur_\type\()p, pte_flags
+	str	cur_\type\()p, [temp_pte, #8 * (\level + 2)]
+	add	cur_\type\()p, flag_ptr, #PAGE_SIZE * (\level + 2)
+	dsb	nshst
+	tlbi	vaae1is, cur_\type\()p
+	dsb	nsh
 	.endm
 
 /*
- * void __kpti_install_ng_mappings(int cpu, int num_cpus, phys_addr_t swapper)
+ * void __kpti_install_ng_mappings(int cpu, int num_secondaries, phys_addr_t temp_pgd,
+ *				   unsigned long kpti_ng_temp_va)
  *
  * Called exactly once from stop_machine context by each CPU found during boot.
  */
-__idmap_kpti_flag:
-	.long	1
 SYM_FUNC_START(idmap_kpti_install_ng_mappings)
-	cpu		.req	w0
+	cpu		.req	w0	// at entry
+	pte_flags	.req	x0
 	num_cpus	.req	w1
-	swapper_pa	.req	x2
-	swapper_ttb	.req	x3
-	flag_ptr	.req	x4
+	temp_pgd_phys	.req	x2	// at entry
+	temp_pte	.req	x2
+	flag_ptr	.req	x3
+	swapper_ttb	.req	x4
 	cur_pgdp	.req	x5
 	end_pgdp	.req	x6
 	pgd		.req	x7
@@ -244,11 +261,15 @@  SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 	pte		.req	x16
 
 	mrs	swapper_ttb, ttbr1_el1
-	restore_ttbr1	swapper_ttb
-	adr	flag_ptr, __idmap_kpti_flag
+
+	/* Uninstall swapper before surgery begins */
+	__idmap_cpu_set_ttbr1 temp_pgd_phys, x17
 
 	cbnz	cpu, __idmap_kpti_secondary
 
+	/* Advance temp_pte to the fixmap page */
+	add	temp_pte, flag_ptr, #PAGE_SIZE
+
 	/* We're the boot CPU. Wait for the others to catch up */
 	sevl
 1:	wfe
@@ -256,22 +277,17 @@  SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 	eor	w17, w17, num_cpus
 	cbnz	w17, 1b
 
-	/* We need to walk swapper, so turn off the MMU. */
-	pre_disable_mmu_workaround
-	mrs	x17, sctlr_el1
-	bic	x17, x17, #SCTLR_ELx_M
-	msr	sctlr_el1, x17
-	isb
+	mov	pte_flags, #KPTI_NG_PTE_FLAGS
 
 	/* Everybody is enjoying the idmap, so we can rewrite swapper. */
 	/* PGD */
-	mov	cur_pgdp, swapper_pa
+	adrp	cur_pgdp, swapper_pg_dir
+	__idmap_kpti_map_pgtable pgd, 0
 	add	end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8)
 do_pgd:	__idmap_kpti_get_pgtable_ent	pgd
 	tbnz	pgd, #1, walk_puds
-next_pgd:
 	__idmap_kpti_put_pgtable_ent_ng	pgd
-skip_pgd:
+next_pgd:
 	add	cur_pgdp, cur_pgdp, #8
 	cmp	cur_pgdp, end_pgdp
 	b.ne	do_pgd
@@ -282,25 +298,24 @@  skip_pgd:
 	dsb	ish
 	isb
 
-	/* We're done: fire up the MMU again */
-	mrs	x17, sctlr_el1
-	orr	x17, x17, #SCTLR_ELx_M
-	set_sctlr_el1	x17
-
 	/* Set the flag to zero to indicate that we're all done */
 	str	wzr, [flag_ptr]
+
+	/* We're done: fire up the MMU again */
+	msr	ttbr1_el1, swapper_ttb
+	isb
 	ret
 
 	/* PUD */
 walk_puds:
 	.if CONFIG_PGTABLE_LEVELS > 3
 	pte_to_phys	cur_pudp, pgd
+	__idmap_kpti_map_pgtable pud, 1
 	add	end_pudp, cur_pudp, #(PTRS_PER_PUD * 8)
 do_pud:	__idmap_kpti_get_pgtable_ent	pud
 	tbnz	pud, #1, walk_pmds
-next_pud:
 	__idmap_kpti_put_pgtable_ent_ng	pud
-skip_pud:
+next_pud:
 	add	cur_pudp, cur_pudp, 8
 	cmp	cur_pudp, end_pudp
 	b.ne	do_pud
@@ -316,12 +331,12 @@  next_pud:
 walk_pmds:
 	.if CONFIG_PGTABLE_LEVELS > 2
 	pte_to_phys	cur_pmdp, pud
+	__idmap_kpti_map_pgtable pmd, 2
 	add	end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8)
 do_pmd:	__idmap_kpti_get_pgtable_ent	pmd
 	tbnz	pmd, #1, walk_ptes
-next_pmd:
 	__idmap_kpti_put_pgtable_ent_ng	pmd
-skip_pmd:
+next_pmd:
 	add	cur_pmdp, cur_pmdp, #8
 	cmp	cur_pmdp, end_pmdp
 	b.ne	do_pmd
@@ -336,10 +351,11 @@  next_pmd:
 	/* PTE */
 walk_ptes:
 	pte_to_phys	cur_ptep, pmd
+	__idmap_kpti_map_pgtable pte, 3
 	add	end_ptep, cur_ptep, #(PTRS_PER_PTE * 8)
 do_pte:	__idmap_kpti_get_pgtable_ent	pte
 	__idmap_kpti_put_pgtable_ent_ng	pte
-skip_pte:
+next_pte:
 	add	cur_ptep, cur_ptep, #8
 	cmp	cur_ptep, end_ptep
 	b.ne	do_pte
@@ -363,9 +379,6 @@  skip_pte:
 
 	/* Secondary CPUs end up here */
 __idmap_kpti_secondary:
-	/* Uninstall swapper before surgery begins */
-	__idmap_cpu_set_reserved_ttbr1 x16, x17
-
 	/* Increment the flag to let the boot CPU know we're ready */
 1:	ldxr	w16, [flag_ptr]
 	add	w16, w16, #1
@@ -379,7 +392,6 @@  __idmap_kpti_secondary:
 	cbnz	w16, 1b
 
 	/* All done, act like nothing happened */
-	offset_ttbr1 swapper_ttb, x16
 	msr	ttbr1_el1, swapper_ttb
 	isb
 	ret