--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -68,6 +68,10 @@ extern void init_mem_pgprot(void);
extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot, bool page_mappings_only);
+extern void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+ unsigned long virt, phys_addr_t size,
+ pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int), int flags);
extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
extern void mark_linear_text_alias_ro(void);
extern bool kaslr_requires_kpti(void);
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1596,14 +1596,31 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
}
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT))
+
+static phys_addr_t kpti_ng_temp_alloc;
+
+static phys_addr_t kpti_ng_pgd_alloc(int shift)
+{
+ kpti_ng_temp_alloc -= PAGE_SIZE;
+ return kpti_ng_temp_alloc;
+}
+
static void __nocfi
kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
{
- typedef void (kpti_remap_fn)(int, int, phys_addr_t);
+ static atomic_t flag = ATOMIC_INIT(0);
+ static pgd_t *kpti_ng_temp_pgd;
+ static u64 alloc;
+
+ typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
extern kpti_remap_fn idmap_kpti_install_ng_mappings;
kpti_remap_fn *remap_fn;
- int cpu = smp_processor_id();
+ int levels = CONFIG_PGTABLE_LEVELS;
+ int order = order_base_2(levels + 1);
+ int num_cpus = num_online_cpus();
+ int primary = 0;
if (__this_cpu_read(this_cpu_vector) == vectors) {
const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
@@ -1619,14 +1636,57 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
if (arm64_use_ng_mappings)
return;
+ // First CPU to arrive here gets the job
+ if (atomic_inc_return(&flag) == 1) {
+ alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
+ kpti_ng_temp_pgd = (pgd_t *)(alloc + levels * PAGE_SIZE);
+ kpti_ng_temp_alloc = __pa(kpti_ng_temp_pgd);
+ primary = 1;
+
+ //
+ // Create a minimal page table hierarchy that permits us to
+ // store a shared variable that secondaries will poll, and to
+ // map the swapper page tables temporarily as we traverse them.
+ //
+ // The physical pages are laid out as follows:
+ //
+ // +---------+--------+-/-------+-/------- +-\\--------+
+ // :  data   :  PTE[] : | PMD[] :  | PUD[] : || PGD[]  :
+ // +---------+--------+-\-------+-\------- +-//--------+
+ //      ^         ^
+ // The first two pages are mapped consecutively into this
+ // hierarchy at a PMD_SHIFT aligned virtual address, so that we
+ // have a place to store the shared variable, and so that we
+ // can manipulate the PTE level entries while the mapping is
+ // active. The first two entries cover the data page and the
+ // PTE[] page itself, the remaining entries are free to be used
+ // as an ad-hoc fixmap.
+ //
+ __create_pgd_mapping(kpti_ng_temp_pgd, __pa(alloc),
+ KPTI_NG_TEMP_VA, 2 * PAGE_SIZE,
+ PAGE_KERNEL, kpti_ng_pgd_alloc, 0);
+
+ // Wait for other CPUs to catch up
+ while (atomic_read(&flag) < num_cpus)
+ cpu_relax();
+
+ // Clear flag so all CPUs can proceed
+ atomic_set_release(&flag, 0);
+ } else {
+ while (atomic_read(&flag))
+ cpu_relax();
+ }
remap_fn = (void *)__pa_symbol(function_nocfi(idmap_kpti_install_ng_mappings));
cpu_install_idmap();
- remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir));
+ remap_fn(!primary, num_cpus - 1, __pa(kpti_ng_temp_pgd), KPTI_NG_TEMP_VA);
cpu_uninstall_idmap();
- if (!cpu)
+ // Last CPU to leave frees the pages
+ if (atomic_inc_return(&flag) == num_cpus) {
+ free_pages(alloc, order);
arm64_use_ng_mappings = true;
+ }
}
#else
static void
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -360,11 +360,9 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
mutex_unlock(&fixmap_lock);
}
-static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
- unsigned long virt, phys_addr_t size,
- pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
- int flags)
+void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
+ phys_addr_t size, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int), int flags)
{
unsigned long addr, end, next;
pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -14,6 +14,7 @@
#include <asm/asm-offsets.h>
#include <asm/asm_pointer_auth.h>
#include <asm/hwcap.h>
+#include <asm/kernel-pgtable.h>
#include <asm/pgtable-hwdef.h>
#include <asm/cpufeature.h>
#include <asm/alternative.h>
@@ -200,36 +201,56 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
.popsection
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+
+#define KPTI_NG_PTE_FLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
+
.pushsection ".idmap.text", "awx"
.macro __idmap_kpti_get_pgtable_ent, type
- dc cvac, cur_\()\type\()p // Ensure any existing dirty
- dmb sy // lines are written back before
- ldr \type, [cur_\()\type\()p] // loading the entry
- tbz \type, #0, skip_\()\type // Skip invalid and
- tbnz \type, #11, skip_\()\type // non-global entries
+ ldr \type, [cur_\()\type\()p], #8 // Load the entry
.endm
.macro __idmap_kpti_put_pgtable_ent_ng, type
- orr \type, \type, #PTE_NG // Same bit for blocks and pages
- str \type, [cur_\()\type\()p] // Update the entry and ensure
- dmb sy // that it is visible to all
- dc civac, cur_\()\type\()p // CPUs.
+ and valid, \type, #1
+ orr \type, \type, valid, lsl #11 // nG |= valid
+ str \type, [cur_\()\type\()p, #-8] // Update the entry
+ .endm
+
+ /*
+ * Dereference the current table entry and map it into the temporary
+ * page table slot associated with the current level. The ad-hoc fixmap
+ * is a set of PTEs that are located above the PTEs that cover the
+ * level 3 page table and the scratch page that precedes it.
+ */
+ .macro __idmap_kpti_map_pgtable, type, level
+ str xzr, [temp_pte, #8 * (\level + 2)] // break before make
+ dsb ishst
+ add pte, flag_ptr, #PAGE_SIZE * (\level + 2)
+ lsr pte, pte, #12
+ tlbi vaae1, pte
+ dsb nsh
+
+ phys_to_pte pte, cur_\type\()p
+ add cur_\type\()p, flag_ptr, #PAGE_SIZE * (\level + 2)
+ orr pte, pte, pte_flags
+ str pte, [temp_pte, #8 * (\level + 2)]
+ dsb ishst
.endm
/*
- * void __kpti_install_ng_mappings(int cpu, int num_cpus, phys_addr_t swapper)
+ * void __kpti_install_ng_mappings(int cpu, int num_secondaries, phys_addr_t temp_pgd,
+ * unsigned long kpti_ng_temp_va)
*
* Called exactly once from stop_machine context by each CPU found during boot.
*/
-__idmap_kpti_flag:
- .long 1
SYM_FUNC_START(idmap_kpti_install_ng_mappings)
- cpu .req w0
+ cpu .req w0 // at entry
+ pte_flags .req x0
num_cpus .req w1
- swapper_pa .req x2
- swapper_ttb .req x3
- flag_ptr .req x4
+ temp_pgd_phys .req x2 // at entry
+ temp_pte .req x2
+ flag_ptr .req x3
+ swapper_ttb .req x4
cur_pgdp .req x5
end_pgdp .req x6
pgd .req x7
@@ -242,13 +263,21 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
cur_ptep .req x14
end_ptep .req x15
pte .req x16
+ valid .req x17
mrs swapper_ttb, ttbr1_el1
- restore_ttbr1 swapper_ttb
- adr flag_ptr, __idmap_kpti_flag
+
+ /* Uninstall swapper before surgery begins */
+ __idmap_cpu_set_reserved_ttbr1 x8, x9
+ offset_ttbr1 temp_pgd_phys, x8
+ msr ttbr1_el1, temp_pgd_phys
+ isb
cbnz cpu, __idmap_kpti_secondary
+ /* Advance temp_pte to the fixmap page */
+ add temp_pte, flag_ptr, #PAGE_SIZE
+
/* We're the boot CPU. Wait for the others to catch up */
sevl
1: wfe
@@ -256,52 +285,40 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
eor w17, w17, num_cpus
cbnz w17, 1b
- /* We need to walk swapper, so turn off the MMU. */
- pre_disable_mmu_workaround
- mrs x17, sctlr_el1
- bic x17, x17, #SCTLR_ELx_M
- msr sctlr_el1, x17
- isb
+ mov pte_flags, #KPTI_NG_PTE_FLAGS
/* Everybody is enjoying the idmap, so we can rewrite swapper. */
/* PGD */
- mov cur_pgdp, swapper_pa
+ adrp cur_pgdp, swapper_pg_dir
+ __idmap_kpti_map_pgtable pgd, 0
add end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8)
do_pgd: __idmap_kpti_get_pgtable_ent pgd
tbnz pgd, #1, walk_puds
-next_pgd:
__idmap_kpti_put_pgtable_ent_ng pgd
-skip_pgd:
- add cur_pgdp, cur_pgdp, #8
+next_pgd:
cmp cur_pgdp, end_pgdp
b.ne do_pgd
- /* Publish the updated tables and nuke all the TLBs */
- dsb sy
- tlbi vmalle1is
- dsb ish
- isb
-
- /* We're done: fire up the MMU again */
- mrs x17, sctlr_el1
- orr x17, x17, #SCTLR_ELx_M
- set_sctlr_el1 x17
-
/* Set the flag to zero to indicate that we're all done */
str wzr, [flag_ptr]
+ dmb ishst
+
+ /* We're done: fire up swapper again */
+ __idmap_cpu_set_reserved_ttbr1 x8, x9
+ msr ttbr1_el1, swapper_ttb
+ isb
ret
/* PUD */
walk_puds:
.if CONFIG_PGTABLE_LEVELS > 3
pte_to_phys cur_pudp, pgd
+ __idmap_kpti_map_pgtable pud, 1
add end_pudp, cur_pudp, #(PTRS_PER_PUD * 8)
do_pud: __idmap_kpti_get_pgtable_ent pud
tbnz pud, #1, walk_pmds
-next_pud:
__idmap_kpti_put_pgtable_ent_ng pud
-skip_pud:
- add cur_pudp, cur_pudp, 8
+next_pud:
cmp cur_pudp, end_pudp
b.ne do_pud
b next_pgd
@@ -316,13 +333,12 @@ next_pud:
walk_pmds:
.if CONFIG_PGTABLE_LEVELS > 2
pte_to_phys cur_pmdp, pud
+ __idmap_kpti_map_pgtable pmd, 2
add end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8)
do_pmd: __idmap_kpti_get_pgtable_ent pmd
tbnz pmd, #1, walk_ptes
-next_pmd:
__idmap_kpti_put_pgtable_ent_ng pmd
-skip_pmd:
- add cur_pmdp, cur_pmdp, #8
+next_pmd:
cmp cur_pmdp, end_pmdp
b.ne do_pmd
b next_pud
@@ -336,18 +352,19 @@ next_pmd:
/* PTE */
walk_ptes:
pte_to_phys cur_ptep, pmd
+ __idmap_kpti_map_pgtable pte, 3
add end_ptep, cur_ptep, #(PTRS_PER_PTE * 8)
do_pte: __idmap_kpti_get_pgtable_ent pte
__idmap_kpti_put_pgtable_ent_ng pte
-skip_pte:
- add cur_ptep, cur_ptep, #8
+next_pte:
cmp cur_ptep, end_ptep
b.ne do_pte
b next_pmd
.unreq cpu
+ .unreq pte_flags
.unreq num_cpus
- .unreq swapper_pa
+ .unreq temp_pgd_phys
.unreq cur_pgdp
.unreq end_pgdp
.unreq pgd
@@ -360,12 +377,10 @@ skip_pte:
.unreq cur_ptep
.unreq end_ptep
.unreq pte
+ .unreq valid
/* Secondary CPUs end up here */
__idmap_kpti_secondary:
- /* Uninstall swapper before surgery begins */
- __idmap_cpu_set_reserved_ttbr1 x16, x17
-
/* Increment the flag to let the boot CPU we're ready */
1: ldxr w16, [flag_ptr]
add w16, w16, #1
@@ -379,7 +394,7 @@ __idmap_kpti_secondary:
cbnz w16, 1b
/* All done, act like nothing happened */
- offset_ttbr1 swapper_ttb, x16
+ __idmap_cpu_set_reserved_ttbr1 x8, x9
msr ttbr1_el1, swapper_ttb
isb
ret
In cases where we unmap the kernel while running in user space, we rely
on ASIDs to distinguish the minimal trampoline from the full kernel
mapping, and this means we must use non-global attributes for those
mappings, to ensure they are scoped by ASID and will not hit in the TLB
inadvertently.

We only do this when needed, as this is generally more costly in terms
of TLB pressure, and so we boot without these non-global attributes, and
apply them to all existing kernel mappings once all CPUs are up and we
know whether or not the non-global attributes are needed. At this point,
we cannot simply unmap and remap the entire address space, so we have to
update all existing block and page descriptors in place.

Currently, we go through a lot of trouble to perform these updates with
the MMU and caches off, to avoid violating break before make (BBM) rules
imposed by the architecture. Since we make changes to page tables that
are not covered by the ID map, we gain access to those descriptors by
disabling translations altogether. This means that the stores to memory
are issued with device attributes, and require extra care in terms of
coherency, which is costly. We also rely on the ID map to access a
shared flag, which requires the ID map to be executable and writable at
the same time, which is another thing we'd prefer to avoid.

So let's switch to an approach where we replace the kernel mapping with
a minimal mapping of a few pages that can be used for the shared flag,
as well as a minimal, ad-hoc fixmap that we can use to map each page
table in turn as we traverse the hierarchy. This requires one PTE per
level, and an associated page worth of VA space in the temporary
mapping.

Note that table descriptors do not have a non-global attribute, so avoid
setting this bit unnecessarily as well. Also, given that the bulk of
time is spent updating the dense level 3 entries that cover the linear
map, let's just rewrite all block and page entries, and OR the valid bit
into the nG bit position.

On a KVM guest with 64G of memory booting with rodata=full under a
ThunderX2 host (*), this reduces the boot delay from 25 seconds to 50
milliseconds.

(*) ThunderX2 does not need KPTI to mitigate Meltdown so the build was
    rigged to enable it nonetheless when taking this measurement as
    otherwise, this routine never executes.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
v2:
- use break before make when remapping the page table pages
- switch via reserved_pg_dir when going to the temporary page tables and
  back
- improve comments describing what is going on
- leave it to the last CPU that leaves to free the temporary pages

 arch/arm64/include/asm/mmu.h   |   4 +
 arch/arm64/kernel/cpufeature.c |  68 ++++++++++-
 arch/arm64/mm/mmu.c            |   8 +-
 arch/arm64/mm/proc.S           | 119 +++++++++++---------
 4 files changed, 138 insertions(+), 61 deletions(-)
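
Note on the nG update (not part of the patch itself): the new
__idmap_kpti_put_pgtable_ent_ng sequence folds the valid bit into the nG
bit, so invalid entries pass through unmodified and the walker no longer
needs to test and skip them. A minimal C sketch of the same bit
manipulation, assuming the usual arm64 descriptor layout (bit 0 = valid,
bit 11 = nG) and using a made-up helper name:

    #include <stdint.h>

    /* Set nG (bit 11) on a block/page descriptor iff it is valid (bit 0). */
    static inline uint64_t kpti_set_ng_if_valid(uint64_t desc)
    {
            uint64_t valid = desc & 0x1;     /* PTE_VALID                */

            return desc | (valid << 11);     /* PTE_NG, only when valid  */
    }

This mirrors 'and valid, \type, #1' followed by
'orr \type, \type, valid, lsl #11' in the assembly, and it is idempotent
for entries that already have nG set.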