From c4b1831d44c6144d3762ccc72f0c4e71a0c713e5 Mon Sep 17 00:00:00 2001
From: Richard Fellner <richard.fellner@student.tugraz.at>
Date: Thu, 4 May 2017 14:16:44 +0200
Subject: [PATCH] KAISER: Kernel Address Isolation
This patch introduces our implementation of KAISER (Kernel Address Isolation to
have Side-channels Efficiently Removed), a kernel isolation technique to close
hardware side channels on kernel address information.
More information about the patch can be found on:
https://github.com/IAIK/KAISER
---
arch/x86/entry/entry_64.S | 17 ++++
arch/x86/entry/entry_64_compat.S | 7 +-
arch/x86/include/asm/hw_irq.h | 2 +-
arch/x86/include/asm/kaiser.h | 113 +++++++++++++++++++++++
arch/x86/include/asm/pgtable.h | 4 +
arch/x86/include/asm/pgtable_64.h | 21 +++++
arch/x86/include/asm/pgtable_types.h | 12 ++-
arch/x86/include/asm/processor.h | 7 +-
arch/x86/kernel/cpu/common.c | 4 +-
arch/x86/kernel/espfix_64.c | 6 ++
arch/x86/kernel/head_64.S | 16 +++-
arch/x86/kernel/irqinit.c | 2 +-
arch/x86/kernel/process.c | 2 +-
arch/x86/mm/Makefile | 2 +-
arch/x86/mm/kaiser.c | 172 +++++++++++++++++++++++++++++++++++
arch/x86/mm/pageattr.c | 2 +-
arch/x86/mm/pgtable.c | 28 +++++-
include/asm-generic/vmlinux.lds.h | 11 ++-
include/linux/percpu-defs.h | 30 ++++++
init/main.c | 5 +
kernel/fork.c | 8 ++
security/Kconfig | 7 ++
22 files changed, 461 insertions(+), 17 deletions(-)
create mode 100644 arch/x86/include/asm/kaiser.h
create mode 100644 arch/x86/mm/kaiser.c
@@ -37,6 +37,7 @@
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
+#include <asm/kaiser.h>
#include <linux/err.h>
.code64
@@ -141,6 +142,7 @@ ENTRY(entry_SYSCALL_64)
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -223,6 +225,7 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
@@ -318,10 +321,12 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
+ SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -420,6 +425,7 @@ ENTRY(ret_from_fork)
leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ SWITCH_USER_CR3
SWAPGS
FRAME_END
jmp restore_regs_and_iret
@@ -476,6 +482,7 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -533,6 +540,7 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -610,6 +618,7 @@ native_irq_return_ldt:
pushq %rdi /* Stash user RDI */
SWAPGS
+ SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -636,6 +645,7 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
+ SWITCH_USER_CR3
SWAPGS
movq %rax, %rsp
@@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry)
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
+ SWITCH_KERNEL_CR3
xorl %ebx, %ebx
1: ret
END(paranoid_entry)
@@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
@@ -1085,6 +1097,7 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -1136,6 +1149,7 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1234,6 +1248,7 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1274,6 +1289,7 @@ ENTRY(nmi)
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts.
*/
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -1485,6 +1501,7 @@ end_repeat_nmi:
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
+ SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
@@ -13,6 +13,7 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>
@@ -48,6 +49,7 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
@@ -259,6 +262,7 @@ sysret32_from_system_call:
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
+ SWITCH_USER_CR3
movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
@@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat)
PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
-
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* User tracing code (ptrace or signal handlers) might assume that
* the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
+ SWITCH_USER_CR3_NO_STACK
SWAPGS
jmp restore_regs_and_iret
END(entry_INT80_compat)
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
#define VECTOR_RETRIGGERED ((void *)~0UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
new file mode 100644
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+/* This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory.
+ * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped,
+ * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
+ * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions
+ * of the user space, or the stacks.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg
+movq %cr3, \reg
+orq $(0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+
+.macro SWITCH_USER_CR3_NO_STACK
+
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_USER_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3 reg
+.endm
+.macro SWITCH_USER_CR3 reg
+.endm
+.macro SWITCH_USER_CR3_NO_STACK
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+#else /* __ASSEMBLY__ */
+
+
+#ifdef CONFIG_KAISER
+// Upon kernel/user mode switch, it may happen that
+// the address space has to be switched before the registers have been stored.
+// To change the address space, another register is needed.
+// A register therefore has to be stored/restored.
+//
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+#endif /* CONFIG_KAISER */
+
+/**
+ * shadowmem_add_mapping - map a virtual memory part to the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+ * the mapping is done on a global scope, so no bigger synchronization has to be done.
+ * the pages have to be manually unmapped again when they are not needed any longer.
+ */
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+
+/**
+ * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * shadowmem_initialize_mapping - Initalize the shadow mapping
+ *
+ * most parts of the shadow mapping can be mapped upon boot time.
+ * only the thread stacks have to be mapped on runtime.
+ * the mapped regions are not unmapped at all.
+ */
+extern void kaiser_init(void);
+
+#endif
+
+
+
+#endif /* _ASM_X86_KAISER_H */
@@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ // clone the shadow pgd part as well
+ memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
+#ifdef CONFIG_KAISER
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+ return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+ return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
+}
+#endif /* CONFIG_KAISER */
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
+#ifdef CONFIG_KAISER
+ // We know that a pgd is page aligned.
+ // Therefore the lower indices have to be mapped to user space.
+ // These pages are mapped to the shadow mapping.
+ if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ }
+
+ pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+#else /* CONFIG_KAISER */
*pgdp = pgd;
+#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
@@ -45,7 +45,11 @@
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL (_AT(pteval_t, 0))
+#else
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -119,7 +123,11 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#ifdef CONFIG_KAISER
+#define _PAGE_PROTNONE (_AT(pteval_t, 0))
+#else
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#endif
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
@@ -339,7 +339,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -366,6 +366,11 @@ union irq_stack_union {
char gs_base[40];
unsigned long stack_canary;
};
+
+ struct {
+ char irq_stack_pointer[64];
+ char unused[IRQ_STACK_SIZE - 64];
+ };
};
DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
static const struct cpu_dev *this_cpu = &default_cpu;
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -1270,7 +1270,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
@@ -41,6 +41,7 @@
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
+#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+#ifdef CONFIG_KAISER
+ // add the esp stack pud to the shadow mapping here.
+ // This can be done directly, because the fixup stack has its own pud
+ set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+#endif
/* Randomize the locations */
init_espfix_random();
@@ -410,6 +410,14 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_KAISER
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -419,7 +427,7 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -429,10 +437,10 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
- .fill 512,8,0
+NEXT_PGD_PAGE(init_level4_pgt)
+ .fill 2*512,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
@@ -40,7 +40,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
.x86_tss = {
.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
@@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-
+obj-$(CONFIG_KAISER) += kaiser.o
new file mode 100644
@@ -0,0 +1,172 @@
+
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#ifdef CONFIG_KAISER
+
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/**
+ * Get the real ppn from a address in kernel mapping.
+ * @param address The virtual adrress
+ * @return the physical address
+ */
+static inline unsigned long get_pa_from_mapping (unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(address);
+ BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+
+ pud = pud_offset(pgd, address);
+ BUG_ON(pud_none(*pud));
+
+ if(pud_large(*pud))
+ {
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+ }
+
+ pmd = pmd_offset(pud, address);
+ BUG_ON(pmd_none(*pmd));
+
+ if(pmd_large(*pmd))
+ {
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+ }
+
+ pte = pte_offset_kernel(pmd, address);
+ BUG_ON(pte_none(*pte));
+
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+}
+
+void _kaiser_copy (unsigned long start_addr, unsigned long size,
+ unsigned long flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long address;
+ unsigned long end_addr = start_addr + size;
+ unsigned long target_address;
+
+ for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
+ address < PAGE_ALIGN(end_addr); address += PAGE_SIZE)
+ {
+ target_address = get_pa_from_mapping(address);
+
+ pgd = native_get_shadow_pgd(pgd_offset_k(address));
+
+ BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
+ BUG_ON(pgd_large(*pgd));
+
+ pud = pud_offset(pgd, address);
+ if(pud_none(*pud))
+ {
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
+ }
+ BUG_ON(pud_large(*pud));
+
+ pmd = pmd_offset(pud, address);
+ if(pmd_none(*pmd))
+ {
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
+ }
+ BUG_ON(pmd_large(*pmd));
+
+ pte = pte_offset_kernel(pmd, address);
+ if(pte_none(*pte))
+ {
+ set_pte(pte, __pte(flags | target_address));
+ }
+ else
+ {
+ BUG_ON(__pa(pte_page(*pte)) != target_address);
+ }
+ }
+}
+
+// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
+static inline void __init _kaiser_init(void)
+{
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++)
+ {
+ set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+ }
+}
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+spinlock_t shadow_table_lock;
+void __init kaiser_init(void)
+{
+ int cpu;
+ spin_lock_init(&shadow_table_lock);
+
+ spin_lock(&shadow_table_lock);
+
+ _kaiser_init();
+
+ for_each_possible_cpu(cpu)
+ {
+ // map the per cpu user variables
+ _kaiser_copy(
+ (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
+ (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
+ __PAGE_KERNEL);
+ }
+
+ // map the entry/exit text section, which is responsible to switch between user- and kernel mode
+ _kaiser_copy(
+ (unsigned long) __entry_text_start,
+ (unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
+ __PAGE_KERNEL_RX);
+
+ // the fixed map address of the idt_table
+ _kaiser_copy(
+ (unsigned long) idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+
+ spin_unlock(&shadow_table_lock);
+}
+
+// add a mapping to the shadow-mapping, and synchronize the mappings
+void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+ spin_lock(&shadow_table_lock);
+ _kaiser_copy(addr, size, flags);
+ spin_unlock(&shadow_table_lock);
+}
+
+extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
+ spin_lock(&shadow_table_lock);
+ do
+ {
+ unmap_pud_range(pgd, start, start + size);
+ }
+ while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
+ spin_unlock(&shadow_table_lock);
+}
+#endif /* CONFIG_KAISER */
@@ -823,7 +823,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
pud_clear(pud);
}
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
@@ -346,12 +346,38 @@ static inline void _pgd_free(pgd_t *pgd)
#else
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#ifdef CONFIG_KAISER
+ // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+ // block. Therefore, we have to allocate at least 3 pages. However, the
+ // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+ // the beginning of the page of our 8kb-aligned memory block in order to
+ // correctly free it afterwars.
+
+ unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+ if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+ {
+ *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+ return (pgd_t *) pages;
+ }
+ else
+ {
+ *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+ return (pgd_t *) (pages + PAGE_SIZE);
+ }
+#else
+ return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
}
static inline void _pgd_free(pgd_t *pgd)
{
+#ifdef CONFIG_KAISER
+ unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+ free_pages(pages, get_order(4*PAGE_SIZE));
+#else
free_page((unsigned long)pgd);
+#endif
}
#endif /* CONFIG_X86_PAE */
@@ -765,7 +765,16 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
- *(.data..percpu..first) \
+ \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
+ *(.data..percpu..first) \
+ . = ALIGN(cacheline); \
+ *(.data..percpu..user_mapped) \
+ *(.data..percpu..user_mapped..shared_aligned) \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..user_mapped..page_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
+ \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
@@ -35,6 +35,12 @@
#endif
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@@ -144,6 +156,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@@ -162,6 +182,16 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
@@ -83,11 +83,13 @@
#include <linux/io.h>
#include <linux/cache.h>
+#include <asm/cmdline.h>
#include <asm/io.h>
#include <asm/bugs.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
+#include <asm/kaiser.h>
static int kernel_init(void *);
@@ -477,6 +479,9 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+#ifdef CONFIG_KAISER
+ kaiser_init();
+#endif
}
asmlinkage __visible void __init start_kernel(void)
@@ -211,8 +211,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
#endif
}
+extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
static inline void free_thread_stack(struct task_struct *tsk)
{
+#ifdef CONFIG_KAISER
+ kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
+#endif
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
unsigned long flags;
@@ -470,6 +474,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
@@ -497,6 +502,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
* functions again.
*/
tsk->stack = stack;
+#ifdef CONFIG_KAISER
+ kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+#endif
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
@@ -30,6 +30,13 @@ config SECURITY
model will be used.
If you are unsure how to answer this question, answer N.
+config KAISER
+ bool "Remove the kernel mapping in user mode"
+ depends on X86_64
+ depends on !PARAVIRT
+ help
+ This enforces a strict kernel and user space isolation in order to close
+ hardware side channels on kernel address information.
config SECURITYFS
bool "Enable the securityfs filesystem"
--
2.9.3