[RFC,24/27] kvm/isolation: KVM page fault handler

Message ID: 1557758315-12667-25-git-send-email-alexandre.chartre@oracle.com
State: New, archived
Series: KVM Address Space Isolation

Commit Message

Alexandre Chartre May 13, 2019, 2:38 p.m. UTC
The KVM page fault handler handles page faults that occur while running
with the KVM address space: it switches to the kernel address space and
retries the faulting access (unless the fault occurred while switching
to the kernel address space). Page faults that occur while running with
the kernel address space are processed as before.

The page fault log is cleared when a VM is created so that page fault
information doesn't persist after qemu is stopped and restarted.

The KVM module parameter page_fault_stack can be used to disable dumping
the stack trace when a page fault occurs while using the KVM address
space. The fault will still be reported, but without the stack trace.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
---
 arch/x86/kernel/dumpstack.c |    1 +
 arch/x86/kvm/isolation.c    |  202 +++++++++++++++++++++++++++++++++++++++++++
 arch/x86/mm/fault.c         |   12 +++
 3 files changed, 215 insertions(+), 0 deletions(-)

Comments

Peter Zijlstra May 13, 2019, 3:15 p.m. UTC | #1
On Mon, May 13, 2019 at 04:38:32PM +0200, Alexandre Chartre wrote:
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 46df4c6..317e105 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -33,6 +33,10 @@
>  #define CREATE_TRACE_POINTS
>  #include <asm/trace/exceptions.h>
>  
> +bool (*kvm_page_fault_handler)(struct pt_regs *regs, unsigned long error_code,
> +			       unsigned long address);
> +EXPORT_SYMBOL(kvm_page_fault_handler);

NAK NAK NAK NAK

This is one of the biggest anti-patterns around.
Andy Lutomirski May 13, 2019, 4:02 p.m. UTC | #2
On Mon, May 13, 2019 at 7:39 AM Alexandre Chartre
<alexandre.chartre@oracle.com> wrote:
>
> The KVM page fault handler handles page faults that occur while running
> with the KVM address space: it switches to the kernel address space and
> retries the faulting access (unless the fault occurred while switching
> to the kernel address space). Page faults that occur while running with
> the kernel address space are processed as before.
>
> The page fault log is cleared when a VM is created so that page fault
> information doesn't persist after qemu is stopped and restarted.

Are you saying that a page fault will just exit isolation?  This
completely defeats most of the security, right?  Sure, it still helps
with side channels, but not with actual software bugs.
Alexandre Chartre May 13, 2019, 4:21 p.m. UTC | #3
On 5/13/19 6:02 PM, Andy Lutomirski wrote:
> On Mon, May 13, 2019 at 7:39 AM Alexandre Chartre
> <alexandre.chartre@oracle.com> wrote:
>>
>> The KVM page fault handler handles page faults that occur while running
>> with the KVM address space: it switches to the kernel address space and
>> retries the faulting access (unless the fault occurred while switching
>> to the kernel address space). Page faults that occur while running with
>> the kernel address space are processed as before.
>>
>> The page fault log is cleared when a VM is created so that page fault
>> information doesn't persist after qemu is stopped and restarted.
> 
> Are you saying that a page fault will just exit isolation?  This
> completely defeats most of the security, right?  Sure, it still helps
> with side channels, but not with actual software bugs.
> 

Yes, a page fault exits isolation so that the faulting instruction can be
retried with the full kernel address space. When exiting isolation, we also
want to kick the sibling hyperthread and pin it so that it can't steal
secrets while we use the kernel address space, but that's not implemented in
this series (see the TODO comment in kvm_isolation_exit() in patch 25
"kvm/isolation: implement actual KVM isolation enter/exit").

alex.
Liran Alon May 13, 2019, 9:25 p.m. UTC | #4
> On 13 May 2019, at 18:15, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Mon, May 13, 2019 at 04:38:32PM +0200, Alexandre Chartre wrote:
>> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
>> index 46df4c6..317e105 100644
>> --- a/arch/x86/mm/fault.c
>> +++ b/arch/x86/mm/fault.c
>> @@ -33,6 +33,10 @@
>> #define CREATE_TRACE_POINTS
>> #include <asm/trace/exceptions.h>
>> 
>> +bool (*kvm_page_fault_handler)(struct pt_regs *regs, unsigned long error_code,
>> +			       unsigned long address);
>> +EXPORT_SYMBOL(kvm_page_fault_handler);
> 
> NAK NAK NAK NAK
> 
> This is one of the biggest anti-patterns around.

I agree.
I think that mm should expose a mm_set_kvm_page_fault_handler() or something
(give it a better name), similar to how arch/x86/kernel/irq.c has
kvm_set_posted_intr_wakeup_handler().

-Liran
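
A minimal sketch of what such a setter could look like, modeled on
kvm_set_posted_intr_wakeup_handler(); the mm_set_kvm_page_fault_handler()
name is only the placeholder suggested above, and none of this is an
existing mm interface:

/* arch/x86/mm/fault.c -- sketch only, not an existing interface */

typedef bool (*kvm_pf_handler_t)(struct pt_regs *regs,
				 unsigned long error_code,
				 unsigned long address);

/* static, so nothing outside this file can write it directly */
static kvm_pf_handler_t kvm_page_fault_handler;

void mm_set_kvm_page_fault_handler(kvm_pf_handler_t handler)
{
	WRITE_ONCE(kvm_page_fault_handler, handler);
}
EXPORT_SYMBOL_GPL(mm_set_kvm_page_fault_handler);

/* and the fault path reads the pointer once: */
	kvm_pf_handler_t handler = READ_ONCE(kvm_page_fault_handler);

	if (handler && handler(regs, hw_error_code, address))
		return;

Only the setter is exported here; the pointer itself stays private to
fault.c, which is the substance of the NAK above.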
Andy Lutomirski May 14, 2019, 2:02 a.m. UTC | #5
On Mon, May 13, 2019 at 2:26 PM Liran Alon <liran.alon@oracle.com> wrote:
>
>
>
> > On 13 May 2019, at 18:15, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, May 13, 2019 at 04:38:32PM +0200, Alexandre Chartre wrote:
> >> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> >> index 46df4c6..317e105 100644
> >> --- a/arch/x86/mm/fault.c
> >> +++ b/arch/x86/mm/fault.c
> >> @@ -33,6 +33,10 @@
> >> #define CREATE_TRACE_POINTS
> >> #include <asm/trace/exceptions.h>
> >>
> >> +bool (*kvm_page_fault_handler)(struct pt_regs *regs, unsigned long error_code,
> >> +                           unsigned long address);
> >> +EXPORT_SYMBOL(kvm_page_fault_handler);
> >
> > NAK NAK NAK NAK
> >
> > This is one of the biggest anti-patterns around.
>
> I agree.
> I think that mm should expose a mm_set_kvm_page_fault_handler() or something
> (give it a better name), similar to how arch/x86/kernel/irq.c has
> kvm_set_posted_intr_wakeup_handler().
>
> -Liran
>

This sounds like a great use case for static_call().  PeterZ, do you
suppose we could wire up static_call() with the module infrastructure
to make it easy to do "static_call to such-and-such GPL module symbol
if that symbol is in a loaded module, else nop"?
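
A rough sketch of how that could look with the static_call() interface
(still an out-of-tree proposal when this thread took place; the
kvm_page_fault key name and default function are illustrative, and the
automatic module wiring asked about above is exactly the part that does
not exist):

#include <linux/static_call.h>

/* arch/x86/mm/fault.c */
static bool kvm_page_fault_default(struct pt_regs *regs,
				   unsigned long error_code,
				   unsigned long address)
{
	return false;	/* not handled; fall through to normal handling */
}

DEFINE_STATIC_CALL(kvm_page_fault, kvm_page_fault_default);

/* fault path: patched to a direct call, no indirect branch, no NULL check */
	if (static_call(kvm_page_fault)(regs, hw_error_code, address))
		return;

/* what kvm module load/unload would have to do: */
	static_call_update(kvm_page_fault, kvm_page_fault);		/* load */
	static_call_update(kvm_page_fault, kvm_page_fault_default);	/* unload */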
Peter Zijlstra May 14, 2019, 7:21 a.m. UTC | #6
On Mon, May 13, 2019 at 07:02:30PM -0700, Andy Lutomirski wrote:

> This sounds like a great use case for static_call().  PeterZ, do you
> suppose we could wire up static_call() with the module infrastructure
> to make it easy to do "static_call to such-and-such GPL module symbol
> if that symbol is in a loaded module, else nop"?

You're basically asking it to do dynamic linking. And I suppose that is
technically possible.

However, I'm really starting to think kvm (or at least these parts of it
that want to play these games) had better not be a module anymore.
Alexandre Chartre May 14, 2019, 3:36 p.m. UTC | #7
On 5/14/19 9:21 AM, Peter Zijlstra wrote:
> On Mon, May 13, 2019 at 07:02:30PM -0700, Andy Lutomirski wrote:
> 
>> This sounds like a great use case for static_call().  PeterZ, do you
>> suppose we could wire up static_call() with the module infrastructure
>> to make it easy to do "static_call to such-and-such GPL module symbol
>> if that symbol is in a loaded module, else nop"?
> 
> You're basically asking it to do dynamic linking. And I suppose that is
> technically possible.
> 
> However, I'm really starting to think kvm (or at least these parts of it
> that want to play these games) had better not be a module anymore.
> 

Maybe we can use an atomic notifier (e.g. page_fault_notifier)?

alex.
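
For concreteness, the notifier variant would look roughly like this (the
page_fault_notifier chain and kvm_isolation_fault() are hypothetical;
only the notifier API itself exists):

/* arch/x86/mm/fault.c -- hypothetical chain; regs is passed as data, and
 * the fault address would need to travel in a small struct alongside it */
ATOMIC_NOTIFIER_HEAD(page_fault_notifier);

	if (atomic_notifier_call_chain(&page_fault_notifier, hw_error_code,
				       regs) & NOTIFY_STOP_MASK)
		return;

/* kvm side */
static int kvm_pf_notify(struct notifier_block *nb, unsigned long error_code,
			 void *data)
{
	struct pt_regs *regs = data;

	/* hypothetical: true if the fault was taken on the KVM page table */
	if (kvm_isolation_fault(regs, error_code))
		return NOTIFY_STOP;
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pf_nb = {
	.notifier_call	= kvm_pf_notify,
};

	/* at kvm module init/exit: */
	atomic_notifier_chain_register(&page_fault_notifier, &kvm_pf_nb);
	atomic_notifier_chain_unregister(&page_fault_notifier, &kvm_pf_nb);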
Andy Lutomirski May 14, 2019, 3:43 p.m. UTC | #8
> On May 14, 2019, at 8:36 AM, Alexandre Chartre <alexandre.chartre@oracle.com> wrote:
> 
> 
>> On 5/14/19 9:21 AM, Peter Zijlstra wrote:
>>> On Mon, May 13, 2019 at 07:02:30PM -0700, Andy Lutomirski wrote:
>>> This sounds like a great use case for static_call().  PeterZ, do you
>>> suppose we could wire up static_call() with the module infrastructure
>>> to make it easy to do "static_call to such-and-such GPL module symbol
>>> if that symbol is in a loaded module, else nop"?
>> You're basically asking it to do dynamic linking. And I suppose that is
>> technically possible.
>> However, I'm really starting to think kvm (or at least these parts of it
>> that want to play these games) had better not be a module anymore.
> 
> Maybe we can use an atomic notifier (e.g. page_fault_notifier)?
> 
> 

IMO that’s worse. I want to be able to read do_page_fault() and understand what happens and in what order.

Having do_page_fault run with the wrong CR3 is so fundamental to its operation that it needs to be very obvious what’s happening.

Patch

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2b58864..aa28763 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -292,6 +292,7 @@ void show_stack(struct task_struct *task, unsigned long *sp)
 
 	show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
 }
+EXPORT_SYMBOL(show_stack);
 
 void show_stack_regs(struct pt_regs *regs)
 {
diff --git a/arch/x86/kvm/isolation.c b/arch/x86/kvm/isolation.c
index e7979b3..db0a7ce 100644
--- a/arch/x86/kvm/isolation.c
+++ b/arch/x86/kvm/isolation.c
@@ -8,6 +8,7 @@ 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/printk.h>
+#include <linux/sched/debug.h>
 #include <linux/slab.h>
 
 #include <asm/cpu_entry_area.h>
@@ -17,6 +18,9 @@ 
 
 #include "isolation.h"
 
+extern bool (*kvm_page_fault_handler)(struct pt_regs *regs,
+				      unsigned long error_code,
+				      unsigned long address);
 
 enum page_table_level {
 	PGT_LEVEL_PTE,
@@ -91,6 +95,25 @@ struct kvm_range_mapping {
 static LIST_HEAD(kvm_range_mapping_list);
 static DEFINE_MUTEX(kvm_range_mapping_lock);
 
+/*
+ * When a page fault occurs while running with the KVM address space,
+ * the KVM page fault handler prints information about the fault (in
+ * particular the stack trace) and switches back to the kernel
+ * address space.
+ *
+ * The information printed by the KVM page fault handler can be used to
+ * find out which data is not mapped in the KVM address space. The KVM
+ * address space can then be augmented to include the missing mapping
+ * so that we don't fault at that same place anymore.
+ *
+ * The following buffer tracks page faults occurring while running with
+ * the KVM address space, to avoid displaying the same information twice.
+ */
+
+#define KVM_LAST_FAULT_COUNT	128
+
+static unsigned long kvm_last_fault[KVM_LAST_FAULT_COUNT];
+
 
 struct mm_struct kvm_mm = {
 	.mm_rb			= RB_ROOT,
@@ -126,6 +149,14 @@ static void kvm_clear_mapping(void *ptr, size_t size,
 static bool __read_mostly address_space_isolation;
 module_param(address_space_isolation, bool, 0444);
 
+/*
+ * When set to true, KVM dumps the stack when a page fault occurs while
+ * running with the KVM address space. Otherwise the page fault is still
+ * reported but without the stack trace.
+ */
+static bool __read_mostly page_fault_stack = true;
+module_param(page_fault_stack, bool, 0444);
+
 static struct kvm_range_mapping *kvm_get_range_mapping_locked(void *ptr,
 							      bool *subset)
 {
@@ -1195,6 +1226,173 @@ static void kvm_reset_all_task_mapping(void)
 	mutex_unlock(&kvm_task_mapping_lock);
 }
 
+static int bad_address(void *p)
+{
+	unsigned long dummy;
+
+	return probe_kernel_address((unsigned long *)p, dummy);
+}
+
+static void kvm_dump_pagetable(pgd_t *base, unsigned long address)
+{
+	pgd_t *pgd = base + pgd_index(address);
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pr_info("BASE %px ", base);
+
+	if (bad_address(pgd))
+		goto bad;
+
+	pr_cont("PGD %lx ", pgd_val(*pgd));
+
+	if (!pgd_present(*pgd))
+		goto out;
+
+	p4d = p4d_offset(pgd, address);
+	if (bad_address(p4d))
+		goto bad;
+
+	pr_cont("P4D %lx ", p4d_val(*p4d));
+	if (!p4d_present(*p4d) || p4d_large(*p4d))
+		goto out;
+
+	pud = pud_offset(p4d, address);
+	if (bad_address(pud))
+		goto bad;
+
+	pr_cont("PUD %lx ", pud_val(*pud));
+	if (!pud_present(*pud) || pud_large(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (bad_address(pmd))
+		goto bad;
+
+	pr_cont("PMD %lx ", pmd_val(*pmd));
+	if (!pmd_present(*pmd) || pmd_large(*pmd))
+		goto out;
+
+	pte = pte_offset_kernel(pmd, address);
+	if (bad_address(pte))
+		goto bad;
+
+	pr_cont("PTE %lx", pte_val(*pte));
+out:
+	pr_cont("\n");
+	return;
+bad:
+	pr_info("BAD\n");
+}
+
+static void kvm_clear_page_fault(void)
+{
+	int i;
+
+	for (i = 0; i < KVM_LAST_FAULT_COUNT; i++)
+		kvm_last_fault[i] = 0;
+}
+
+static void kvm_log_page_fault(struct pt_regs *regs, unsigned long error_code,
+			       unsigned long address)
+{
+	int i;
+
+	/*
+	 * Log information about the fault only if this is a fault
+	 * we don't know about yet (or if the fault tracking buffer
+	 * is full).
+	 */
+	for (i = 0; i < KVM_LAST_FAULT_COUNT; i++) {
+		if (!kvm_last_fault[i]) {
+			kvm_last_fault[i] = regs->ip;
+			break;
+		}
+		if (kvm_last_fault[i] == regs->ip)
+			return;
+	}
+
+	if (i >= KVM_LAST_FAULT_COUNT)
+		pr_warn("KVM isolation: fault tracking buffer is full [%d]\n",
+			i);
+
+	pr_info("KVM isolation: page fault #%d (%ld) at %pS on %px (%pS)\n",
+		i, error_code, (void *)regs->ip,
+		(void *)address, (void *)address);
+	if (page_fault_stack)
+		show_stack(NULL, (unsigned long *)regs->sp);
+}
+
+/*
+ * KVM Page Fault Handler. The handler handles two simple cases:
+ *
+ * - If the fault occurs while using the kernel address space, then let
+ *   the kernel handle the fault normally.
+ *
+ * - If the fault occurs while using the KVM address space, then switch
+ *   to the kernel address space and retry.
+ *
+ * It also handles a tricky case: if the fault occurs while using the
+ * KVM address space but in the middle of switching to the kernel
+ * address space, then the switch has failed and we can't recover. In
+ * that case, we force the switch to the kernel address space, print
+ * information and let the kernel handle the fault.
+ */
+static bool kvm_page_fault(struct pt_regs *regs, unsigned long error_code,
+			   unsigned long address)
+{
+	struct mm_struct *active_mm = current->active_mm;
+	unsigned long cr3;
+
+	/*
+	 * First, do a quick and simple test to see if we are using
+	 * the KVM address space. If so, exit KVM isolation, log the
+	 * fault and report that we have handled the fault.
+	 */
+	if (likely(active_mm == &kvm_mm)) {
+		kvm_isolation_exit();
+		kvm_log_page_fault(regs, error_code, address);
+		return true;
+	}
+
+	/*
+	 * Verify that we are actually using the kernel address space.
+	 * When switching address space, active_mm is not necessarily up
+	 * to date as it can already be set to the next mm while %cr3
+	 * has not been updated yet. So check loaded_mm, which is updated
+	 * after %cr3.
+	 *
+	 * If we are actually using the kernel address space, then report
+	 * that we haven't handled the fault.
+	 */
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) != &kvm_mm)
+		return false;
+
+	/*
+	 * We are actually using the KVM address space and faulting while
+	 * switching address space. Force the switch to the kernel address
+	 * space, log information and report that we haven't handled
+	 * the fault.
+	 */
+	cr3 = __read_cr3();
+	write_cr3(build_cr3(active_mm->pgd, 0));
+	kvm_dump_pagetable(kvm_mm.pgd, address);
+	kvm_dump_pagetable(active_mm->pgd, address);
+	printk(KERN_DEFAULT "KVM isolation: page fault %ld at %pS on %lx (%pS) while switching mm\n"
+	       "  cr3=%lx\n"
+	       "  kvm_mm=%px pgd=%px\n"
+	       "  active_mm=%px pgd=%px\n",
+	       error_code, (void *)regs->ip, address, (void *)address,
+	       cr3,
+	       &kvm_mm, kvm_mm.pgd,
+	       active_mm, active_mm->pgd);
+	dump_stack();
+
+	return false;
+}
+
 
 static int kvm_isolation_init_page_table(void)
 {
@@ -1384,11 +1582,13 @@ static void kvm_isolation_uninit_mm(void)
 static void kvm_isolation_set_handlers(void)
 {
 	kvm_set_isolation_exit_handler(kvm_isolation_exit);
+	kvm_page_fault_handler = kvm_page_fault;
 }
 
 static void kvm_isolation_clear_handlers(void)
 {
 	kvm_set_isolation_exit_handler(NULL);
+	kvm_page_fault_handler = NULL;
 }
 
 int kvm_isolation_init_vm(struct kvm *kvm)
@@ -1396,6 +1596,8 @@ int kvm_isolation_init_vm(struct kvm *kvm)
 	if (!kvm_isolation())
 		return 0;
 
+	kvm_clear_page_fault();
+
 	pr_debug("mapping kvm srcu sda\n");
 
 	return (kvm_copy_percpu_mapping(kvm->srcu.sda,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 46df4c6..317e105 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -33,6 +33,10 @@ 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
 
+bool (*kvm_page_fault_handler)(struct pt_regs *regs, unsigned long error_code,
+			       unsigned long address);
+EXPORT_SYMBOL(kvm_page_fault_handler);
+
 /*
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
@@ -1253,6 +1257,14 @@ static int fault_in_kernel_space(unsigned long address)
 	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
 
 	/*
+	 * KVM might be able to handle the fault when running with the
+	 * KVM address space.
+	 */
+	if (kvm_page_fault_handler &&
+	    kvm_page_fault_handler(regs, hw_error_code, address))
+		return;
+
+	/*
 	 * We can fault-in kernel-space virtual memory on-demand. The
 	 * 'reference' page table is init_mm.pgd.
 	 *