From patchwork Wed May 12 06:44:03 2010
X-Patchwork-Submitter: "Huang, Ying"
X-Patchwork-Id: 98924
Subject: [PATCH -v2] KVM: Fix qemu-kvm being killed by guest SRAO MCE
From: Huang Ying
To: Avi Kivity, Andi Kleen, Andrew Morton, masbock@linux.vnet.ibm.com,
	"Wu, Fengguang"
Cc: linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Date: Wed, 12 May 2010 14:44:03 +0800
Message-ID: <1273646643.3564.51.camel@yhuang-dev.sh.intel.com>

--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -32,6 +32,7 @@
 #include <linux/compiler.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -1975,6 +1976,27 @@ static int __direct_map(struct kvm_vcpu
 	return pt_write;
 }
 
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+	char buf[1];
+	void __user *hva;
+	int r;
+
+	/* Touch the page, so that SIGBUS is sent */
+	hva = (void __user *)gfn_to_hva(kvm, gfn);
+	r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+	kvm_release_pfn_clean(pfn);
+	if (is_hwpoison_pfn(pfn)) {
+		kvm_send_hwpoison_signal(kvm, gfn);
+		return 0;
+	}
+	return 1;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
@@ -1998,10 +2020,8 @@ static int nonpaging_map(struct kvm_vcpu
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2204,10 +2224,8 @@ static int tdp_page_fault(struct kvm_vcp
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -424,11 +424,8 @@ static int FNAME(page_fault)(struct kvm_
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
 
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		pgprintk("gfn %lx is mmio\n", walker.gfn);
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -254,6 +254,7 @@ extern pfn_t bad_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
+int is_hwpoison_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -92,6 +92,9 @@ static bool kvm_rebooting;
 
 static bool largepages_enabled = true;
 
+struct page *hwpoison_page;
+pfn_t hwpoison_pfn;
+
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
@@ -809,16 +812,22 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages
 
 int is_error_page(struct page *page)
 {
-	return page == bad_page;
+	return page == bad_page || page == hwpoison_page;
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn;
+	return pfn == bad_pfn || pfn == hwpoison_pfn;
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
+int is_hwpoison_pfn(pfn_t pfn)
+{
+	return pfn == hwpoison_pfn;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
@@ -939,6 +948,11 @@ static pfn_t hva_to_pfn(struct kvm *kvm,
 	if (unlikely(npages != 1)) {
 		struct vm_area_struct *vma;
 
+		if (is_hwpoison_address(addr)) {
+			get_page(hwpoison_page);
+			return page_to_pfn(hwpoison_page);
+		}
+
 		down_read(&current->mm->mmap_sem);
 		vma = find_vma(current->mm, addr);
 
@@ -2198,6 +2212,15 @@ int kvm_init(void *opaque, unsigned int
 
 	bad_pfn = page_to_pfn(bad_page);
 
+	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (hwpoison_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	hwpoison_pfn = page_to_pfn(hwpoison_page);
+
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
@@ -2269,6 +2292,8 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
+	if (hwpoison_page)
+		__free_page(hwpoison_page);
 	__free_page(bad_page);
 out:
 	kvm_arch_exit();
@@ -2291,6 +2316,7 @@ void kvm_exit(void)
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
+	__free_page(hwpoison_page);
 	__free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -45,6 +45,7 @@
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1296,3 +1297,30 @@ done:
 	/* keep elevated page count for bad page */
 	return ret;
 }
+
+int is_hwpoison_address(unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t pte, *ptep;
+	swp_entry_t entry;
+
+	pgdp = pgd_offset(current->mm, addr);
+	if (!pgd_present(*pgdp))
+		return 0;
+	pudp = pud_offset(pgdp, addr);
+	if (!pud_present(*pudp))
+		return 0;
+	pmdp = pmd_offset(pudp, addr);
+	if (!pmd_present(*pmdp))
+		return 0;
+	ptep = pte_offset_map(pmdp, addr);
+	pte = *ptep;
+	pte_unmap(ptep);
+	if (!is_swap_pte(pte))
+		return 0;
+	entry = pte_to_swp_entry(pte);
+	return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1464,6 +1464,14 @@ extern int sysctl_memory_failure_recover
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t mce_bad_pages;
 extern int soft_offline_page(struct page *page, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+int is_hwpoison_address(unsigned long addr);
+#else
+static inline int is_hwpoison_address(unsigned long addr)
+{
+	return 0;
+}
+#endif
 
 extern void dump_page(struct page *page);
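
A note on the delivery mechanism: kvm_send_hwpoison_signal() deliberately
touches the poisoned hva so that the kernel's memory-failure handling raises
SIGBUS against the qemu-kvm process, with si_code distinguishing an access to
an already-poisoned page (BUS_MCEERR_AR) from an asynchronous early-kill
notification (BUS_MCEERR_AO). Below is a minimal userspace sketch, not part
of this patch, of the kind of SIGBUS handler a VMM could install to observe
this; the handler and its exit-on-error policy are illustrative only, and it
assumes libc headers that expose the BUS_MCEERR_* codes.

/* Sketch: observe a hwpoison SIGBUS in userspace (illustrative only). */
#include <signal.h>
#include <string.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ucontext)
{
	(void)sig;
	(void)ucontext;
	/* si->si_addr holds the faulting (poisoned) user address. */
	if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO) {
		static const char msg[] = "hwpoison SIGBUS received\n";
		/* write() is async-signal-safe; a real VMM would forward
		 * an MCE to the guest here instead of exiting. */
		write(STDERR_FILENO, msg, sizeof(msg) - 1);
	}
	_exit(1);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);
	/* ... create the VM and run vcpus ... */
	pause();
	return 0;
}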
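
For testing without injecting a real SRAO machine check: given CAP_SYS_ADMIN
and CONFIG_MEMORY_FAILURE, madvise(MADV_HWPOISON) poisons an anonymous page
through the same memory_failure() path, leaving behind the hwpoison swap pte
that is_hwpoison_address() looks for; the next touch of that page then gets
SIGBUS. A hypothetical self-test along these lines, not part of this patch:

/* Sketch: provoke a hwpoison pte on an anonymous page (illustrative only). */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *p;

	p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Back the mapping with a real page before poisoning it. */
	p[0] = 1;

	if (madvise(p, pagesize, MADV_HWPOISON) < 0) {
		perror("madvise(MADV_HWPOISON)");
		return 1;
	}

	/* The pte is now a hwpoison swap entry; this access raises SIGBUS. */
	printf("byte: %d\n", p[0]);
	return 0;
}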