From patchwork Fri Sep 18 07:58:59 2009
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Huang, Ying" <ying.huang@intel.com>
X-Patchwork-Id: 48465
Received: from vger.kernel.org (vger.kernel.org [209.132.176.167])
	by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id n8I7x47K023484
	for <patchwork-kvm@patchwork.kernel.org>;
	Fri, 18 Sep 2009 07:59:05 GMT
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1752270AbZIRH67 (ORCPT
	<rfc822;patchwork-kvm@patchwork.kernel.org>);
	Fri, 18 Sep 2009 03:58:59 -0400
Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1752610AbZIRH67
	(ORCPT <rfc822;kvm-outgoing>); Fri, 18 Sep 2009 03:58:59 -0400
Received: from mga14.intel.com ([143.182.124.37]:61242 "EHLO mga14.intel.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1751059AbZIRH66 (ORCPT <rfc822;kvm@vger.kernel.org>);
	Fri, 18 Sep 2009 03:58:58 -0400
Received: from azsmga001.ch.intel.com ([10.2.17.19])
	by azsmga102.ch.intel.com with ESMTP; 18 Sep 2009 00:59:01 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="4.44,408,1249282800"; d="scan'208";a="189052128"
Received: from yhuang-dev.sh.intel.com (HELO [10.239.13.48]) ([10.239.13.48])
	by azsmga001.ch.intel.com with ESMTP; 18 Sep 2009 00:59:00 -0700
Subject: [PATCH -v3] QEMU-KVM: MCE: Relay UCR MCE to guest
From: Huang Ying <ying.huang@intel.com>
To: Marcelo Tosatti <mtosatti@redhat.com>, Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>, Anthony Liguori <aliguori@us.ibm.com>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>
Date: Fri, 18 Sep 2009 15:58:59 +0800
Message-Id: <1253260739.15717.511.camel@yhuang-dev.sh.intel.com>
Mime-Version: 1.0
X-Mailer: Evolution 2.26.3 
Sender: kvm-owner@vger.kernel.org
Precedence: bulk
List-ID: <kvm.vger.kernel.org>
X-Mailing-List: kvm@vger.kernel.org

UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
where some hardware error such as some memory error can be reported
without PCC (processor context corrupted). To recover from such MCE,
the corresponding memory will be unmapped, and all processes accessing
the memory will be killed via SIGBUS.

For KVM, if QEMU/KVM is killed, all guest processes will be killed
too. So we relay SIGBUS from host OS to guest system via a UCR MCE
injection. Then guest OS can isolate corresponding memory and kill
necessary guest processes only. SIGBUS sent to main thread (not VCPU
threads) will be broadcast to all VCPU threads as UCR MCE.

v3:

- Re-raise SIGBUS for SIGBUS not from MCE
- Kill itself for error in kvm_inject_x86_mce

v2:

- Use qemu_ram_addr_from_host instead of self made one to covert from
  host address to guest RAM address. Thanks Anthony Liguori.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 cpu-common.h         |    1 
 exec.c               |   20 ++++-
 qemu-kvm.c           |  178 +++++++++++++++++++++++++++++++++++++++++++++++----
 qemu-kvm.h           |    9 ++
 target-i386/cpu.h    |   20 +++++
 target-i386/helper.c |    2 
 6 files changed, 208 insertions(+), 22 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -27,10 +27,23 @@
 #include <sys/mman.h>
 #include <sys/ioctl.h>
 #include <signal.h>
+#include <sys/signalfd.h>
+#include <sys/prctl.h>
 
 #define false 0
 #define true 1
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #define EXPECTED_KVM_API_VERSION 12
 
 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
@@ -1509,6 +1522,49 @@ static void sig_ipi_handler(int n)
 {
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
+{
+#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
+    if (first_cpu->mcg_cap && siginfo->ssi_addr
+        && siginfo->ssi_code == BUS_MCEERR_AO) {
+        uint64_t status;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %llx\n",
+                    (unsigned long long)siginfo->ssi_addr);
+            return;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+    } else
+#endif
+    {
+        if (siginfo->ssi_code == BUS_MCEERR_AO)
+            return;
+        else if (siginfo->ssi_code == BUS_MCEERR_AR)
+            hardware_memory_error();
+        else
+            raise(SIGBUS);
+    }
+}
+
 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
 {
     struct qemu_work_item wi;
@@ -1666,29 +1722,101 @@ static void flush_queued_work(CPUState *
     pthread_cond_broadcast(&qemu_work_cond);
 }
 
+static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
+{
+#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    unsigned long paddr;
+    int r;
+
+    if (env->mcg_cap && siginfo->si_addr
+        && (siginfo->si_code == BUS_MCEERR_AR
+            || siginfo->si_code == BUS_MCEERR_AO)) {
+        if (siginfo->si_code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instaed of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (siginfo->si_code == BUS_MCEERR_AO)
+                return;
+            else
+                hardware_memory_error();
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (siginfo->si_code == BUS_MCEERR_AO)
+            return;
+        else if (siginfo->si_code == BUS_MCEERR_AR)
+            hardware_memory_error();
+        else
+            raise(SIGBUS);
+    }
+}
+
 static void kvm_main_loop_wait(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
-
-    pthread_mutex_unlock(&qemu_mutex);
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
+    do {
+        pthread_mutex_unlock(&qemu_mutex);
 
-    pthread_mutex_lock(&qemu_mutex);
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        printf("sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        pthread_mutex_lock(&qemu_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            printf("sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+            kvm_on_sigbus(env, &siginfo);
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            printf("sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 
     cpu_single_env = env;
     flush_queued_work(env);
@@ -1769,6 +1897,7 @@ static void setup_kernel_sigmask(CPUStat
 
     sigprocmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
 
     kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
 }
@@ -1896,12 +2025,20 @@ void kvm_hpet_enable_kpit(void)
 
 int kvm_init_ap(void)
 {
+    struct sigaction action;
+
 #ifdef TARGET_I386
     kvm_tpr_opt_setup();
 #endif
     qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
 
     signal(SIG_IPI, sig_ipi_handler);
+
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1);
     return 0;
 }
 
@@ -1962,7 +2099,10 @@ static void sigfd_handler(void *opaque)
         }
 
         sigaction(info.ssi_signo, NULL, &action);
-        if (action.sa_handler)
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+	else if (action.sa_handler)
             action.sa_handler(info.ssi_signo);
 
     }
@@ -2012,6 +2152,7 @@ int kvm_main_loop(void)
     sigemptyset(&mask);
     sigaddset(&mask, SIGIO);
     sigaddset(&mask, SIGALRM);
+    sigaddset(&mask, SIGBUS);
     sigprocmask(SIG_BLOCK, &mask, NULL);
 
     sigfd = qemu_signalfd(&mask);
@@ -2508,6 +2649,7 @@ int kvm_set_boot_cpu_id(uint32_t id)
 struct kvm_x86_mce_data {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -2516,13 +2658,17 @@ static void kvm_do_inject_x86_mce(void *
     int r;
 
     r = kvm_set_mce(data->env->kvm_cpu_state.vcpu_ctx, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error)
+            abort();
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -2535,9 +2681,17 @@ void kvm_inject_x86_mce(CPUState *cenv, 
     struct kvm_x86_mce_data data = {
         .env = cenv,
         .mce = &mce,
+        .abort_on_error = abort_on_error,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
     on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 #endif
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -34,6 +34,7 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
 ram_addr_t qemu_ram_addr_from_host(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
--- a/exec.c
+++ b/exec.c
@@ -2600,9 +2600,7 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return block->host + (addr - block->offset);
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *prev;
     RAMBlock **prevp;
@@ -2619,11 +2617,23 @@ ram_addr_t qemu_ram_addr_from_host(void 
         prev = block;
         block = block->next;
     }
-    if (!block) {
+    if (!block)
+        return -1;
+    *ram_addr = block->offset + (host - block->host);
+    return 0;
+}
+
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
+{
+    ram_addr_t ram_addr;
+
+    if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) {
         fprintf(stderr, "Bad ram pointer %p\n", ptr);
         abort();
     }
-    return block->offset + (host - block->host);
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
--- a/qemu-kvm.h
+++ b/qemu-kvm.h
@@ -625,9 +625,11 @@ int kvm_inject_nmi(kvm_vcpu_context_t vc
  * \param mcg_status MSR_MCG_STATUS
  * \param addr MSR_MCI_ADDR
  * \param misc MSR_MCI_MISC
+ * \param abort_on_error abort on error
  */
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 /*!
  * \brief Query wheather in kernel pit is used
@@ -943,8 +945,11 @@ static inline int kvm_init(int smp_cpus)
 
 static inline void kvm_inject_x86_mce(CPUState *cenv, int bank,
                                       uint64_t status, uint64_t mcg_status,
-                                      uint64_t addr, uint64_t misc)
+                                      uint64_t addr, uint64_t misc,
+                                      int abort_on_error)
 {
+    if (abort_on_error)
+        abort();
 }
 

--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1547,7 +1547,7 @@ void cpu_inject_x86_mce(CPUState *cenv, 
     uint64_t *banks = cenv->mce_banks;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }