[-v4] QEMU-KVM: MCE: Relay UCR MCE to guest

Message ID	1253501005.15717.548.camel@yhuang-dev.sh.intel.com (mailing list archive)
State	New, archived
Headers	show Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id n8L2hUqM023533 for <patchwork-kvm@patchwork.kernel.org>; Mon, 21 Sep 2009 02:43:30 GMT Subject: [PATCH -v4] QEMU-KVM: MCE: Relay UCR MCE to guest From: Huang Ying <ying.huang@intel.com> To: Marcelo Tosatti <mtosatti@redhat.com>, Avi Kivity <avi@redhat.com> Cc: Andi Kleen <andi@firstfloor.org>, Anthony Liguori <aliguori@us.ibm.com>, "kvm@vger.kernel.org" <kvm@vger.kernel.org> Content-Type: text/plain Date: Mon, 21 Sep 2009 10:43:25 +0800 Message-Id: <1253501005.15717.548.camel@yhuang-dev.sh.intel.com> Mime-Version: 1.0 Content-Transfer-Encoding: 7bit Sender: kvm-owner@vger.kernel.org Precedence: bulk

Relay host SIGBUS memory-error notifications (BUS_MCEERR_AR and
BUS_MCEERR_AO) to the guest as x86 machine-check events.

 * qemu-kvm.c: kvm_main_loop() adds SIGBUS to the signalfd mask and
   sigfd_handler() dispatches it to sigbus_handler(), which injects an MCE
   into every vcpu for a BUS_MCEERR_AO that hits guest RAM.  Each vcpu
   thread also waits for SIGBUS in kvm_main_loop_wait() and hands it to
   kvm_on_sigbus(), which injects through kvm_set_mce().  When the error
   cannot be relayed, AO is ignored, AR exits QEMU and any other SIGBUS is
   re-raised with the default disposition.
 * exec.c, cpu-common.h: add do_qemu_ram_addr_from_host(), which returns -1
   instead of aborting when the host pointer is not inside a RAM block.
 * target-i386/cpu.h: add the MCG/MCI status bits and MCi_MISC address
   modes used above and set MCG_SER_P in MCE_CAP_DEF.
 * kvm_inject_x86_mce() gains an abort_on_error argument.

--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -27,10 +27,23 @@
 #include <sys/mman.h>
 #include <sys/ioctl.h>
 #include <signal.h>
+#include <sys/signalfd.h>
+#include <sys/prctl.h>
 
 #define false 0
 #define true 1
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #define EXPECTED_KVM_API_VERSION 12
 
 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
@@ -1509,6 +1522,66 @@ static void sig_ipi_handler(int n)
 {
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+static void sigbus_reraise(void)
+{
+    sigset_t set;
+    struct sigaction action;
+
+    memset(&action, 0, sizeof(action));
+    action.sa_handler = SIG_DFL;
+    if (!sigaction(SIGBUS, &action, NULL)) {
+        raise(SIGBUS);
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        sigprocmask(SIG_UNBLOCK, &set, NULL);
+    }
+    perror("Failed to re-raise SIGBUS!");
+    abort();
+}
+
+static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
+{
+#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
+    if (first_cpu->mcg_cap && siginfo->ssi_addr
+        && siginfo->ssi_code == BUS_MCEERR_AO) {
+        uint64_t status;
+        ram_addr_t paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %llx\n",
+                    (unsigned long long)siginfo->ssi_addr);
+            return;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+    } else
+#endif
+    {
+        if (siginfo->ssi_code == BUS_MCEERR_AO)
+            return;
+        else if (siginfo->ssi_code == BUS_MCEERR_AR)
+            hardware_memory_error();
+        else
+            sigbus_reraise();
+    }
+}
+
 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
 {
     struct qemu_work_item wi;
@@ -1666,29 +1739,101 @@ static void flush_queued_work(CPUState *
     pthread_cond_broadcast(&qemu_work_cond);
 }
 
+static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
+{
+#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    ram_addr_t paddr;
+    int r;
+
+    if (env->mcg_cap && siginfo->si_addr
+        && (siginfo->si_code == BUS_MCEERR_AR
+            || siginfo->si_code == BUS_MCEERR_AO)) {
+        if (siginfo->si_code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (siginfo->si_code == BUS_MCEERR_AO)
+                return;
+            else
+                hardware_memory_error();
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (siginfo->si_code == BUS_MCEERR_AO)
+            return;
+        else if (siginfo->si_code == BUS_MCEERR_AR)
+            hardware_memory_error();
+        else
+            sigbus_reraise();
+    }
+}
+
 static void kvm_main_loop_wait(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
-
-    pthread_mutex_unlock(&qemu_mutex);
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
+    do {
+        pthread_mutex_unlock(&qemu_mutex);
 
-    pthread_mutex_lock(&qemu_mutex);
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        printf("sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        pthread_mutex_lock(&qemu_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            printf("sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+            kvm_on_sigbus(env, &siginfo);
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            printf("sigpending: %s\n", strerror(errno));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 
     cpu_single_env = env;
     flush_queued_work(env);
@@ -1769,6 +1914,7 @@ static void setup_kernel_sigmask(CPUStat
 
     sigprocmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
 
     kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
 }
@@ -1896,12 +2042,20 @@ void kvm_hpet_enable_kpit(void)
 
 int kvm_init_ap(void)
 {
+    struct sigaction action;
+
 #ifdef TARGET_I386
     kvm_tpr_opt_setup();
 #endif
     qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
 
     signal(SIG_IPI, sig_ipi_handler);
+
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1);
     return 0;
 }
 
@@ -1962,7 +2116,10 @@ static void sigfd_handler(void *opaque)
         }
 
         sigaction(info.ssi_signo, NULL, &action);
-        if (action.sa_handler)
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+        else if (action.sa_handler)
             action.sa_handler(info.ssi_signo);
 
     }
@@ -2012,6 +2169,7 @@ int kvm_main_loop(void)
     sigemptyset(&mask);
     sigaddset(&mask, SIGIO);
     sigaddset(&mask, SIGALRM);
+    sigaddset(&mask, SIGBUS);
     sigprocmask(SIG_BLOCK, &mask, NULL);
 
     sigfd = qemu_signalfd(&mask);
@@ -2512,6 +2670,7 @@ int kvm_set_boot_cpu_id(uint32_t id)
 struct kvm_x86_mce_data {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -2520,13 +2679,17 @@ static void kvm_do_inject_x86_mce(void *
     int r;
 
     r = kvm_set_mce(data->env->kvm_cpu_state.vcpu_ctx, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error)
+            abort();
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -2539,9 +2702,17 @@ void kvm_inject_x86_mce(CPUState *cenv,
     struct kvm_x86_mce_data data = {
         .env = cenv,
         .mce = &mce,
+        .abort_on_error = abort_on_error,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
     on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 #endif
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -34,6 +34,7 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
 ram_addr_t qemu_ram_addr_from_host(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
--- a/exec.c
+++ b/exec.c
@@ -2600,9 +2600,7 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return block->host + (addr - block->offset);
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *prev;
     RAMBlock **prevp;
@@ -2619,11 +2617,23 @@ ram_addr_t qemu_ram_addr_from_host(void
         prev = block;
         block = block->next;
     }
-    if (!block) {
+    if (!block)
+        return -1;
+    *ram_addr = block->offset + (host - block->host);
+    return 0;
+}
+
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
+{
+    ram_addr_t ram_addr;
+
+    if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) {
         fprintf(stderr, "Bad ram pointer %p\n", ptr);
         abort();
     }
-    return block->offset + (host - block->host);
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
--- a/qemu-kvm.h
+++ b/qemu-kvm.h
@@ -625,9 +625,11 @@ int kvm_inject_nmi(kvm_vcpu_context_t vc
  * \param mcg_status MSR_MCG_STATUS
  * \param addr MSR_MCI_ADDR
  * \param misc MSR_MCI_MISC
+ * \param abort_on_error abort on error
  */
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 /*!
  * \brief Query wheather in kernel pit is used
@@ -943,8 +945,11 @@ static inline int kvm_init(int smp_cpus)
 
 static inline void kvm_inject_x86_mce(CPUState *cenv, int bank,
                                       uint64_t status, uint64_t mcg_status,
-                                      uint64_t addr, uint64_t misc)
+                                      uint64_t addr, uint64_t misc,
+                                      int abort_on_error)
 {
+    if (abort_on_error)
+        abort();
 }
 
 
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1547,7 +1547,7 @@ void cpu_inject_x86_mce(CPUState *cenv,
     uint64_t *banks = cenv->mce_banks;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }
 

[-v4] QEMU-KVM: MCE: Relay UCR MCE to guest

Commit Message

Comments

Patch