[PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest

Message ID	1252463282.5212.44.camel@yhuang-dev.sh.intel.com (mailing list archive)
State	New, archived
Headers	show Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id n892S9o8015967 for <patchwork-kvm@patchwork.kernel.org>; Wed, 9 Sep 2009 02:28:09 GMT Subject: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest From: Huang Ying <ying.huang@intel.com> To: Avi Kivity <avi@redhat.com> Cc: Andi Kleen <andi@firstfloor.org>, Anthony Liguori <aliguori@us.ibm.com>, "kvm@vger.kernel.org" <kvm@vger.kernel.org> Content-Type: text/plain Date: Wed, 09 Sep 2009 10:28:02 +0800 Message-Id: <1252463282.5212.44.camel@yhuang-dev.sh.intel.com> Mime-Version: 1.0 Content-Transfer-Encoding: 7bit Sender: kvm-owner@vger.kernel.org Precedence: bulk

--- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -27,10 +27,23 @@ #include <sys/mman.h> #include <sys/ioctl.h> #include <signal.h> +#include <sys/signalfd.h> +#include <sys/prctl.h> #define false 0 #define true 1 +#ifndef PR_MCE_KILL +#define PR_MCE_KILL 33 +#endif + +#ifndef BUS_MCEERR_AR +#define BUS_MCEERR_AR 4 +#endif +#ifndef BUS_MCEERR_AO +#define BUS_MCEERR_AO 5 +#endif + #define EXPECTED_KVM_API_VERSION 12 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION @@ -1507,6 +1520,37 @@ static void sig_ipi_handler(int n) { } +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx) +{ + if (siginfo->ssi_code == BUS_MCEERR_AO) { + uint64_t status; + unsigned long paddr; + CPUState *cenv; + + /* Hope we are lucky for AO MCE */ + if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) { + fprintf(stderr, "Hardware memory error for memory used by " + "QEMU itself instead of guest system!: %llx\n", + (unsigned long long)siginfo->ssi_addr); + return; + } + status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S + | 0xc0; + kvm_inject_x86_mce(first_cpu, 9, status, + MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr, + (MCM_ADDR_PHYS << 6) | 0xc); + for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) + kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC, + MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0); + return; + } else if (siginfo->ssi_code == BUS_MCEERR_AR) + fprintf(stderr, "Hardware memory error!\n"); + else + fprintf(stderr, "Internal error in QEMU!\n"); + exit(1); +} + static void on_vcpu(CPUState *env, void (*func)(void *data), void *data) { struct qemu_work_item wi; @@ -1649,29 +1693,102 @@ static void flush_queued_work(CPUState * pthread_cond_broadcast(&qemu_work_cond); } +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo) +{ +#if defined(KVM_CAP_MCE) && defined(TARGET_I386) + struct kvm_x86_mce mce = { + .bank = 9, + }; + unsigned long paddr; + int r; + + if 
(env->mcg_cap && siginfo->si_addr + && (siginfo->si_code == BUS_MCEERR_AR + || siginfo->si_code == BUS_MCEERR_AO)) { + if (siginfo->si_code == BUS_MCEERR_AR) { + /* Fake an Intel architectural Data Load SRAR UCR */ + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S + | MCI_STATUS_AR | 0x134; + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + } else { + /* Fake an Intel architectural Memory scrubbing UCR */ + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S + | 0xc0; + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV; + } + if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) { + fprintf(stderr, "Hardware memory error for memory used by " + "QEMU itself instaed of guest system!\n"); + /* Hope we are lucky for AO MCE */ + if (siginfo->si_code == BUS_MCEERR_AO) + return; + else + exit(1); + } + mce.addr = paddr; + r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce); + if (r < 0) { + fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); + exit(1); + } + } else +#endif + { + if (siginfo->si_code == BUS_MCEERR_AO) + return; + if (siginfo->si_code == BUS_MCEERR_AR) + fprintf(stderr, "Hardware memory error!\n"); + else + fprintf(stderr, "Internal error in QEMU!\n"); + exit(1); + } +} + static void kvm_main_loop_wait(CPUState *env, int timeout) { struct timespec ts; int r, e; siginfo_t siginfo; sigset_t waitset; - - pthread_mutex_unlock(&qemu_mutex); + sigset_t chkset; ts.tv_sec = timeout / 1000; ts.tv_nsec = (timeout % 1000) * 1000000; sigemptyset(&waitset); sigaddset(&waitset, SIG_IPI); + sigaddset(&waitset, SIGBUS); - r = sigtimedwait(&waitset, &siginfo, &ts); - e = errno; + do { + pthread_mutex_unlock(&qemu_mutex); - pthread_mutex_lock(&qemu_mutex); + r = sigtimedwait(&waitset, &siginfo, &ts); + e = errno; - if (r == -1 && !(e == EAGAIN || 
e == EINTR)) { - printf("sigtimedwait: %s\n", strerror(e)); - exit(1); - } + pthread_mutex_lock(&qemu_mutex); + + if (r == -1 && !(e == EAGAIN || e == EINTR)) { + printf("sigtimedwait: %s\n", strerror(e)); + exit(1); + } + + switch (r) { + case SIGBUS: + kvm_on_sigbus(env, &siginfo); + break; + default: + break; + } + + r = sigpending(&chkset); + if (r == -1) { + printf("sigpending: %s\n", strerror(e)); + exit(1); + } + } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); cpu_single_env = env; flush_queued_work(env); @@ -1752,6 +1869,7 @@ static void setup_kernel_sigmask(CPUStat sigprocmask(SIG_BLOCK, NULL, &set); sigdelset(&set, SIG_IPI); + sigdelset(&set, SIGBUS); kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set); } @@ -1877,12 +1995,20 @@ void kvm_hpet_enable_kpit(void) int kvm_init_ap(void) { + struct sigaction action; + #ifdef TARGET_I386 kvm_tpr_opt_setup(); #endif qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL); signal(SIG_IPI, sig_ipi_handler); + + memset(&action, 0, sizeof(action)); + action.sa_flags = SA_SIGINFO; + action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler; + sigaction(SIGBUS, &action, NULL); + prctl(PR_MCE_KILL, 1, 1); return 0; } @@ -1943,7 +2069,10 @@ static void sigfd_handler(void *opaque) } sigaction(info.ssi_signo, NULL, &action); - if (action.sa_handler) + if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) + action.sa_sigaction(info.ssi_signo, + (siginfo_t *)&info, NULL); + else if (action.sa_handler) action.sa_handler(info.ssi_signo); } @@ -1993,6 +2122,7 @@ int kvm_main_loop(void) sigemptyset(&mask); sigaddset(&mask, SIGIO); sigaddset(&mask, SIGALRM); + sigaddset(&mask, SIGBUS); sigprocmask(SIG_BLOCK, &mask, NULL); sigfd = qemu_signalfd(&mask); @@ -2518,6 +2648,10 @@ void kvm_inject_x86_mce(CPUState *cenv, .mce = &mce, }; + if (!cenv->mcg_cap) { + fprintf(stderr, "MCE support is not enabled!\n"); + return; + } on_vcpu(cenv, kvm_do_inject_x86_mce, &data); #endif } 
--- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -250,16 +250,32 @@ #define PG_ERROR_RSVD_MASK 0x08 #define PG_ERROR_I_D_MASK 0x10 -#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ -#define MCE_CAP_DEF MCG_CTL_P +#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P) #define MCE_BANKS_DEF 10 +#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ +#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ #define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ #define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ +#define MCI_STATUS_EN (1ULL<<60) /* error enabled */ +#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ +#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ +#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ +#define MCI_STATUS_AR (1ULL<<55) /* Action required */ + +/* MISC register defines */ +#define MCM_ADDR_SEGOFF 0 /* segment offset */ +#define MCM_ADDR_LINEAR 1 /* linear address */ +#define MCM_ADDR_PHYS 2 /* physical address */ +#define MCM_ADDR_MEM 3 /* memory address */ +#define MCM_ADDR_GENERIC 7 /* generic */ #define MSR_IA32_TSC 0x10 #define MSR_IA32_APICBASE 0x1b --- a/cpu-common.h +++ b/cpu-common.h @@ -34,6 +34,7 @@ void qemu_ram_free(ram_addr_t addr); /* This should only be used for ram local to a device. */ void *qemu_get_ram_ptr(ram_addr_t addr); /* This should not be used by devices. 
*/ +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr); ram_addr_t qemu_ram_addr_from_host(void *ptr); int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read, --- a/exec.c +++ b/exec.c @@ -2589,9 +2589,7 @@ void *qemu_get_ram_ptr(ram_addr_t addr) return block->host + (addr - block->offset); } -/* Some of the softmmu routines need to translate from a host pointer - (typically a TLB entry) back to a ram offset. */ -ram_addr_t qemu_ram_addr_from_host(void *ptr) +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) { RAMBlock *prev; RAMBlock **prevp; @@ -2608,11 +2606,23 @@ ram_addr_t qemu_ram_addr_from_host(void prev = block; block = block->next; } - if (!block) { + if (!block) + return -1; + *ram_addr = block->offset + (host - block->host); + return 0; +} + +/* Some of the softmmu routines need to translate from a host pointer + (typically a TLB entry) back to a ram offset. */ +ram_addr_t qemu_ram_addr_from_host(void *ptr) +{ + ram_addr_t ram_addr; + + if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) { fprintf(stderr, "Bad ram pointer %p\n", ptr); abort(); } - return block->offset + (host - block->host); + return ram_addr; } static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)

[PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest

Commit Message

Comments

Patch