Message ID | 155323643836.18748.13006461397179281455.stgit@aravinda (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | target-ppc/spapr: Add FWNMI support in QEMU for PowerKVM guests | expand |
On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: > Memory error such as bit flips that cannot be corrected > by hardware are passed on to the kernel for handling. > If the memory address in error belongs to guest then > the guest kernel is responsible for taking suitable action. > Patch [1] enhances KVM to exit guest with exit reason > set to KVM_EXIT_NMI in such cases. This patch handles > KVM_EXIT_NMI exit. > > [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html > (e20bbd3d and related commits) > > Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> > --- > hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ > include/hw/ppc/spapr.h | 1 + > target/ppc/kvm.c | 16 ++++++++++++++++ > target/ppc/kvm_ppc.h | 2 ++ > 4 files changed, 41 insertions(+) > > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c > index ae0f093..e7a24ad 100644 > --- a/hw/ppc/spapr_events.c > +++ b/hw/ppc/spapr_events.c > @@ -620,6 +620,28 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type, > RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id); > } > > +void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > +{ > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > + > + while (spapr->mc_status != -1) { > + /* > + * Check whether the same CPU got machine check error > + * while still handling the mc error (i.e., before > + * that CPU called "ibm,nmi-interlock" > + */ > + if (spapr->mc_status == cpu->vcpu_id) { > + qemu_system_guest_panicked(NULL); > + } > + qemu_cond_wait_iothread(&spapr->mc_delivery_cond); > + /* If the system is reset meanwhile, then just return */ > + if (spapr->mc_reset) { I don't really see what this accomplishes. IIUC mc_reset is true from reset time until nmi-register is called. Which means you could just check for guest_mnachine_check_addre being unset - in which case don't you need to fallback to the old machine check behaviour anyway? > + return; > + } > + } > + spapr->mc_status = cpu->vcpu_id; > +} > + > static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr, > uint32_t token, uint32_t nargs, > target_ulong args, > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index ee5589d..b0d8c18 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -792,6 +792,7 @@ void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, > Error **errp); > void spapr_clear_pending_events(SpaprMachineState *spapr); > int spapr_max_server_number(SpaprMachineState *spapr); > +void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered); > > /* DRC callbacks. */ > void spapr_core_release(DeviceState *dev); > diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c > index 2427c8e..a593448 100644 > --- a/target/ppc/kvm.c > +++ b/target/ppc/kvm.c > @@ -1738,6 +1738,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) > ret = 0; > break; > > + case KVM_EXIT_NMI: > + DPRINTF("handle NMI exception\n"); tracepoints are generally preferred to new DPRINTFs. > + ret = kvm_handle_nmi(cpu, run); > + break; > + > default: > fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); > ret = -1; > @@ -2803,6 +2808,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) > return data & 0xffff; > } > > +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run) > +{ > + bool recovered = run->flags & KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; > + > + cpu_synchronize_state(CPU(cpu)); > + > + spapr_mce_req_event(cpu, recovered); > + > + return 0; > +} > + > int kvmppc_enable_hwrng(void) > { > if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) { > diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h > index 2c2ea30..df5e85f 100644 > --- a/target/ppc/kvm_ppc.h > +++ b/target/ppc/kvm_ppc.h > @@ -80,6 +80,8 @@ bool kvmppc_hpt_needs_host_contiguous_pages(void); > void kvm_check_mmu(PowerPCCPU *cpu, Error **errp); > void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online); > > +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run); > + > #else > > static inline uint32_t kvmppc_get_tbfreq(void) >
On Monday 25 March 2019 11:52 AM, David Gibson wrote: > On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: >> Memory error such as bit flips that cannot be corrected >> by hardware are passed on to the kernel for handling. >> If the memory address in error belongs to guest then >> the guest kernel is responsible for taking suitable action. >> Patch [1] enhances KVM to exit guest with exit reason >> set to KVM_EXIT_NMI in such cases. This patch handles >> KVM_EXIT_NMI exit. >> >> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html >> (e20bbd3d and related commits) >> >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> >> --- >> hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ >> include/hw/ppc/spapr.h | 1 + >> target/ppc/kvm.c | 16 ++++++++++++++++ >> target/ppc/kvm_ppc.h | 2 ++ >> 4 files changed, 41 insertions(+) >> >> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c >> index ae0f093..e7a24ad 100644 >> --- a/hw/ppc/spapr_events.c >> +++ b/hw/ppc/spapr_events.c >> @@ -620,6 +620,28 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type, >> RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id); >> } >> >> +void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) >> +{ >> + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); >> + >> + while (spapr->mc_status != -1) { >> + /* >> + * Check whether the same CPU got machine check error >> + * while still handling the mc error (i.e., before >> + * that CPU called "ibm,nmi-interlock" >> + */ >> + if (spapr->mc_status == cpu->vcpu_id) { >> + qemu_system_guest_panicked(NULL); >> + } >> + qemu_cond_wait_iothread(&spapr->mc_delivery_cond); >> + /* If the system is reset meanwhile, then just return */ >> + if (spapr->mc_reset) { > > I don't really see what this accomplishes. IIUC mc_reset is true from > reset time until nmi-register is called. Which means you could just > check for guest_mnachine_check_addre being unset - in which case don't > you need to fallback to the old machine check behaviour anyway? We can check for guest_machine_check_addr being unset instead of mc_reset. Do we need any kind of memory barriers to ensure that this thread reads the updated guest_machine_check_addr/mc_reset? We don't have to fallback to the old machine check behavior, because guest_machine_check_addr is unset only during system reset. > >> + return; >> + } >> + } >> + spapr->mc_status = cpu->vcpu_id; >> +} >> + >> static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr, >> uint32_t token, uint32_t nargs, >> target_ulong args, >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h >> index ee5589d..b0d8c18 100644 >> --- a/include/hw/ppc/spapr.h >> +++ b/include/hw/ppc/spapr.h >> @@ -792,6 +792,7 @@ void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, >> Error **errp); >> void spapr_clear_pending_events(SpaprMachineState *spapr); >> int spapr_max_server_number(SpaprMachineState *spapr); >> +void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered); >> >> /* DRC callbacks. */ >> void spapr_core_release(DeviceState *dev); >> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c >> index 2427c8e..a593448 100644 >> --- a/target/ppc/kvm.c >> +++ b/target/ppc/kvm.c >> @@ -1738,6 +1738,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) >> ret = 0; >> break; >> >> + case KVM_EXIT_NMI: >> + DPRINTF("handle NMI exception\n"); > > tracepoints are generally preferred to new DPRINTFs. sure. > >> + ret = kvm_handle_nmi(cpu, run); >> + break; >> + >> default: >> fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); >> ret = -1; >> @@ -2803,6 +2808,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) >> return data & 0xffff; >> } >> >> +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run) >> +{ >> + bool recovered = run->flags & KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; >> + >> + cpu_synchronize_state(CPU(cpu)); >> + >> + spapr_mce_req_event(cpu, recovered); >> + >> + return 0; >> +} >> + >> int kvmppc_enable_hwrng(void) >> { >> if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) { >> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h >> index 2c2ea30..df5e85f 100644 >> --- a/target/ppc/kvm_ppc.h >> +++ b/target/ppc/kvm_ppc.h >> @@ -80,6 +80,8 @@ bool kvmppc_hpt_needs_host_contiguous_pages(void); >> void kvm_check_mmu(PowerPCCPU *cpu, Error **errp); >> void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online); >> >> +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run); >> + >> #else >> >> static inline uint32_t kvmppc_get_tbfreq(void) >> >
On Mon, Mar 25, 2019 at 01:31:12PM +0530, Aravinda Prasad wrote: > > > On Monday 25 March 2019 11:52 AM, David Gibson wrote: > > On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: > >> Memory error such as bit flips that cannot be corrected > >> by hardware are passed on to the kernel for handling. > >> If the memory address in error belongs to guest then > >> the guest kernel is responsible for taking suitable action. > >> Patch [1] enhances KVM to exit guest with exit reason > >> set to KVM_EXIT_NMI in such cases. This patch handles > >> KVM_EXIT_NMI exit. > >> > >> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html > >> (e20bbd3d and related commits) > >> > >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> > >> --- > >> hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ > >> include/hw/ppc/spapr.h | 1 + > >> target/ppc/kvm.c | 16 ++++++++++++++++ > >> target/ppc/kvm_ppc.h | 2 ++ > >> 4 files changed, 41 insertions(+) > >> > >> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c > >> index ae0f093..e7a24ad 100644 > >> --- a/hw/ppc/spapr_events.c > >> +++ b/hw/ppc/spapr_events.c > >> @@ -620,6 +620,28 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type, > >> RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id); > >> } > >> > >> +void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > >> +{ > >> + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > >> + > >> + while (spapr->mc_status != -1) { > >> + /* > >> + * Check whether the same CPU got machine check error > >> + * while still handling the mc error (i.e., before > >> + * that CPU called "ibm,nmi-interlock" > >> + */ > >> + if (spapr->mc_status == cpu->vcpu_id) { > >> + qemu_system_guest_panicked(NULL); > >> + } > >> + qemu_cond_wait_iothread(&spapr->mc_delivery_cond); > >> + /* If the system is reset meanwhile, then just return */ > >> + if (spapr->mc_reset) { > > > > I don't really see what this accomplishes. IIUC mc_reset is true from > > reset time until nmi-register is called. Which means you could just > > check for guest_mnachine_check_addre being unset - in which case don't > > you need to fallback to the old machine check behaviour anyway? > > We can check for guest_machine_check_addr being unset instead of mc_reset. > > Do we need any kind of memory barriers to ensure that this thread reads > the updated guest_machine_check_addr/mc_reset? IIUC these variables are all protected by the global qemu mutex, so that should include the necessary memory barriers already. > We don't have to fallback to the old machine check behavior, because > guest_machine_check_addr is unset only during system reset. Yes... which means that between reset and re-registering, we should be using the old machine check behaviour, yes? Which is exactly this situation.
On Tuesday 26 March 2019 05:01 AM, David Gibson wrote: > On Mon, Mar 25, 2019 at 01:31:12PM +0530, Aravinda Prasad wrote: >> >> >> On Monday 25 March 2019 11:52 AM, David Gibson wrote: >>> On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: >>>> Memory error such as bit flips that cannot be corrected >>>> by hardware are passed on to the kernel for handling. >>>> If the memory address in error belongs to guest then >>>> the guest kernel is responsible for taking suitable action. >>>> Patch [1] enhances KVM to exit guest with exit reason >>>> set to KVM_EXIT_NMI in such cases. This patch handles >>>> KVM_EXIT_NMI exit. >>>> >>>> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html >>>> (e20bbd3d and related commits) >>>> >>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> >>>> --- >>>> hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ >>>> include/hw/ppc/spapr.h | 1 + >>>> target/ppc/kvm.c | 16 ++++++++++++++++ >>>> target/ppc/kvm_ppc.h | 2 ++ >>>> 4 files changed, 41 insertions(+) >>>> >>>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c >>>> index ae0f093..e7a24ad 100644 >>>> --- a/hw/ppc/spapr_events.c >>>> +++ b/hw/ppc/spapr_events.c >>>> @@ -620,6 +620,28 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type, >>>> RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id); >>>> } >>>> >>>> +void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) >>>> +{ >>>> + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); >>>> + >>>> + while (spapr->mc_status != -1) { >>>> + /* >>>> + * Check whether the same CPU got machine check error >>>> + * while still handling the mc error (i.e., before >>>> + * that CPU called "ibm,nmi-interlock" >>>> + */ >>>> + if (spapr->mc_status == cpu->vcpu_id) { >>>> + qemu_system_guest_panicked(NULL); >>>> + } >>>> + qemu_cond_wait_iothread(&spapr->mc_delivery_cond); >>>> + /* If the system is reset meanwhile, then just return */ >>>> + if (spapr->mc_reset) { >>> >>> I don't really see what this accomplishes. IIUC mc_reset is true from >>> reset time until nmi-register is called. Which means you could just >>> check for guest_mnachine_check_addre being unset - in which case don't >>> you need to fallback to the old machine check behaviour anyway? >> >> We can check for guest_machine_check_addr being unset instead of mc_reset. >> >> Do we need any kind of memory barriers to ensure that this thread reads >> the updated guest_machine_check_addr/mc_reset? > > IIUC these variables are all protected by the global qemu mutex, so > that should include the necessary memory barriers already. ok. > >> We don't have to fallback to the old machine check behavior, because >> guest_machine_check_addr is unset only during system reset. > > Yes... which means that between reset and re-registering, we should be > using the old machine check behaviour, yes? Which is exactly this > situation. Yes..we should use the old behavior during that time window. >
On Monday 25 March 2019 11:52 AM, David Gibson wrote: > On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: >> Memory error such as bit flips that cannot be corrected >> by hardware are passed on to the kernel for handling. >> If the memory address in error belongs to guest then >> the guest kernel is responsible for taking suitable action. >> Patch [1] enhances KVM to exit guest with exit reason >> set to KVM_EXIT_NMI in such cases. This patch handles >> KVM_EXIT_NMI exit. >> >> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html >> (e20bbd3d and related commits) >> >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> >> --- >> hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ >> include/hw/ppc/spapr.h | 1 + >> target/ppc/kvm.c | 16 ++++++++++++++++ >> target/ppc/kvm_ppc.h | 2 ++ >> 4 files changed, 41 insertions(+) >> [...] >> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c >> index 2427c8e..a593448 100644 >> --- a/target/ppc/kvm.c >> +++ b/target/ppc/kvm.c >> @@ -1738,6 +1738,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) >> ret = 0; >> break; >> >> + case KVM_EXIT_NMI: >> + DPRINTF("handle NMI exception\n"); > > tracepoints are generally preferred to new DPRINTFs. I see DPRINTFs used in all other exit reasons in this function. Do you want me to change this particular exit case to tracepoints? I think it is better to keep this DPRINTF as of now and change all the DPRINTFs to tracepoints in a separate patch set. Regards, Aravinda > >> + ret = kvm_handle_nmi(cpu, run); >> + break; >> + >> default: >> fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); >> ret = -1; >> @@ -2803,6 +2808,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) >> return data & 0xffff; >> } >> >> +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run) >> +{ >> + bool recovered = run->flags & KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; >> + >> + cpu_synchronize_state(CPU(cpu)); >> + >> + spapr_mce_req_event(cpu, recovered); >> + >> + return 0; >> +} >> + >> int kvmppc_enable_hwrng(void) >> { >> if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) { >> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h >> index 2c2ea30..df5e85f 100644 >> --- a/target/ppc/kvm_ppc.h >> +++ b/target/ppc/kvm_ppc.h >> @@ -80,6 +80,8 @@ bool kvmppc_hpt_needs_host_contiguous_pages(void); >> void kvm_check_mmu(PowerPCCPU *cpu, Error **errp); >> void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online); >> >> +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run); >> + >> #else >> >> static inline uint32_t kvmppc_get_tbfreq(void) >> >
On Thu, 4 Apr 2019 14:40:45 +0530 Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote: > On Monday 25 March 2019 11:52 AM, David Gibson wrote: > > On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: > >> Memory error such as bit flips that cannot be corrected > >> by hardware are passed on to the kernel for handling. > >> If the memory address in error belongs to guest then > >> the guest kernel is responsible for taking suitable action. > >> Patch [1] enhances KVM to exit guest with exit reason > >> set to KVM_EXIT_NMI in such cases. This patch handles > >> KVM_EXIT_NMI exit. > >> > >> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html > >> (e20bbd3d and related commits) > >> > >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> > >> --- > >> hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ > >> include/hw/ppc/spapr.h | 1 + > >> target/ppc/kvm.c | 16 ++++++++++++++++ > >> target/ppc/kvm_ppc.h | 2 ++ > >> 4 files changed, 41 insertions(+) > >> > > [...] > > >> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c > >> index 2427c8e..a593448 100644 > >> --- a/target/ppc/kvm.c > >> +++ b/target/ppc/kvm.c > >> @@ -1738,6 +1738,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) > >> ret = 0; > >> break; > >> > >> + case KVM_EXIT_NMI: > >> + DPRINTF("handle NMI exception\n"); > > > > tracepoints are generally preferred to new DPRINTFs. > > I see DPRINTFs used in all other exit reasons in this function. Do you > want me to change this particular exit case to tracepoints? I think it > is better to keep this DPRINTF as of now and change all the DPRINTFs to > tracepoints in a separate patch set. > git grep shows that all DPRINTFs (25 of them) are in target/ppc/kvm.c. I guess this can be addressed with a single, mechanical and easy to review preparatory patch. > Regards, > Aravinda > > > > >> + ret = kvm_handle_nmi(cpu, run); > >> + break; > >> + > >> default: > >> fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); > >> ret = -1; > >> @@ -2803,6 +2808,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) > >> return data & 0xffff; > >> } > >> > >> +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run) > >> +{ > >> + bool recovered = run->flags & KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; > >> + > >> + cpu_synchronize_state(CPU(cpu)); > >> + > >> + spapr_mce_req_event(cpu, recovered); > >> + > >> + return 0; > >> +} > >> + > >> int kvmppc_enable_hwrng(void) > >> { > >> if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) { > >> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h > >> index 2c2ea30..df5e85f 100644 > >> --- a/target/ppc/kvm_ppc.h > >> +++ b/target/ppc/kvm_ppc.h > >> @@ -80,6 +80,8 @@ bool kvmppc_hpt_needs_host_contiguous_pages(void); > >> void kvm_check_mmu(PowerPCCPU *cpu, Error **errp); > >> void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online); > >> > >> +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run); > >> + > >> #else > >> > >> static inline uint32_t kvmppc_get_tbfreq(void) > >> > > >
On Thu, Apr 04, 2019 at 02:40:45PM +0530, Aravinda Prasad wrote: > > > On Monday 25 March 2019 11:52 AM, David Gibson wrote: > > On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: > >> Memory error such as bit flips that cannot be corrected > >> by hardware are passed on to the kernel for handling. > >> If the memory address in error belongs to guest then > >> the guest kernel is responsible for taking suitable action. > >> Patch [1] enhances KVM to exit guest with exit reason > >> set to KVM_EXIT_NMI in such cases. This patch handles > >> KVM_EXIT_NMI exit. > >> > >> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html > >> (e20bbd3d and related commits) > >> > >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> > >> --- > >> hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ > >> include/hw/ppc/spapr.h | 1 + > >> target/ppc/kvm.c | 16 ++++++++++++++++ > >> target/ppc/kvm_ppc.h | 2 ++ > >> 4 files changed, 41 insertions(+) > >> > > [...] > > >> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c > >> index 2427c8e..a593448 100644 > >> --- a/target/ppc/kvm.c > >> +++ b/target/ppc/kvm.c > >> @@ -1738,6 +1738,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) > >> ret = 0; > >> break; > >> > >> + case KVM_EXIT_NMI: > >> + DPRINTF("handle NMI exception\n"); > > > > tracepoints are generally preferred to new DPRINTFs. > > I see DPRINTFs used in all other exit reasons in this function. Do you > want me to change this particular exit case to tracepoints? I think it > is better to keep this DPRINTF as of now and change all the DPRINTFs to > tracepoints in a separate patch set. Ah, good point. Tracepoints are generally preferred, but since DPRINTFs are in use here, stick with that (at some point it would be good to change the whole file, but that's out of scope here).
On 05/04/2019 10:17, David Gibson wrote: > On Thu, Apr 04, 2019 at 02:40:45PM +0530, Aravinda Prasad wrote: >> >> >> On Monday 25 March 2019 11:52 AM, David Gibson wrote: >>> On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: >>>> Memory error such as bit flips that cannot be corrected >>>> by hardware are passed on to the kernel for handling. >>>> If the memory address in error belongs to guest then >>>> the guest kernel is responsible for taking suitable action. >>>> Patch [1] enhances KVM to exit guest with exit reason >>>> set to KVM_EXIT_NMI in such cases. This patch handles >>>> KVM_EXIT_NMI exit. >>>> >>>> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html >>>> (e20bbd3d and related commits) >>>> >>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> >>>> --- >>>> hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ >>>> include/hw/ppc/spapr.h | 1 + >>>> target/ppc/kvm.c | 16 ++++++++++++++++ >>>> target/ppc/kvm_ppc.h | 2 ++ >>>> 4 files changed, 41 insertions(+) >>>> >> >> [...] >> >>>> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c >>>> index 2427c8e..a593448 100644 >>>> --- a/target/ppc/kvm.c >>>> +++ b/target/ppc/kvm.c >>>> @@ -1738,6 +1738,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) >>>> ret = 0; >>>> break; >>>> >>>> + case KVM_EXIT_NMI: >>>> + DPRINTF("handle NMI exception\n"); >>> >>> tracepoints are generally preferred to new DPRINTFs. >> >> I see DPRINTFs used in all other exit reasons in this function. Do you >> want me to change this particular exit case to tracepoints? I think it >> is better to keep this DPRINTF as of now and change all the DPRINTFs to >> tracepoints in a separate patch set. > > Ah, good point. imho not. The kvm.c already knows about traces (there are two) and even if every other trace in kvm_arch_handle_exit() is DPRINTF (enabled all at once), having at least one which can be enabled without QEMU recompile and separately from the others is a small but nice bonus before someone gets rid of DPRINTF. > Tracepoints are generally preferred, but since > DPRINTFs are in use here, stick with that (at some point it would be > good to change the whole file, but that's out of scope here).
On Fri, 5 Apr 2019 16:04:27 +1100 Alexey Kardashevskiy <aik@linux.ibm.com> wrote: > On 05/04/2019 10:17, David Gibson wrote: > > On Thu, Apr 04, 2019 at 02:40:45PM +0530, Aravinda Prasad wrote: > >> > >> > >> On Monday 25 March 2019 11:52 AM, David Gibson wrote: > >>> On Fri, Mar 22, 2019 at 12:03:58PM +0530, Aravinda Prasad wrote: > >>>> Memory error such as bit flips that cannot be corrected > >>>> by hardware are passed on to the kernel for handling. > >>>> If the memory address in error belongs to guest then > >>>> the guest kernel is responsible for taking suitable action. > >>>> Patch [1] enhances KVM to exit guest with exit reason > >>>> set to KVM_EXIT_NMI in such cases. This patch handles > >>>> KVM_EXIT_NMI exit. > >>>> > >>>> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html > >>>> (e20bbd3d and related commits) > >>>> > >>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> > >>>> --- > >>>> hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ > >>>> include/hw/ppc/spapr.h | 1 + > >>>> target/ppc/kvm.c | 16 ++++++++++++++++ > >>>> target/ppc/kvm_ppc.h | 2 ++ > >>>> 4 files changed, 41 insertions(+) > >>>> > >> > >> [...] > >> > >>>> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c > >>>> index 2427c8e..a593448 100644 > >>>> --- a/target/ppc/kvm.c > >>>> +++ b/target/ppc/kvm.c > >>>> @@ -1738,6 +1738,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) > >>>> ret = 0; > >>>> break; > >>>> > >>>> + case KVM_EXIT_NMI: > >>>> + DPRINTF("handle NMI exception\n"); > >>> > >>> tracepoints are generally preferred to new DPRINTFs. > >> > >> I see DPRINTFs used in all other exit reasons in this function. Do you > >> want me to change this particular exit case to tracepoints? I think it > >> is better to keep this DPRINTF as of now and change all the DPRINTFs to > >> tracepoints in a separate patch set. > > > > Ah, good point. > > imho not. The kvm.c already knows about traces (there are two) and even > if every other trace in kvm_arch_handle_exit() is DPRINTF (enabled all > at once), having at least one which can be enabled without QEMU > recompile and separately from the others is a small but nice bonus > before someone gets rid of DPRINTF. > Done. https://patchwork.ozlabs.org/project/qemu-devel/list/?series=101141 > > > Tracepoints are generally preferred, but since > > DPRINTFs are in use here, stick with that (at some point it would be > > good to change the whole file, but that's out of scope here). > >
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c index ae0f093..e7a24ad 100644 --- a/hw/ppc/spapr_events.c +++ b/hw/ppc/spapr_events.c @@ -620,6 +620,28 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type, RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id); } +void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); + + while (spapr->mc_status != -1) { + /* + * Check whether the same CPU got machine check error + * while still handling the mc error (i.e., before + * that CPU called "ibm,nmi-interlock" + */ + if (spapr->mc_status == cpu->vcpu_id) { + qemu_system_guest_panicked(NULL); + } + qemu_cond_wait_iothread(&spapr->mc_delivery_cond); + /* If the system is reset meanwhile, then just return */ + if (spapr->mc_reset) { + return; + } + } + spapr->mc_status = cpu->vcpu_id; +} + static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index ee5589d..b0d8c18 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -792,6 +792,7 @@ void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp); void spapr_clear_pending_events(SpaprMachineState *spapr); int spapr_max_server_number(SpaprMachineState *spapr); +void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered); /* DRC callbacks. */ void spapr_core_release(DeviceState *dev); diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index 2427c8e..a593448 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -1738,6 +1738,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) ret = 0; break; + case KVM_EXIT_NMI: + DPRINTF("handle NMI exception\n"); + ret = kvm_handle_nmi(cpu, run); + break; + default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); ret = -1; @@ -2803,6 +2808,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) return data & 0xffff; } +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run) +{ + bool recovered = run->flags & KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; + + cpu_synchronize_state(CPU(cpu)); + + spapr_mce_req_event(cpu, recovered); + + return 0; +} + int kvmppc_enable_hwrng(void) { if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) { diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h index 2c2ea30..df5e85f 100644 --- a/target/ppc/kvm_ppc.h +++ b/target/ppc/kvm_ppc.h @@ -80,6 +80,8 @@ bool kvmppc_hpt_needs_host_contiguous_pages(void); void kvm_check_mmu(PowerPCCPU *cpu, Error **errp); void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online); +int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run); + #else static inline uint32_t kvmppc_get_tbfreq(void)
Memory error such as bit flips that cannot be corrected by hardware are passed on to the kernel for handling. If the memory address in error belongs to guest then the guest kernel is responsible for taking suitable action. Patch [1] enhances KVM to exit guest with exit reason set to KVM_EXIT_NMI in such cases. This patch handles KVM_EXIT_NMI exit. [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html (e20bbd3d and related commits) Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com> --- hw/ppc/spapr_events.c | 22 ++++++++++++++++++++++ include/hw/ppc/spapr.h | 1 + target/ppc/kvm.c | 16 ++++++++++++++++ target/ppc/kvm_ppc.h | 2 ++ 4 files changed, 41 insertions(+)