diff mbox series

[PULL,06/10] ppc/spapr: Don't kill the guest if a recovered FWNMI machine check delivery fails

Message ID 20200407043606.291546-7-david@gibson.dropbear.id.au (mailing list archive)
State New, archived
Headers show
Series [PULL,01/10] hw/ppc/e500.c: Handle qemu_find_file() failure | expand

Commit Message

David Gibson April 7, 2020, 4:36 a.m. UTC
From: Nicholas Piggin <npiggin@gmail.com>

Try to be tolerant of FWNMI delivery errors if the machine check had been
recovered by the host.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Message-Id: <20200325142906.221248-5-npiggin@gmail.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
[dwg: Updated comment at Greg's suggestion]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 hw/ppc/spapr_events.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index c8964eb25d..1069d0197b 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -833,13 +833,28 @@  static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
     /* get rtas addr from fdt */
     rtas_addr = spapr_get_rtas_addr();
     if (!rtas_addr) {
-        error_report(
+        if (!recovered) {
+            error_report(
 "FWNMI: Unable to deliver machine check to guest: rtas_addr not found.");
-        qemu_system_guest_panicked(NULL);
+            qemu_system_guest_panicked(NULL);
+        } else {
+            warn_report(
+"FWNMI: Unable to deliver machine check to guest: rtas_addr not found. "
+"Machine check recovered.");
+        }
         g_free(ext_elog);
         return;
     }
 
+    /*
+     * By taking the interlock, we assume that the MCE will be
+     * delivered to the guest. CAUTION: don't add anything that could
+     * prevent the MCE to be delivered after this line, otherwise the
+     * guest won't be able to release the interlock and ultimately
+     * hang/crash?
+     */
+    spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
+
     stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET,
                 env->gpr[3]);
     cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET +
@@ -876,9 +891,15 @@  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
          * that CPU called "ibm,nmi-interlock")
          */
         if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) {
-            error_report(
+            if (!recovered) {
+                error_report(
 "FWNMI: Unable to deliver machine check to guest: nested machine check.");
-            qemu_system_guest_panicked(NULL);
+                qemu_system_guest_panicked(NULL);
+            } else {
+                warn_report(
+"FWNMI: Unable to deliver machine check to guest: nested machine check. "
+"Machine check recovered.");
+            }
             return;
         }
         qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond);
@@ -906,7 +927,6 @@  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
         warn_report("Received a fwnmi while migration was in progress");
     }
 
-    spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
     spapr_mce_dispatch_elog(cpu, recovered);
 }