diff mbox series

[3/4] acpi/ghes: Allow retry to write CPER errors

Message ID 20250214041635.608012-4-gshan@redhat.com (mailing list archive)
State New
Headers show
Series target/arm: Improvement on memory error handling | expand

Commit Message

Gavin Shan Feb. 14, 2025, 4:16 a.m. UTC
Multiple CPER errors can be raised on multiple vCPUs at the same
time. The error -1 is returned from ghes_record_cper_errors() and
QEMU is terminated due to abort() in kvm_arch_on_sigbus_vcpu().
This isn't the correct or expected behaviour, since the affected vCPU
can't proceed with execution. It's reasonable to retry if the
reported error is transient, for example the previously reported
CPER error isn't claimed by the guest.

Add one more parameter (@retry_allowed) to acpi_ghes_memory_errors(),
passed down to ghes_record_cper_errors(). If the previously reported
CPER error hasn't been claimed by the guest, a distinct error number
is returned: 1 when @retry_allowed is true, or -1 otherwise. The
caller will retry the request if the returned error number is 1.

Signed-off-by: Gavin Shan <gshan@redhat.com>
---
 hw/acpi/ghes-stub.c    |  3 ++-
 hw/acpi/ghes.c         | 12 +++++++++---
 include/hw/acpi/ghes.h |  3 ++-
 target/arm/kvm.c       |  2 +-
 4 files changed, 14 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
index 7cec1812da..5c807afe21 100644
--- a/hw/acpi/ghes-stub.c
+++ b/hw/acpi/ghes-stub.c
@@ -11,7 +11,8 @@ 
 #include "qemu/osdep.h"
 #include "hw/acpi/ghes.h"
 
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address,
+                            bool retry_allowed)
 {
     return -1;
 }
diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index a67326fd50..60587f3c1b 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -391,7 +391,7 @@  static void get_hw_error_offsets(uint64_t ghes_addr,
 }
 
 static int ghes_record_cper_errors(const void *cper, size_t len,
-                                   uint16_t source_id)
+                                   uint16_t source_id, bool retry_allowed)
 {
     uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
     AcpiGedState *acpi_ged_state;
@@ -424,6 +424,10 @@  static int ghes_record_cper_errors(const void *cper, size_t len,
 
     /* zero means OSPM does not acknowledge the error */
     if (!read_ack_register) {
+        if (retry_allowed) {
+            return 1;
+        }
+
         error_report("OSPM does not acknowledge previous error,"
                      " so can not record CPER for current error anymore");
         return -1;
@@ -443,7 +447,8 @@  static int ghes_record_cper_errors(const void *cper, size_t len,
     return 0;
 }
 
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address,
+                            bool retry_allowed)
 {
     /* Memory Error Section Type */
     const uint8_t guid[] =
@@ -468,7 +473,8 @@  int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
     acpi_ghes_build_append_mem_cper(block, physical_address);
 
     /* Report the error */
-    ret = ghes_record_cper_errors(block->data, block->len, source_id);
+    ret = ghes_record_cper_errors(block->data, block->len,
+                                  source_id, retry_allowed);
 
     g_array_free(block, true);
 
diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
index 578a582203..1dad62100a 100644
--- a/include/hw/acpi/ghes.h
+++ b/include/hw/acpi/ghes.h
@@ -74,7 +74,8 @@  void acpi_build_hest(GArray *table_data, GArray *hardware_errors,
                      const char *oem_id, const char *oem_table_id);
 void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s,
                           GArray *hardware_errors);
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t error_physical_addr);
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t error_physical_addr,
+                            bool retry_allowed);
 
 /**
  * acpi_ghes_present: Report whether ACPI GHES table is present
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index da30bdbb23..5c0bf99aec 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -2387,7 +2387,7 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
              */
             if (code == BUS_MCEERR_AR) {
                 kvm_cpu_synchronize_state(c);
-                if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
+                if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr, false)) {
                     kvm_inject_arm_sea(c);
                 } else {
                     error_report("failed to record the error");