diff mbox series

[v5,3/6] accel/kvm: Report the loss of a large memory page

Message ID 20250110211405.2284121-4-william.roche@oracle.com (mailing list archive)
State New
Headers show
Series Poisoned memory recovery on reboot | expand

Commit Message

William Roche Jan. 10, 2025, 9:14 p.m. UTC
From: William Roche <william.roche@oracle.com>

When a large page is impacted by a memory error, enhance
the existing QEMU error message, which indicates that the error
is injected into the VM, by adding "on lost large page SIZE@ADDR".

Also add a similar message for the ARM platform.

In the case of a large page impacted, we now report:
...Memory Error at QEMU addr X and GUEST addr Y on lost large page SIZE@ADDR of type...

Signed-off-by: William Roche <william.roche@oracle.com>
---
 accel/kvm/kvm-all.c   |  4 ----
 target/arm/kvm.c      | 13 +++++++++++++
 target/i386/kvm/kvm.c | 18 ++++++++++++++----
 3 files changed, 27 insertions(+), 8 deletions(-)

Comments

David Hildenbrand Jan. 14, 2025, 2:09 p.m. UTC | #1
On 10.01.25 22:14, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
> 
> In case of a large page impacted by a memory error, enhance
> the existing Qemu error message which indicates that the error
> is injected in the VM, adding "on lost large page SIZE@ADDR".
> 
> Include also a similar message to the ARM platform.
> 
> In the case of a large page impacted, we now report:
> ...Memory Error at QEMU addr X and GUEST addr Y on lost large page SIZE@ADDR of type...
> 
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
>   accel/kvm/kvm-all.c   |  4 ----
>   target/arm/kvm.c      | 13 +++++++++++++
>   target/i386/kvm/kvm.c | 18 ++++++++++++++----
>   3 files changed, 27 insertions(+), 8 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 4f2abd5774..f89568bfa3 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1296,10 +1296,6 @@ static void kvm_unpoison_all(void *param)
>   void kvm_hwpoison_page_add(ram_addr_t ram_addr)
>   {
>       HWPoisonPage *page;
> -    size_t page_size = qemu_ram_pagesize_from_addr(ram_addr);
> -
> -    if (page_size > TARGET_PAGE_SIZE)
> -        ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size);
>   
>       QLIST_FOREACH(page, &hwpoison_page_list, list) {
>           if (page->ram_addr == ram_addr) {
> diff --git a/target/arm/kvm.c b/target/arm/kvm.c
> index a9444a2c7a..323ce0045d 100644
> --- a/target/arm/kvm.c
> +++ b/target/arm/kvm.c
> @@ -2366,6 +2366,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>   {
>       ram_addr_t ram_addr;
>       hwaddr paddr;
> +    size_t page_size;
> +    char lp_msg[54];
>   
>       assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
>   
> @@ -2373,6 +2375,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>           ram_addr = qemu_ram_addr_from_host(addr);
>           if (ram_addr != RAM_ADDR_INVALID &&
>               kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
> +            page_size = qemu_ram_pagesize_from_addr(ram_addr);
> +            if (page_size > TARGET_PAGE_SIZE) {
> +                ram_addr = ROUND_DOWN(ram_addr, page_size);
> +                snprintf(lp_msg, sizeof(lp_msg), " on lost large page "
> +                    RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr);
> +            } else {
> +                lp_msg[0] = '\0';
> +            }
>               kvm_hwpoison_page_add(ram_addr);
>               /*
>                * If this is a BUS_MCEERR_AR, we know we have been called
> @@ -2389,6 +2399,9 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>                   kvm_cpu_synchronize_state(c);
>                   if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
>                       kvm_inject_arm_sea(c);
> +                    error_report("Guest Memory Error at QEMU addr %p and "
> +                        "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected",
> +                        addr, paddr, lp_msg, "BUS_MCEERR_AR");
>                   } else {
>                       error_report("failed to record the error");
>                       abort();
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 2f66e63b88..7715cab7cf 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -741,6 +741,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>       CPUX86State *env = &cpu->env;
>       ram_addr_t ram_addr;
>       hwaddr paddr;
> +    size_t page_size;
> +    char lp_msg[54];
>   
>       /* If we get an action required MCE, it has been injected by KVM
>        * while the VM was running.  An action optional MCE instead should
> @@ -753,6 +755,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>           ram_addr = qemu_ram_addr_from_host(addr);
>           if (ram_addr != RAM_ADDR_INVALID &&
>               kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
> +            page_size = qemu_ram_pagesize_from_addr(ram_addr);
> +            if (page_size > TARGET_PAGE_SIZE) {
> +                ram_addr = ROUND_DOWN(ram_addr, page_size);

As raised, aligning ram_addr_t addresses to page_size is wrong.

Maybe we really want to print block->idstr, offset, size like I proposed 
at the other place, here as well?
diff mbox series

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 4f2abd5774..f89568bfa3 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1296,10 +1296,6 @@  static void kvm_unpoison_all(void *param)
 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
 {
     HWPoisonPage *page;
-    size_t page_size = qemu_ram_pagesize_from_addr(ram_addr);
-
-    if (page_size > TARGET_PAGE_SIZE)
-        ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size);
 
     QLIST_FOREACH(page, &hwpoison_page_list, list) {
         if (page->ram_addr == ram_addr) {
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index a9444a2c7a..323ce0045d 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -2366,6 +2366,8 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
 {
     ram_addr_t ram_addr;
     hwaddr paddr;
+    size_t page_size;
+    char lp_msg[54];
 
     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
 
@@ -2373,6 +2375,14 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
+            page_size = qemu_ram_pagesize_from_addr(ram_addr);
+            if (page_size > TARGET_PAGE_SIZE) {
+                ram_addr = ROUND_DOWN(ram_addr, page_size);
+                snprintf(lp_msg, sizeof(lp_msg), " on lost large page "
+                    RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr);
+            } else {
+                lp_msg[0] = '\0';
+            }
             kvm_hwpoison_page_add(ram_addr);
             /*
              * If this is a BUS_MCEERR_AR, we know we have been called
@@ -2389,6 +2399,9 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
                 kvm_cpu_synchronize_state(c);
                 if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
                     kvm_inject_arm_sea(c);
+                    error_report("Guest Memory Error at QEMU addr %p and "
+                        "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected",
+                        addr, paddr, lp_msg, "BUS_MCEERR_AR");
                 } else {
                     error_report("failed to record the error");
                     abort();
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 2f66e63b88..7715cab7cf 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -741,6 +741,8 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
     CPUX86State *env = &cpu->env;
     ram_addr_t ram_addr;
     hwaddr paddr;
+    size_t page_size;
+    char lp_msg[54];
 
     /* If we get an action required MCE, it has been injected by KVM
      * while the VM was running.  An action optional MCE instead should
@@ -753,6 +755,14 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
+            page_size = qemu_ram_pagesize_from_addr(ram_addr);
+            if (page_size > TARGET_PAGE_SIZE) {
+                ram_addr = ROUND_DOWN(ram_addr, page_size);
+                snprintf(lp_msg, sizeof(lp_msg), " on lost large page "
+                        RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr);
+            } else {
+                lp_msg[0] = '\0';
+            }
             kvm_hwpoison_page_add(ram_addr);
             kvm_mce_inject(cpu, paddr, code);
 
@@ -763,12 +773,12 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
              */
             if (code == BUS_MCEERR_AR) {
                 error_report("Guest MCE Memory Error at QEMU addr %p and "
-                    "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
-                    addr, paddr, "BUS_MCEERR_AR");
+                    "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected",
+                    addr, paddr, lp_msg, "BUS_MCEERR_AR");
             } else {
                  warn_report("Guest MCE Memory Error at QEMU addr %p and "
-                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
-                     addr, paddr, "BUS_MCEERR_AO");
+                     "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected",
+                     addr, paddr, lp_msg, "BUS_MCEERR_AO");
             }
 
             return;