Message ID | 20250110211405.2284121-4-william.roche@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Poisoned memory recovery on reboot | expand |
On 10.01.25 22:14, William Roche wrote: > From: William Roche <william.roche@oracle.com> > > In case of a large page impacted by a memory error, enhance > the existing Qemu error message which indicates that the error > is injected in the VM, adding "on lost large page SIZE@ADDR". > > Include also a similar message to the ARM platform. > > In the case of a large page impacted, we now report: > ...Memory Error at QEMU addr X and GUEST addr Y on lost large page SIZE@ADDR of type... > > Signed-off-by: William Roche <william.roche@oracle.com> > --- > accel/kvm/kvm-all.c | 4 ---- > target/arm/kvm.c | 13 +++++++++++++ > target/i386/kvm/kvm.c | 18 ++++++++++++++---- > 3 files changed, 27 insertions(+), 8 deletions(-) > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > index 4f2abd5774..f89568bfa3 100644 > --- a/accel/kvm/kvm-all.c > +++ b/accel/kvm/kvm-all.c > @@ -1296,10 +1296,6 @@ static void kvm_unpoison_all(void *param) > void kvm_hwpoison_page_add(ram_addr_t ram_addr) > { > HWPoisonPage *page; > - size_t page_size = qemu_ram_pagesize_from_addr(ram_addr); > - > - if (page_size > TARGET_PAGE_SIZE) > - ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size); > > QLIST_FOREACH(page, &hwpoison_page_list, list) { > if (page->ram_addr == ram_addr) { > diff --git a/target/arm/kvm.c b/target/arm/kvm.c > index a9444a2c7a..323ce0045d 100644 > --- a/target/arm/kvm.c > +++ b/target/arm/kvm.c > @@ -2366,6 +2366,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > { > ram_addr_t ram_addr; > hwaddr paddr; > + size_t page_size; > + char lp_msg[54]; > > assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); > > @@ -2373,6 +2375,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > ram_addr = qemu_ram_addr_from_host(addr); > if (ram_addr != RAM_ADDR_INVALID && > kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { > + page_size = qemu_ram_pagesize_from_addr(ram_addr); > + if (page_size > TARGET_PAGE_SIZE) { > + ram_addr = 
ROUND_DOWN(ram_addr, page_size); > + snprintf(lp_msg, sizeof(lp_msg), " on lost large page " > + RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr); > + } else { > + lp_msg[0] = '\0'; > + } > kvm_hwpoison_page_add(ram_addr); > /* > * If this is a BUS_MCEERR_AR, we know we have been called > @@ -2389,6 +2399,9 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > kvm_cpu_synchronize_state(c); > if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) { > kvm_inject_arm_sea(c); > + error_report("Guest Memory Error at QEMU addr %p and " > + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", > + addr, paddr, lp_msg, "BUS_MCEERR_AR"); > } else { > error_report("failed to record the error"); > abort(); > diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c > index 2f66e63b88..7715cab7cf 100644 > --- a/target/i386/kvm/kvm.c > +++ b/target/i386/kvm/kvm.c > @@ -741,6 +741,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > CPUX86State *env = &cpu->env; > ram_addr_t ram_addr; > hwaddr paddr; > + size_t page_size; > + char lp_msg[54]; > > /* If we get an action required MCE, it has been injected by KVM > * while the VM was running. An action optional MCE instead should > @@ -753,6 +755,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > ram_addr = qemu_ram_addr_from_host(addr); > if (ram_addr != RAM_ADDR_INVALID && > kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { > + page_size = qemu_ram_pagesize_from_addr(ram_addr); > + if (page_size > TARGET_PAGE_SIZE) { > + ram_addr = ROUND_DOWN(ram_addr, page_size); As raised, aligning ram_addr_t addresses to page_size is wrong. Maybe we really want to print block->idstr, offset, size like I proposed at the other place, here as well?
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 4f2abd5774..f89568bfa3 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1296,10 +1296,6 @@ static void kvm_unpoison_all(void *param) void kvm_hwpoison_page_add(ram_addr_t ram_addr) { HWPoisonPage *page; - size_t page_size = qemu_ram_pagesize_from_addr(ram_addr); - - if (page_size > TARGET_PAGE_SIZE) - ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size); QLIST_FOREACH(page, &hwpoison_page_list, list) { if (page->ram_addr == ram_addr) { diff --git a/target/arm/kvm.c b/target/arm/kvm.c index a9444a2c7a..323ce0045d 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -2366,6 +2366,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) { ram_addr_t ram_addr; hwaddr paddr; + size_t page_size; + char lp_msg[54]; assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); @@ -2373,6 +2375,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) ram_addr = qemu_ram_addr_from_host(addr); if (ram_addr != RAM_ADDR_INVALID && kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { + page_size = qemu_ram_pagesize_from_addr(ram_addr); + if (page_size > TARGET_PAGE_SIZE) { + ram_addr = ROUND_DOWN(ram_addr, page_size); + snprintf(lp_msg, sizeof(lp_msg), " on lost large page " + RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr); + } else { + lp_msg[0] = '\0'; + } kvm_hwpoison_page_add(ram_addr); /* * If this is a BUS_MCEERR_AR, we know we have been called @@ -2389,6 +2399,9 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) kvm_cpu_synchronize_state(c); if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) { kvm_inject_arm_sea(c); + error_report("Guest Memory Error at QEMU addr %p and " + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", + addr, paddr, lp_msg, "BUS_MCEERR_AR"); } else { error_report("failed to record the error"); abort(); diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 2f66e63b88..7715cab7cf 100644 --- 
a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -741,6 +741,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) CPUX86State *env = &cpu->env; ram_addr_t ram_addr; hwaddr paddr; + size_t page_size; + char lp_msg[54]; /* If we get an action required MCE, it has been injected by KVM * while the VM was running. An action optional MCE instead should @@ -753,6 +755,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) ram_addr = qemu_ram_addr_from_host(addr); if (ram_addr != RAM_ADDR_INVALID && kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { + page_size = qemu_ram_pagesize_from_addr(ram_addr); + if (page_size > TARGET_PAGE_SIZE) { + ram_addr = ROUND_DOWN(ram_addr, page_size); + snprintf(lp_msg, sizeof(lp_msg), " on lost large page " + RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr); + } else { + lp_msg[0] = '\0'; + } kvm_hwpoison_page_add(ram_addr); kvm_mce_inject(cpu, paddr, code); @@ -763,12 +773,12 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) */ if (code == BUS_MCEERR_AR) { error_report("Guest MCE Memory Error at QEMU addr %p and " - "GUEST addr 0x%" HWADDR_PRIx " of type %s injected", - addr, paddr, "BUS_MCEERR_AR"); + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", + addr, paddr, lp_msg, "BUS_MCEERR_AR"); } else { warn_report("Guest MCE Memory Error at QEMU addr %p and " - "GUEST addr 0x%" HWADDR_PRIx " of type %s injected", - addr, paddr, "BUS_MCEERR_AO"); + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", + addr, paddr, lp_msg, "BUS_MCEERR_AO"); } return;