Message ID | 20250110211405.2284121-4-william.roche@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Poisoned memory recovery on reboot | expand |
On 10.01.25 22:14, “William Roche wrote: > From: William Roche <william.roche@oracle.com> > > In case of a large page impacted by a memory error, enhance > the existing Qemu error message which indicates that the error > is injected in the VM, adding "on lost large page SIZE@ADDR". > > Include also a similar message to the ARM platform. > > In the case of a large page impacted, we now report: > ...Memory Error at QEMU addr X and GUEST addr Y on lost large page SIZE@ADDR of type... > > Signed-off-by: William Roche <william.roche@oracle.com> > --- > accel/kvm/kvm-all.c | 4 ---- > target/arm/kvm.c | 13 +++++++++++++ > target/i386/kvm/kvm.c | 18 ++++++++++++++---- > 3 files changed, 27 insertions(+), 8 deletions(-) > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > index 4f2abd5774..f89568bfa3 100644 > --- a/accel/kvm/kvm-all.c > +++ b/accel/kvm/kvm-all.c > @@ -1296,10 +1296,6 @@ static void kvm_unpoison_all(void *param) > void kvm_hwpoison_page_add(ram_addr_t ram_addr) > { > HWPoisonPage *page; > - size_t page_size = qemu_ram_pagesize_from_addr(ram_addr); > - > - if (page_size > TARGET_PAGE_SIZE) > - ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size); > > QLIST_FOREACH(page, &hwpoison_page_list, list) { > if (page->ram_addr == ram_addr) { > diff --git a/target/arm/kvm.c b/target/arm/kvm.c > index a9444a2c7a..323ce0045d 100644 > --- a/target/arm/kvm.c > +++ b/target/arm/kvm.c > @@ -2366,6 +2366,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > { > ram_addr_t ram_addr; > hwaddr paddr; > + size_t page_size; > + char lp_msg[54]; > > assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); > > @@ -2373,6 +2375,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > ram_addr = qemu_ram_addr_from_host(addr); > if (ram_addr != RAM_ADDR_INVALID && > kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { > + page_size = qemu_ram_pagesize_from_addr(ram_addr); > + if (page_size > TARGET_PAGE_SIZE) { > + ram_addr = 
ROUND_DOWN(ram_addr, page_size); > + snprintf(lp_msg, sizeof(lp_msg), " on lost large page " > + RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr); > + } else { > + lp_msg[0] = '\0'; > + } > kvm_hwpoison_page_add(ram_addr); > /* > * If this is a BUS_MCEERR_AR, we know we have been called > @@ -2389,6 +2399,9 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > kvm_cpu_synchronize_state(c); > if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) { > kvm_inject_arm_sea(c); > + error_report("Guest Memory Error at QEMU addr %p and " > + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", > + addr, paddr, lp_msg, "BUS_MCEERR_AR"); > } else { > error_report("failed to record the error"); > abort(); > diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c > index 2f66e63b88..7715cab7cf 100644 > --- a/target/i386/kvm/kvm.c > +++ b/target/i386/kvm/kvm.c > @@ -741,6 +741,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > CPUX86State *env = &cpu->env; > ram_addr_t ram_addr; > hwaddr paddr; > + size_t page_size; > + char lp_msg[54]; > > /* If we get an action required MCE, it has been injected by KVM > * while the VM was running. An action optional MCE instead should > @@ -753,6 +755,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > ram_addr = qemu_ram_addr_from_host(addr); > if (ram_addr != RAM_ADDR_INVALID && > kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { > + page_size = qemu_ram_pagesize_from_addr(ram_addr); > + if (page_size > TARGET_PAGE_SIZE) { > + ram_addr = ROUND_DOWN(ram_addr, page_size); As raised, aligning ram_addr_t addresses to page_size is wrong. Maybe we really want to print block->idstr, offset, size like I proposed at the other place, here as well?
On 1/14/25 15:09, David Hildenbrand wrote: > On 10.01.25 22:14, “William Roche wrote: >> From: William Roche <william.roche@oracle.com> >> >> In case of a large page impacted by a memory error, enhance >> the existing Qemu error message which indicates that the error >> is injected in the VM, adding "on lost large page SIZE@ADDR". >> >> Include also a similar message to the ARM platform. >> >> [...] >> diff --git a/target/arm/kvm.c b/target/arm/kvm.c >> index a9444a2c7a..323ce0045d 100644 >> --- a/target/arm/kvm.c >> +++ b/target/arm/kvm.c >> @@ -2366,6 +2366,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) >> { >> ram_addr_t ram_addr; >> hwaddr paddr; >> + size_t page_size; >> + char lp_msg[54]; >> assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); >> @@ -2373,6 +2375,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) >> ram_addr = qemu_ram_addr_from_host(addr); >> if (ram_addr != RAM_ADDR_INVALID && >> kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { >> + page_size = qemu_ram_pagesize_from_addr(ram_addr); >> + if (page_size > TARGET_PAGE_SIZE) { >> + ram_addr = ROUND_DOWN(ram_addr, page_size); >> + snprintf(lp_msg, sizeof(lp_msg), " on lost large page " >> + RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr); >> + } else { >> + lp_msg[0] = '\0'; >> + } >> kvm_hwpoison_page_add(ram_addr); >> /* >> * If this is a BUS_MCEERR_AR, we know we have been called >> @@ -2389,6 +2399,9 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) >> kvm_cpu_synchronize_state(c); >> if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) { >> kvm_inject_arm_sea(c); >> + error_report("Guest Memory Error at QEMU addr %p and " >> + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", >> + addr, paddr, lp_msg, "BUS_MCEERR_AR"); >> } else { >> error_report("failed to record the error"); >> abort(); >> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c >> index 2f66e63b88..7715cab7cf 100644 >> --- 
a/target/i386/kvm/kvm.c >> +++ b/target/i386/kvm/kvm.c >> @@ -741,6 +741,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int >> code, void *addr) >> CPUX86State *env = &cpu->env; >> ram_addr_t ram_addr; >> hwaddr paddr; >> + size_t page_size; >> + char lp_msg[54]; >> /* If we get an action required MCE, it has been injected by KVM >> * while the VM was running. An action optional MCE instead should >> @@ -753,6 +755,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) >> ram_addr = qemu_ram_addr_from_host(addr); >> if (ram_addr != RAM_ADDR_INVALID && >> kvm_physical_memory_addr_from_host(c->kvm_state, addr, >> &paddr)) { >> + page_size = qemu_ram_pagesize_from_addr(ram_addr); >> + if (page_size > TARGET_PAGE_SIZE) { >> + ram_addr = ROUND_DOWN(ram_addr, page_size); > > As raised, aligning ram_addr_t addresses to page_size is wrong. > > Maybe we really want to print block->idstr, offset, size like I proposed > at the other place, here as well? Yes, we can collect the information from the block associated to this ram_addr. But instead of duplicating the necessary code into both i386 and ARM, I came back to adding the change into the kvm_hwpoison_page_add() function called from both i386 and ARM specific code. I also needed a new possibility to retrieve the information while we are dealing with the SIGBUS signal, and created a new function to gather the information from the RAMBlock: qemu_ram_block_location_info_from_addr(ram_addr_t ram_addr, struct RAMBlockInfo *b_info) with the associated struct. So that we can use the RCU_READ_LOCK_GUARD() and retrieve all the data. Note about ARM failing on large pages: ----------=====---------------------- I could test that ARM VMs impacted by memory errors on a large underlying memory page, can end up looping on reporting the error: The VM encountering an error has a high probability to crash and can try to save a vmcore with a kdump phase. 
This fix introduces qemu messages reporting errors when they are relayed to the VM. A large page being poisoned by an error on ARM can make a VM loop on the vmcore collection phase and the console would show messages like that appearing every 10 seconds (before the change): vvv Starting Kdump Vmcore Save Service... [ 3.095399] kdump[445]: Kdump is using the default log level(3). [ 3.173998] kdump[481]: saving to /sysroot/var/crash/127.0.0.1-2025-01-27-20:17:40/ [ 3.189683] kdump[486]: saving vmcore-dmesg.txt to /sysroot/var/crash/127.0.0.1-2025-01-27-20:17:40/ [ 3.213584] kdump[492]: saving vmcore-dmesg.txt complete [ 3.220295] kdump[494]: saving vmcore [ 10.029515] EDAC MC0: 1 UE unknown on unknown memory ( page:0x116c60 offset:0x0 grain:1 - APEI location: ) [ 10.033647] [Firmware Warn]: GHES: Invalid address in generic error data: 0x116c60000 [ 10.036974] {2}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0 [ 10.040514] {2}[Hardware Error]: event severity: recoverable [ 10.042911] {2}[Hardware Error]: Error 0, type: recoverable [ 10.045310] {2}[Hardware Error]: section_type: memory error [ 10.047666] {2}[Hardware Error]: physical_address: 0x0000000116c60000 [ 10.050486] {2}[Hardware Error]: error_type: 0, unknown [ 20.053205] EDAC MC0: 1 UE unknown on unknown memory ( page:0x116c60 offset:0x0 grain:1 - APEI location: ) [ 20.057416] [Firmware Warn]: GHES: Invalid address in generic error data: 0x116c60000 [ 20.060781] {3}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0 [ 20.065472] {3}[Hardware Error]: event severity: recoverable [ 20.067878] {3}[Hardware Error]: Error 0, type: recoverable [ 20.070273] {3}[Hardware Error]: section_type: memory error [ 20.072686] {3}[Hardware Error]: physical_address: 0x0000000116c60000 [ 20.075590] {3}[Hardware Error]: error_type: 0, unknown ^^^ with the fix, we now have a flood of messages like: vvv qemu-system-aarch64: Memory Error on large page from ram-node1:d5e00000+0 
+200000 qemu-system-aarch64: Guest Memory Error at QEMU addr 0xffff35c79000 and GUEST addr 0x115e79000 of type BUS_MCEERR_AR injected qemu-system-aarch64: Memory Error on large page from ram-node1:d5e00000+0 +200000 qemu-system-aarch64: Guest Memory Error at QEMU addr 0xffff35c79000 and GUEST addr 0x115e79000 of type BUS_MCEERR_AR injected qemu-system-aarch64: Memory Error on large page from ram-node1:d5e00000+0 +200000 qemu-system-aarch64: Guest Memory Error at QEMU addr 0xffff35c79000 and GUEST addr 0x115e79000 of type BUS_MCEERR_AR injected ^^^ In both cases, this situation loops indefinitely! I'm just reporting this change of behavior; fixing the issue would most probably require VM kernel modifications or a work-around in qemu when errors are reported too often, but that is out of the scope of this current qemu fix.
> Yes, we can collect the information from the block associated to this > ram_addr. But instead of duplicating the necessary code into both i386 > and ARM, I came back to adding the change into the > kvm_hwpoison_page_add() function called from both i386 and ARM specific > code. > > I also needed a new possibility to retrieve the information while we are > dealing with the SIGBUS signal, and created a new function to gather the > information from the RAMBlock: > qemu_ram_block_location_info_from_addr(ram_addr_t ram_addr, > struct RAMBlockInfo *b_info) > with the associated struct. > > So that we can use the RCU_READ_LOCK_GUARD() and retrieve all the data. Makes sense. > > > Note about ARM failing on large pages: > ----------=====---------------------- > I could test that ARM VMs impacted by memory errors on a large > underlying memory page, can end up looping on reporting the error: > The VM encountering an error has a high probability to crash and can try > to save a vmcore with a kdump phase. Yeah, that's what I thought. If you rip out 1 GiB of memory, your VM is going to have a bad time :/ > > This fix introduces qemu messages reporting errors when they are relayed > to the VM. > A large page being poisoned by an error on ARM can make a VM loop on the > vmcore collection phase and the console would show messages like that > appearing every 10 seconds (before the change): > > vvv > Starting Kdump Vmcore Save Service... > [ 3.095399] kdump[445]: Kdump is using the default log level(3). 
> [ 3.173998] kdump[481]: saving to > /sysroot/var/crash/127.0.0.1-2025-01-27-20:17:40/ > [ 3.189683] kdump[486]: saving vmcore-dmesg.txt to > /sysroot/var/crash/127.0.0.1-2025-01-27-20:17:40/ > [ 3.213584] kdump[492]: saving vmcore-dmesg.txt complete > [ 3.220295] kdump[494]: saving vmcore > [ 10.029515] EDAC MC0: 1 UE unknown on unknown memory ( page:0x116c60 > offset:0x0 grain:1 - APEI location: ) > [ 10.033647] [Firmware Warn]: GHES: Invalid address in generic error > data: 0x116c60000 > [ 10.036974] {2}[Hardware Error]: Hardware error from APEI Generic > Hardware Error Source: 0 > [ 10.040514] {2}[Hardware Error]: event severity: recoverable > [ 10.042911] {2}[Hardware Error]: Error 0, type: recoverable > [ 10.045310] {2}[Hardware Error]: section_type: memory error > [ 10.047666] {2}[Hardware Error]: physical_address: 0x0000000116c60000 > [ 10.050486] {2}[Hardware Error]: error_type: 0, unknown > [ 20.053205] EDAC MC0: 1 UE unknown on unknown memory ( page:0x116c60 > offset:0x0 grain:1 - APEI location: ) > [ 20.057416] [Firmware Warn]: GHES: Invalid address in generic error > data: 0x116c60000 > [ 20.060781] {3}[Hardware Error]: Hardware error from APEI Generic > Hardware Error Source: 0 > [ 20.065472] {3}[Hardware Error]: event severity: recoverable > [ 20.067878] {3}[Hardware Error]: Error 0, type: recoverable > [ 20.070273] {3}[Hardware Error]: section_type: memory error > [ 20.072686] {3}[Hardware Error]: physical_address: 0x0000000116c60000 > [ 20.075590] {3}[Hardware Error]: error_type: 0, unknown > ^^^ > > with the fix, we now have a flood of messages like: > > vvv > qemu-system-aarch64: Memory Error on large page from > ram-node1:d5e00000+0 +200000 > qemu-system-aarch64: Guest Memory Error at QEMU addr 0xffff35c79000 and > GUEST addr 0x115e79000 of type BUS_MCEERR_AR injected > qemu-system-aarch64: Memory Error on large page from > ram-node1:d5e00000+0 +200000 > qemu-system-aarch64: Guest Memory Error at QEMU addr 0xffff35c79000 and > GUEST addr 
0x115e79000 of type BUS_MCEERR_AR injected > qemu-system-aarch64: Memory Error on large page from > ram-node1:d5e00000+0 +200000 > qemu-system-aarch64: Guest Memory Error at QEMU addr 0xffff35c79000 and > GUEST addr 0x115e79000 of type BUS_MCEERR_AR injected > ^^^ > > > In both cases, this situation loops indefinitely ! > > I'm just informing of a change of behavior, fixing this issue would most > probably require VM kernel modifications or a work-around in qemu when > errors are reported too often, but is out of the scope of this current > qemu fix. Agreed. I think one problem is that kdump cannot really cope with new memory errors (it tries to not touch pages that had a memory error in the old kernel). Maybe this is also due to the fact that we inform the kernel only about a single page vanishing, whereby actually a whole 1 GiB is vanishing.
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 4f2abd5774..f89568bfa3 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1296,10 +1296,6 @@ static void kvm_unpoison_all(void *param) void kvm_hwpoison_page_add(ram_addr_t ram_addr) { HWPoisonPage *page; - size_t page_size = qemu_ram_pagesize_from_addr(ram_addr); - - if (page_size > TARGET_PAGE_SIZE) - ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size); QLIST_FOREACH(page, &hwpoison_page_list, list) { if (page->ram_addr == ram_addr) { diff --git a/target/arm/kvm.c b/target/arm/kvm.c index a9444a2c7a..323ce0045d 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -2366,6 +2366,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) { ram_addr_t ram_addr; hwaddr paddr; + size_t page_size; + char lp_msg[54]; assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); @@ -2373,6 +2375,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) ram_addr = qemu_ram_addr_from_host(addr); if (ram_addr != RAM_ADDR_INVALID && kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { + page_size = qemu_ram_pagesize_from_addr(ram_addr); + if (page_size > TARGET_PAGE_SIZE) { + ram_addr = ROUND_DOWN(ram_addr, page_size); + snprintf(lp_msg, sizeof(lp_msg), " on lost large page " + RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr); + } else { + lp_msg[0] = '\0'; + } kvm_hwpoison_page_add(ram_addr); /* * If this is a BUS_MCEERR_AR, we know we have been called @@ -2389,6 +2399,9 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) kvm_cpu_synchronize_state(c); if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) { kvm_inject_arm_sea(c); + error_report("Guest Memory Error at QEMU addr %p and " + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", + addr, paddr, lp_msg, "BUS_MCEERR_AR"); } else { error_report("failed to record the error"); abort(); diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 2f66e63b88..7715cab7cf 100644 --- 
a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -741,6 +741,8 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) CPUX86State *env = &cpu->env; ram_addr_t ram_addr; hwaddr paddr; + size_t page_size; + char lp_msg[54]; /* If we get an action required MCE, it has been injected by KVM * while the VM was running. An action optional MCE instead should @@ -753,6 +755,14 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) ram_addr = qemu_ram_addr_from_host(addr); if (ram_addr != RAM_ADDR_INVALID && kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { + page_size = qemu_ram_pagesize_from_addr(ram_addr); + if (page_size > TARGET_PAGE_SIZE) { + ram_addr = ROUND_DOWN(ram_addr, page_size); + snprintf(lp_msg, sizeof(lp_msg), " on lost large page " + RAM_ADDR_FMT "@" RAM_ADDR_FMT "", page_size, ram_addr); + } else { + lp_msg[0] = '\0'; + } kvm_hwpoison_page_add(ram_addr); kvm_mce_inject(cpu, paddr, code); @@ -763,12 +773,12 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) */ if (code == BUS_MCEERR_AR) { error_report("Guest MCE Memory Error at QEMU addr %p and " - "GUEST addr 0x%" HWADDR_PRIx " of type %s injected", - addr, paddr, "BUS_MCEERR_AR"); + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", + addr, paddr, lp_msg, "BUS_MCEERR_AR"); } else { warn_report("Guest MCE Memory Error at QEMU addr %p and " - "GUEST addr 0x%" HWADDR_PRIx " of type %s injected", - addr, paddr, "BUS_MCEERR_AO"); + "GUEST addr 0x%" HWADDR_PRIx "%s of type %s injected", + addr, paddr, lp_msg, "BUS_MCEERR_AO"); } return;