diff mbox series

[v6,3/6] accel/kvm: Report the loss of a large memory page

Message ID 20250127213107.3454680-4-william.roche@oracle.com (mailing list archive)
State New
Headers show
Series Poisoned memory recovery on reboot | expand

Commit Message

“William Roche Jan. 27, 2025, 9:31 p.m. UTC
From: William Roche <william.roche@oracle.com>

In case of a large page impacted by a memory error, provide an
information about the impacted large page before the memory
error injection message.

This message would also appear on ras enabled ARM platforms, with
the introduction of an x86 similar error injection message.

In the case of a large page impacted, we now report:
Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>

Signed-off-by: William Roche <william.roche@oracle.com>
---
 accel/kvm/kvm-all.c       | 11 +++++++++++
 include/exec/cpu-common.h |  9 +++++++++
 system/physmem.c          | 21 +++++++++++++++++++++
 target/arm/kvm.c          |  3 +++
 4 files changed, 44 insertions(+)

Comments

David Hildenbrand Jan. 30, 2025, 12:02 p.m. UTC | #1
On 27.01.25 22:31, “William Roche wrote:
> From: William Roche <william.roche@oracle.com>
> 
> In case of a large page impacted by a memory error, provide an
> information about the impacted large page before the memory
> error injection message.
> 
> This message would also appear on ras enabled ARM platforms, with
> the introduction of an x86 similar error injection message.
> 
> In the case of a large page impacted, we now report:
> Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>
> 
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
>   accel/kvm/kvm-all.c       | 11 +++++++++++
>   include/exec/cpu-common.h |  9 +++++++++
>   system/physmem.c          | 21 +++++++++++++++++++++
>   target/arm/kvm.c          |  3 +++
>   4 files changed, 44 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f89568bfa3..08e14f8960 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1296,6 +1296,17 @@ static void kvm_unpoison_all(void *param)
>   void kvm_hwpoison_page_add(ram_addr_t ram_addr)
>   {
>       HWPoisonPage *page;
> +    struct RAMBlockInfo rb_info;
> +
> +    if (qemu_ram_block_location_info_from_addr(ram_addr, &rb_info)) {
> +        size_t ps = rb_info.page_size;

Empty line missing.

> +        if (ps > TARGET_PAGE_SIZE) {
> +            uint64_t offset = ram_addr - rb_info.offset;

Empty line missing.

Don't you have to align the fd_offset also down?

I suggest doing the alignment already when calculating "uint64_t offset"
David Hildenbrand Jan. 30, 2025, 5:02 p.m. UTC | #2
On 27.01.25 22:31, “William Roche wrote:
> From: William Roche <william.roche@oracle.com>
> 
> In case of a large page impacted by a memory error, provide an
> information about the impacted large page before the memory
> error injection message.
> 
> This message would also appear on ras enabled ARM platforms, with
> the introduction of an x86 similar error injection message.
> 
> In the case of a large page impacted, we now report:
> Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>
> 
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
>   accel/kvm/kvm-all.c       | 11 +++++++++++
>   include/exec/cpu-common.h |  9 +++++++++
>   system/physmem.c          | 21 +++++++++++++++++++++
>   target/arm/kvm.c          |  3 +++
>   4 files changed, 44 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f89568bfa3..08e14f8960 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1296,6 +1296,17 @@ static void kvm_unpoison_all(void *param)
>   void kvm_hwpoison_page_add(ram_addr_t ram_addr)
>   {
>       HWPoisonPage *page;
> +    struct RAMBlockInfo rb_info;
> +
> +    if (qemu_ram_block_location_info_from_addr(ram_addr, &rb_info)) {
> +        size_t ps = rb_info.page_size;
> +        if (ps > TARGET_PAGE_SIZE) {
> +            uint64_t offset = ram_addr - rb_info.offset;
> +            error_report("Memory Error on large page from %s:%" PRIx64
> +                         "+%" PRIx64 " +%zx", rb_info.idstr,
> +                         QEMU_ALIGN_DOWN(offset, ps), rb_info.fd_offset, ps);
> +        }
> +    }

Some smaller nits:

1) I'd call it qemu_ram_block_info_from_addr() --  drop the "_location"

2) Printing the fd_offset only makes sense if there is an fd, right? 
You'd have to communicate that information as well.



Apart from that, this series LGTM, thanks!
“William Roche Feb. 1, 2025, 9:57 a.m. UTC | #3
On 1/30/25 18:02, David Hildenbrand wrote:
> On 27.01.25 22:31, “William Roche wrote:
>> From: William Roche <william.roche@oracle.com>
>>
>> In case of a large page impacted by a memory error, provide an
>> information about the impacted large page before the memory
>> error injection message.
>>
>> This message would also appear on ras enabled ARM platforms, with
>> the introduction of an x86 similar error injection message.
>>
>> In the case of a large page impacted, we now report:
>> Memory Error on large page from <backend>:<address>+<fd_offset> 
>> +<page_size>
>>
>> Signed-off-by: William Roche <william.roche@oracle.com>
>> ---
>>   accel/kvm/kvm-all.c       | 11 +++++++++++
>>   include/exec/cpu-common.h |  9 +++++++++
>>   system/physmem.c          | 21 +++++++++++++++++++++
>>   target/arm/kvm.c          |  3 +++
>>   4 files changed, 44 insertions(+)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index f89568bfa3..08e14f8960 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -1296,6 +1296,17 @@ static void kvm_unpoison_all(void *param)
>>   void kvm_hwpoison_page_add(ram_addr_t ram_addr)
>>   {
>>       HWPoisonPage *page;
>> +    struct RAMBlockInfo rb_info;
>> +
>> +    if (qemu_ram_block_location_info_from_addr(ram_addr, &rb_info)) {
>> +        size_t ps = rb_info.page_size;
>> +        if (ps > TARGET_PAGE_SIZE) {
>> +            uint64_t offset = ram_addr - rb_info.offset;
>> +            error_report("Memory Error on large page from %s:%" PRIx64
>> +                         "+%" PRIx64 " +%zx", rb_info.idstr,
>> +                         QEMU_ALIGN_DOWN(offset, ps), 
>> rb_info.fd_offset, ps);
>> +        }
>> +    }
> 
> Some smaller nits:
> 
> 1) I'd call it qemu_ram_block_info_from_addr() --  drop the "_location"
> 
> 2) Printing the fd_offset only makes sense if there is an fd, right? 
> You'd have to communicate that information as well.
> 
> 
> 
> Apart from that, this series LGTM, thanks!


Thank you David for your feedback.

I'll change the 2 nits above, and add the 2 empty lines missing (in 
patch 3/6).
I also removed the fd_offset information in the message from the 
qemu_ram_remap() function when we don't have an associated fd (patch 2/6).

You also asked me:

> Don't you have to align the fd_offset also down?
> 

fd_offset doesn't need to be aligned as it is used with the value given, 
which should already be adapted to the backend needs (when given by the 
administrator for example)


> I suggest doing the alignment already when calculating "uint64_t offset"
> 

But yes for the offset value itself, it's much better to already align 
it when giving it an initial value. Thanks, I've also made the change.

I'm sending a v7 now including all these changes.

I'll also send an update about the kdump behavior on ARM later next week.

Thanks again,
William.
diff mbox series

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f89568bfa3..08e14f8960 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1296,6 +1296,17 @@  static void kvm_unpoison_all(void *param)
 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
 {
     HWPoisonPage *page;
+    struct RAMBlockInfo rb_info;
+
+    if (qemu_ram_block_location_info_from_addr(ram_addr, &rb_info)) {
+        size_t ps = rb_info.page_size;
+        if (ps > TARGET_PAGE_SIZE) {
+            uint64_t offset = ram_addr - rb_info.offset;
+            error_report("Memory Error on large page from %s:%" PRIx64
+                         "+%" PRIx64 " +%zx", rb_info.idstr,
+                         QEMU_ALIGN_DOWN(offset, ps), rb_info.fd_offset, ps);
+        }
+    }
 
     QLIST_FOREACH(page, &hwpoison_page_list, list) {
         if (page->ram_addr == ram_addr) {
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 3771b2130c..176537fd80 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -110,6 +110,15 @@  int qemu_ram_get_fd(RAMBlock *rb);
 size_t qemu_ram_pagesize(RAMBlock *block);
 size_t qemu_ram_pagesize_largest(void);
 
+struct RAMBlockInfo {
+    char idstr[256];
+    ram_addr_t offset;
+    uint64_t fd_offset;
+    size_t page_size;
+};
+bool qemu_ram_block_location_info_from_addr(ram_addr_t ram_addr,
+                                            struct RAMBlockInfo *block);
+
 /**
  * cpu_address_space_init:
  * @cpu: CPU to add this address space to
diff --git a/system/physmem.c b/system/physmem.c
index 3dc10ae27b..c835fef26b 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1678,6 +1678,27 @@  size_t qemu_ram_pagesize_largest(void)
     return largest;
 }
 
+/* Copy RAMBlock information associated to the given ram_addr location */
+bool qemu_ram_block_location_info_from_addr(ram_addr_t ram_addr,
+                                            struct RAMBlockInfo *b_info)
+{
+    RAMBlock *rb;
+
+    assert(b_info);
+
+    RCU_READ_LOCK_GUARD();
+    rb =  qemu_get_ram_block(ram_addr);
+    if (!rb) {
+        return false;
+    }
+
+    pstrcat(b_info->idstr, sizeof(b_info->idstr), rb->idstr);
+    b_info->offset = rb->offset;
+    b_info->fd_offset = rb->fd_offset;
+    b_info->page_size = rb->page_size;
+    return true;
+}
+
 static int memory_try_enable_merging(void *addr, size_t len)
 {
     if (!machine_mem_merge(current_machine)) {
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index da30bdbb23..d9dedc6d74 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -2389,6 +2389,9 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
                 kvm_cpu_synchronize_state(c);
                 if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
                     kvm_inject_arm_sea(c);
+                    error_report("Guest Memory Error at QEMU addr %p and "
+                        "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
+                        addr, paddr, "BUS_MCEERR_AR");
                 } else {
                     error_report("failed to record the error");
                     abort();