diff mbox series

KVM: HWPoison: Fix memory address&size during remap

Message ID 20220420064542.423508-1-pizhenwei@bytedance.com (mailing list archive)
State New, archived
Headers show
Series KVM: HWPoison: Fix memory address&size during remap | expand

Commit Message

zhenwei pi April 20, 2022, 6:45 a.m. UTC
qemu exits during reset with log:
qemu-system-x86_64: Could not remap addr: 1000@22001000

Currently, after an MCE on the RAM of a guest, qemu records only a
ram_addr and remaps this address with a fixed size (TARGET_PAGE_SIZE)
during reset. In the hugetlbfs scenario, mmap(addr...) needs a
page_size-aligned address and the correct size. An unaligned address
causes mmap to fail.

What's more, when an MCE hits the RAM of a guest, qemu records the
address and tries to fix it during reset; this should be common logic.
So remove kvm_hwpoison_page_add from the architecture-dependent code
and record the page in the SIGBUS handler instead. As a result, the
code for poisoning/unpoisoning a page becomes static in kvm-all.c.

Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
---
 accel/kvm/kvm-all.c      | 47 ++++++++++++++++++++++++++++++----------
 include/sysemu/kvm_int.h | 12 ----------
 target/arm/kvm64.c       |  1 -
 target/i386/kvm/kvm.c    |  1 -
 4 files changed, 36 insertions(+), 25 deletions(-)

Comments

zhenwei pi April 27, 2022, 2:16 a.m. UTC | #1
Hi, Paolo & Peter

Could you please review this patch?

On 4/20/22 14:45, zhenwei pi wrote:
> qemu exits during reset with log:
> qemu-system-x86_64: Could not remap addr: 1000@22001000
> 
> Currently, after MCE on RAM of a guest, qemu records a ram_addr only,
> remaps this address with a fixed size(TARGET_PAGE_SIZE) during reset.
> In the hugetlbfs scenario, mmap(addr...) needs page_size aligned
> address and correct size. Unaligned address leads mmap to fail.
> 
> What's more, hitting MCE on RAM of a guest, qemu records this address
> and try to fix it during reset, this should be a common logic. So
> remove kvm_hwpoison_page_add from architecture dependent code, record
> this in SIGBUS handler instead. Finally poisoning/unpoisoning a page
> gets static in kvm-all.c,
> 
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>   accel/kvm/kvm-all.c      | 47 ++++++++++++++++++++++++++++++----------
>   include/sysemu/kvm_int.h | 12 ----------
>   target/arm/kvm64.c       |  1 -
>   target/i386/kvm/kvm.c    |  1 -
>   4 files changed, 36 insertions(+), 25 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 5f1377ca04..2a91c5a461 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1167,11 +1167,14 @@ int kvm_vm_check_extension(KVMState *s, unsigned int extension)
>       return ret;
>   }
>   
> +#ifdef KVM_HAVE_MCE_INJECTION
>   typedef struct HWPoisonPage {
>       ram_addr_t ram_addr;
> +    size_t page_size; /* normal page or hugeTLB page? */
>       QLIST_ENTRY(HWPoisonPage) list;
>   } HWPoisonPage;
>   
> +/* hwpoison_page_list stores the poisoned pages, unpoison them during reset */
>   static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
>       QLIST_HEAD_INITIALIZER(hwpoison_page_list);
>   
> @@ -1181,25 +1184,48 @@ static void kvm_unpoison_all(void *param)
>   
>       QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>           QLIST_REMOVE(page, list);
> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
> +        qemu_ram_remap(page->ram_addr, page->page_size);
>           g_free(page);
>       }
>   }
>   
> -void kvm_hwpoison_page_add(ram_addr_t ram_addr)
> +static void kvm_hwpoison_page_add(CPUState *cpu, int sigbus_code, void *addr)
>   {
>       HWPoisonPage *page;
> +    ram_addr_t ram_addr, align_ram_addr;
> +    ram_addr_t offset;
> +    hwaddr paddr;
> +    size_t page_size;
> +
> +    assert(sigbus_code == BUS_MCEERR_AR || sigbus_code == BUS_MCEERR_AO);
> +    ram_addr = qemu_ram_addr_from_host(addr);
> +    if (ram_addr == RAM_ADDR_INVALID ||
> +        !kvm_physical_memory_addr_from_host(cpu->kvm_state, addr, &paddr)) {
> +        /* only deal with valid guest RAM here */
> +        return;
> +    }
>   
> +    /* get page size of RAM block, test it's a normal page or huge page */
> +    page_size = qemu_ram_block_from_host(addr, false, &offset)->page_size;
> +    align_ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size);
>       QLIST_FOREACH(page, &hwpoison_page_list, list) {
> -        if (page->ram_addr == ram_addr) {
> +        if (page->ram_addr == align_ram_addr) {
> +            assert(page->page_size == page_size);
>               return;
>           }
>       }
> -    page = g_new(HWPoisonPage, 1);
> -    page->ram_addr = ram_addr;
> +
> +    page = g_new0(HWPoisonPage, 1);
> +    page->ram_addr = align_ram_addr;
> +    page->page_size = page_size;
>       QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
>   }
>   
> +static __thread void *pending_sigbus_addr;
> +static __thread int pending_sigbus_code;
> +static __thread bool have_sigbus_pending;
> +#endif
> +
>   static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
>   {
>   #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
> @@ -2601,7 +2627,9 @@ static int kvm_init(MachineState *ms)
>           s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
>       }
>   
> +#if defined KVM_HAVE_MCE_INJECTION
>       qemu_register_reset(kvm_unpoison_all, NULL);
> +#endif
>   
>       if (s->kernel_irqchip_allowed) {
>           kvm_irqchip_create(s);
> @@ -2782,12 +2810,6 @@ void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
>       run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
>   }
>   
> -#ifdef KVM_HAVE_MCE_INJECTION
> -static __thread void *pending_sigbus_addr;
> -static __thread int pending_sigbus_code;
> -static __thread bool have_sigbus_pending;
> -#endif
> -
>   static void kvm_cpu_kick(CPUState *cpu)
>   {
>       qatomic_set(&cpu->kvm_run->immediate_exit, 1);
> @@ -2883,6 +2905,8 @@ int kvm_cpu_exec(CPUState *cpu)
>   #ifdef KVM_HAVE_MCE_INJECTION
>           if (unlikely(have_sigbus_pending)) {
>               qemu_mutex_lock_iothread();
> +            kvm_hwpoison_page_add(cpu, pending_sigbus_code,
> +                                  pending_sigbus_addr);
>               kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
>                                       pending_sigbus_addr);
>               have_sigbus_pending = false;
> @@ -3436,6 +3460,7 @@ int kvm_on_sigbus(int code, void *addr)
>        * we can only get action optional here.
>        */
>       assert(code != BUS_MCEERR_AR);
> +    kvm_hwpoison_page_add(first_cpu, code, addr);
>       kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
>       return 0;
>   #else
> diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
> index 1f5487d9b7..52ec8ef99c 100644
> --- a/include/sysemu/kvm_int.h
> +++ b/include/sysemu/kvm_int.h
> @@ -40,16 +40,4 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
>                                     AddressSpace *as, int as_id, const char *name);
>   
>   void kvm_set_max_memslot_size(hwaddr max_slot_size);
> -
> -/**
> - * kvm_hwpoison_page_add:
> - *
> - * Parameters:
> - *  @ram_addr: the address in the RAM for the poisoned page
> - *
> - * Add a poisoned page to the list
> - *
> - * Return: None.
> - */
> -void kvm_hwpoison_page_add(ram_addr_t ram_addr);
>   #endif
> diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
> index ccadfbbe72..a3184eb3d2 100644
> --- a/target/arm/kvm64.c
> +++ b/target/arm/kvm64.c
> @@ -1450,7 +1450,6 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>           ram_addr = qemu_ram_addr_from_host(addr);
>           if (ram_addr != RAM_ADDR_INVALID &&
>               kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
> -            kvm_hwpoison_page_add(ram_addr);
>               /*
>                * If this is a BUS_MCEERR_AR, we know we have been called
>                * synchronously from the vCPU thread, so we can easily
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 9cf8e03669..fb72b349ed 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -622,7 +622,6 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>           ram_addr = qemu_ram_addr_from_host(addr);
>           if (ram_addr != RAM_ADDR_INVALID &&
>               kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
> -            kvm_hwpoison_page_add(ram_addr);
>               kvm_mce_inject(cpu, paddr, code);
>   
>               /*
zhenwei pi May 5, 2022, 12:32 a.m. UTC | #2
Hi, Paolo

I would appreciate it if you could review this patch.

On 4/20/22 14:45, zhenwei pi wrote:
> qemu exits during reset with log:
> qemu-system-x86_64: Could not remap addr: 1000@22001000
> 
> Currently, after MCE on RAM of a guest, qemu records a ram_addr only,
> remaps this address with a fixed size(TARGET_PAGE_SIZE) during reset.
> In the hugetlbfs scenario, mmap(addr...) needs page_size aligned
> address and correct size. Unaligned address leads mmap to fail.
> 
> What's more, hitting MCE on RAM of a guest, qemu records this address
> and try to fix it during reset, this should be a common logic. So
> remove kvm_hwpoison_page_add from architecture dependent code, record
> this in SIGBUS handler instead. Finally poisoning/unpoisoning a page
> gets static in kvm-all.c,
> 
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>   accel/kvm/kvm-all.c      | 47 ++++++++++++++++++++++++++++++----------
>   include/sysemu/kvm_int.h | 12 ----------
>   target/arm/kvm64.c       |  1 -
>   target/i386/kvm/kvm.c    |  1 -
>   4 files changed, 36 insertions(+), 25 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 5f1377ca04..2a91c5a461 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1167,11 +1167,14 @@ int kvm_vm_check_extension(KVMState *s, unsigned int extension)
>       return ret;
>   }
>   
> +#ifdef KVM_HAVE_MCE_INJECTION
>   typedef struct HWPoisonPage {
>       ram_addr_t ram_addr;
> +    size_t page_size; /* normal page or hugeTLB page? */
>       QLIST_ENTRY(HWPoisonPage) list;
>   } HWPoisonPage;
>   
> +/* hwpoison_page_list stores the poisoned pages, unpoison them during reset */
>   static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
>       QLIST_HEAD_INITIALIZER(hwpoison_page_list);
>   
> @@ -1181,25 +1184,48 @@ static void kvm_unpoison_all(void *param)
>   
>       QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>           QLIST_REMOVE(page, list);
> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
> +        qemu_ram_remap(page->ram_addr, page->page_size);
>           g_free(page);
>       }
>   }
>   
> -void kvm_hwpoison_page_add(ram_addr_t ram_addr)
> +static void kvm_hwpoison_page_add(CPUState *cpu, int sigbus_code, void *addr)
>   {
>       HWPoisonPage *page;
> +    ram_addr_t ram_addr, align_ram_addr;
> +    ram_addr_t offset;
> +    hwaddr paddr;
> +    size_t page_size;
> +
> +    assert(sigbus_code == BUS_MCEERR_AR || sigbus_code == BUS_MCEERR_AO);
> +    ram_addr = qemu_ram_addr_from_host(addr);
> +    if (ram_addr == RAM_ADDR_INVALID ||
> +        !kvm_physical_memory_addr_from_host(cpu->kvm_state, addr, &paddr)) {
> +        /* only deal with valid guest RAM here */
> +        return;
> +    }
>   
> +    /* get page size of RAM block, test it's a normal page or huge page */
> +    page_size = qemu_ram_block_from_host(addr, false, &offset)->page_size;
> +    align_ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size);
>       QLIST_FOREACH(page, &hwpoison_page_list, list) {
> -        if (page->ram_addr == ram_addr) {
> +        if (page->ram_addr == align_ram_addr) {
> +            assert(page->page_size == page_size);
>               return;
>           }
>       }
> -    page = g_new(HWPoisonPage, 1);
> -    page->ram_addr = ram_addr;
> +
> +    page = g_new0(HWPoisonPage, 1);
> +    page->ram_addr = align_ram_addr;
> +    page->page_size = page_size;
>       QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
>   }
>   
> +static __thread void *pending_sigbus_addr;
> +static __thread int pending_sigbus_code;
> +static __thread bool have_sigbus_pending;
> +#endif
> +
>   static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
>   {
>   #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
> @@ -2601,7 +2627,9 @@ static int kvm_init(MachineState *ms)
>           s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
>       }
>   
> +#if defined KVM_HAVE_MCE_INJECTION
>       qemu_register_reset(kvm_unpoison_all, NULL);
> +#endif
>   
>       if (s->kernel_irqchip_allowed) {
>           kvm_irqchip_create(s);
> @@ -2782,12 +2810,6 @@ void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
>       run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
>   }
>   
> -#ifdef KVM_HAVE_MCE_INJECTION
> -static __thread void *pending_sigbus_addr;
> -static __thread int pending_sigbus_code;
> -static __thread bool have_sigbus_pending;
> -#endif
> -
>   static void kvm_cpu_kick(CPUState *cpu)
>   {
>       qatomic_set(&cpu->kvm_run->immediate_exit, 1);
> @@ -2883,6 +2905,8 @@ int kvm_cpu_exec(CPUState *cpu)
>   #ifdef KVM_HAVE_MCE_INJECTION
>           if (unlikely(have_sigbus_pending)) {
>               qemu_mutex_lock_iothread();
> +            kvm_hwpoison_page_add(cpu, pending_sigbus_code,
> +                                  pending_sigbus_addr);
>               kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
>                                       pending_sigbus_addr);
>               have_sigbus_pending = false;
> @@ -3436,6 +3460,7 @@ int kvm_on_sigbus(int code, void *addr)
>        * we can only get action optional here.
>        */
>       assert(code != BUS_MCEERR_AR);
> +    kvm_hwpoison_page_add(first_cpu, code, addr);
>       kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
>       return 0;
>   #else
> diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
> index 1f5487d9b7..52ec8ef99c 100644
> --- a/include/sysemu/kvm_int.h
> +++ b/include/sysemu/kvm_int.h
> @@ -40,16 +40,4 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
>                                     AddressSpace *as, int as_id, const char *name);
>   
>   void kvm_set_max_memslot_size(hwaddr max_slot_size);
> -
> -/**
> - * kvm_hwpoison_page_add:
> - *
> - * Parameters:
> - *  @ram_addr: the address in the RAM for the poisoned page
> - *
> - * Add a poisoned page to the list
> - *
> - * Return: None.
> - */
> -void kvm_hwpoison_page_add(ram_addr_t ram_addr);
>   #endif
> diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
> index ccadfbbe72..a3184eb3d2 100644
> --- a/target/arm/kvm64.c
> +++ b/target/arm/kvm64.c
> @@ -1450,7 +1450,6 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>           ram_addr = qemu_ram_addr_from_host(addr);
>           if (ram_addr != RAM_ADDR_INVALID &&
>               kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
> -            kvm_hwpoison_page_add(ram_addr);
>               /*
>                * If this is a BUS_MCEERR_AR, we know we have been called
>                * synchronously from the vCPU thread, so we can easily
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 9cf8e03669..fb72b349ed 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -622,7 +622,6 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>           ram_addr = qemu_ram_addr_from_host(addr);
>           if (ram_addr != RAM_ADDR_INVALID &&
>               kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
> -            kvm_hwpoison_page_add(ram_addr);
>               kvm_mce_inject(cpu, paddr, code);
>   
>               /*
Eiichi Tsukata Aug. 4, 2022, 6:59 a.m. UTC | #3
Hi 

We’ve also hit this case.

> On May 5, 2022, at 9:32, zhenwei pi <pizhenwei@bytedance.com> wrote:
> 
> Hi, Paolo
> 
> I would appreciate it if you could review patch.
> 
> On 4/20/22 14:45, zhenwei pi wrote:
>> qemu exits during reset with log:
>> qemu-system-x86_64: Could not remap addr: 1000@22001000
>> Currently, after MCE on RAM of a guest, qemu records a ram_addr only,
>> remaps this address with a fixed size(TARGET_PAGE_SIZE) during reset.
>> In the hugetlbfs scenario, mmap(addr...) needs page_size aligned
>> address and correct size. Unaligned address leads mmap to fail.

As far as I checked, the SIGBUS sent from memory_failure() due to PR_MCE_KILL_EARLY has an aligned address
in siginfo, but the SIGBUS sent from kvm_mmu_page_fault() has an unaligned address. This happens only when the guest touches
poisoned pages before they get remapped. This is not a usual case, but it can sometimes happen.

FYI: call path
       CPU 1/KVM-328915  [005] d..1. 711765.805910: signal_generate: sig=7 errno=0 code=4 comm=CPU 1/KVM pid=328915 grp=0 res=0
       CPU 1/KVM-328915  [005] d..1. 711765.805915: <stack trace>
 => trace_event_raw_event_signal_generate
 => __send_signal
 => do_send_sig_info
 => send_sig_mceerr
 => handle_abnormal_pfn
 => direct_page_fault
 => kvm_mmu_page_fault
 => kvm_arch_vcpu_ioctl_run
 => kvm_vcpu_ioctl
 => __x64_sys_ioctl
 => do_syscall_64


In addition, aligning the length suppresses the following madvise error message in qemu_ram_setup_dump():

  qemu_madvise: Invalid argument
  madvise doesn't support MADV_DONTDUMP, but dump_guest_core=off specified


Thanks

Eiichi
zhenwei pi Aug. 5, 2022, 1:32 a.m. UTC | #4
Hi,

Could you please give me any hint about this issue & patch?


On 8/4/22 14:59, Eiichi Tsukata wrote:
> Hi
> 
> We’ve also hit this case.
> 
>> On May 5, 2022, at 9:32, zhenwei pi <pizhenwei@bytedance.com> wrote:
>>
>> Hi, Paolo
>>
>> I would appreciate it if you could review patch.
>>
>> On 4/20/22 14:45, zhenwei pi wrote:
>>> qemu exits during reset with log:
>>> qemu-system-x86_64: Could not remap addr: 1000@22001000
>>> Currently, after MCE on RAM of a guest, qemu records a ram_addr only,
>>> remaps this address with a fixed size(TARGET_PAGE_SIZE) during reset.
>>> In the hugetlbfs scenario, mmap(addr...) needs page_size aligned
>>> address and correct size. Unaligned address leads mmap to fail.
> 
> As far as I checked, SIGBUS sent from memory_failure() due to PR_MCE_KILL_EARLY has aligned address
> in siginfo. But SIGBUS sent from kvm_mmu_page_fault() has unaligned address. This happens only when Guest touches
> poisoned pages before they get remapped. This is not a usual case but it can sometimes happen.
> 
> FYI: call path
>         CPU 1/KVM-328915  [005] d..1. 711765.805910: signal_generate: sig=7 errno=0 code=4 comm=CPU 1/KVM pid=328915 grp=0 res=0
>         CPU 1/KVM-328915  [005] d..1. 711765.805915: <stack trace>
>   => trace_event_raw_event_signal_generate
>   => __send_signal
>   => do_send_sig_info
>   => send_sig_mceerr
>   => handle_abnormal_pfn
>   => direct_page_fault
>   => kvm_mmu_page_fault
>   => kvm_arch_vcpu_ioctl_run
>   => kvm_vcpu_ioctl
>   => __x64_sys_ioctl
>   => do_syscall_64
> 
> 
> In addition, aligning length suppresses the following madvise error message in qemu_ram_setup_dump():
> 
>    qemu_madvise: Invalid argument
>    madvise doesn't support MADV_DONTDUMP, but dump_guest_core=off specified
> 
> 
> Thanks
> 
> Eiichi
diff mbox series

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 5f1377ca04..2a91c5a461 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1167,11 +1167,14 @@  int kvm_vm_check_extension(KVMState *s, unsigned int extension)
     return ret;
 }
 
+#ifdef KVM_HAVE_MCE_INJECTION
 typedef struct HWPoisonPage {
     ram_addr_t ram_addr;
+    size_t page_size; /* normal page or hugeTLB page? */
     QLIST_ENTRY(HWPoisonPage) list;
 } HWPoisonPage;
 
+/* hwpoison_page_list stores the poisoned pages, unpoison them during reset */
 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
     QLIST_HEAD_INITIALIZER(hwpoison_page_list);
 
@@ -1181,25 +1184,48 @@  static void kvm_unpoison_all(void *param)
 
     QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
         QLIST_REMOVE(page, list);
-        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
+        qemu_ram_remap(page->ram_addr, page->page_size);
         g_free(page);
     }
 }
 
-void kvm_hwpoison_page_add(ram_addr_t ram_addr)
+static void kvm_hwpoison_page_add(CPUState *cpu, int sigbus_code, void *addr)
 {
     HWPoisonPage *page;
+    ram_addr_t ram_addr, align_ram_addr;
+    ram_addr_t offset;
+    hwaddr paddr;
+    size_t page_size;
+
+    assert(sigbus_code == BUS_MCEERR_AR || sigbus_code == BUS_MCEERR_AO);
+    ram_addr = qemu_ram_addr_from_host(addr);
+    if (ram_addr == RAM_ADDR_INVALID ||
+        !kvm_physical_memory_addr_from_host(cpu->kvm_state, addr, &paddr)) {
+        /* only deal with valid guest RAM here */
+        return;
+    }
 
+    /* get page size of RAM block, test it's a normal page or huge page */
+    page_size = qemu_ram_block_from_host(addr, false, &offset)->page_size;
+    align_ram_addr = QEMU_ALIGN_DOWN(ram_addr, page_size);
     QLIST_FOREACH(page, &hwpoison_page_list, list) {
-        if (page->ram_addr == ram_addr) {
+        if (page->ram_addr == align_ram_addr) {
+            assert(page->page_size == page_size);
             return;
         }
     }
-    page = g_new(HWPoisonPage, 1);
-    page->ram_addr = ram_addr;
+
+    page = g_new0(HWPoisonPage, 1);
+    page->ram_addr = align_ram_addr;
+    page->page_size = page_size;
     QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
 }
 
+static __thread void *pending_sigbus_addr;
+static __thread int pending_sigbus_code;
+static __thread bool have_sigbus_pending;
+#endif
+
 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
 {
 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
@@ -2601,7 +2627,9 @@  static int kvm_init(MachineState *ms)
         s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
     }
 
+#if defined KVM_HAVE_MCE_INJECTION
     qemu_register_reset(kvm_unpoison_all, NULL);
+#endif
 
     if (s->kernel_irqchip_allowed) {
         kvm_irqchip_create(s);
@@ -2782,12 +2810,6 @@  void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
     run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
 }
 
-#ifdef KVM_HAVE_MCE_INJECTION
-static __thread void *pending_sigbus_addr;
-static __thread int pending_sigbus_code;
-static __thread bool have_sigbus_pending;
-#endif
-
 static void kvm_cpu_kick(CPUState *cpu)
 {
     qatomic_set(&cpu->kvm_run->immediate_exit, 1);
@@ -2883,6 +2905,8 @@  int kvm_cpu_exec(CPUState *cpu)
 #ifdef KVM_HAVE_MCE_INJECTION
         if (unlikely(have_sigbus_pending)) {
             qemu_mutex_lock_iothread();
+            kvm_hwpoison_page_add(cpu, pending_sigbus_code,
+                                  pending_sigbus_addr);
             kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
                                     pending_sigbus_addr);
             have_sigbus_pending = false;
@@ -3436,6 +3460,7 @@  int kvm_on_sigbus(int code, void *addr)
      * we can only get action optional here.
      */
     assert(code != BUS_MCEERR_AR);
+    kvm_hwpoison_page_add(first_cpu, code, addr);
     kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
     return 0;
 #else
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index 1f5487d9b7..52ec8ef99c 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -40,16 +40,4 @@  void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                   AddressSpace *as, int as_id, const char *name);
 
 void kvm_set_max_memslot_size(hwaddr max_slot_size);
-
-/**
- * kvm_hwpoison_page_add:
- *
- * Parameters:
- *  @ram_addr: the address in the RAM for the poisoned page
- *
- * Add a poisoned page to the list
- *
- * Return: None.
- */
-void kvm_hwpoison_page_add(ram_addr_t ram_addr);
 #endif
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index ccadfbbe72..a3184eb3d2 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -1450,7 +1450,6 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
-            kvm_hwpoison_page_add(ram_addr);
             /*
              * If this is a BUS_MCEERR_AR, we know we have been called
              * synchronously from the vCPU thread, so we can easily
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 9cf8e03669..fb72b349ed 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -622,7 +622,6 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
-            kvm_hwpoison_page_add(ram_addr);
             kvm_mce_inject(cpu, paddr, code);
 
             /*