
[v2,1/1] migration: skip poisoned memory pages on "ram saving" phase

Message ID 20230914202054.3551250-2-william.roche@oracle.com (mailing list archive)
State New, archived
Series Qemu crashes on VM migration after a handled memory error

Commit Message

William Roche Sept. 14, 2023, 8:20 p.m. UTC
From: William Roche <william.roche@oracle.com>

A memory page poisoned from the hypervisor level is no longer readable.
Thus, it is now treated as a zero-page for the ram saving migration phase.

The migration of a VM will crash Qemu when it tries to read the
memory address space and stumbles on the poisoned page, producing a
stack trace similar to:

Program terminated with signal SIGBUS, Bus error.
#0  _mm256_loadu_si256
#1  buffer_zero_avx2
#2  select_accel_fn
#3  buffer_is_zero
#4  save_zero_page_to_file
#5  save_zero_page
#6  ram_save_target_page_legacy
#7  ram_save_host_page
#8  ram_find_and_save_block
#9  ram_save_iterate
#10 qemu_savevm_state_iterate
#11 migration_iteration_run
#12 migration_thread
#13 qemu_thread_start

Fix it by considering poisoned pages as if they were zero-pages for
the migration copy. This fix also works with underlying large pages,
taking into account the RAMBlock segment "page-size".

Standard migration and compressed transfers are handled by this code.
RDMA transfer isn't touched.

Signed-off-by: William Roche <william.roche@oracle.com>
---
 accel/kvm/kvm-all.c      | 14 ++++++++++++++
 accel/stubs/kvm-stub.c   |  5 +++++
 include/sysemu/kvm.h     | 10 ++++++++++
 migration/ram-compress.c |  3 ++-
 migration/ram.c          | 23 +++++++++++++++++++++--
 migration/ram.h          |  2 ++
 6 files changed, 54 insertions(+), 3 deletions(-)

Comments

Zhijian Li (Fujitsu) Sept. 15, 2023, 3:13 a.m. UTC | #1
On 15/09/2023 04:20, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
> 
> A memory page poisoned from the hypervisor level is no longer readable.
> Thus, it is now treated as a zero-page for the ram saving migration phase.
> 
> The migration of a VM will crash Qemu when it tries to read the
> memory address space and stumbles on the poisoned page, producing a
> stack trace similar to:
> 
> Program terminated with signal SIGBUS, Bus error.
> #0  _mm256_loadu_si256
> #1  buffer_zero_avx2
> #2  select_accel_fn
> #3  buffer_is_zero
> #4  save_zero_page_to_file
> #5  save_zero_page
> #6  ram_save_target_page_legacy
> #7  ram_save_host_page
> #8  ram_find_and_save_block
> #9  ram_save_iterate
> #10 qemu_savevm_state_iterate
> #11 migration_iteration_run
> #12 migration_thread
> #13 qemu_thread_start
> 
> Fix it by considering poisoned pages as if they were zero-pages for
> the migration copy. This fix also works with underlying large pages,
> taking into account the RAMBlock segment "page-size".
> 
> Standard migration and compressed transfers are handled by this code.
> RDMA transfer isn't touched.
> 


I'm okay with "RDMA isn't touched".
BTW, could you share your reproducing program/hack to poison the page, so that
I am able to take a look at the RDMA part later when I'm free?

Not sure it's suitable to acknowledge an untouched part. Anyway
Acked-by: Li Zhijian <lizhijian@fujitsu.com> # RDMA


> [... Signed-off-by, diffstat and full patch snipped; see the Patch section below ...]
William Roche Sept. 15, 2023, 11:31 a.m. UTC | #2
On 9/15/23 05:13, Zhijian Li (Fujitsu) wrote:
> 
> 
> I'm okay with "RDMA isn't touched".
> BTW, could you share your reproducing program/hack to poison the page, so that
> I am able to take a look at the RDMA part later when I'm free?
> 
> Not sure it's suitable to acknowledge an untouched part. Anyway
> Acked-by: Li Zhijian <lizhijian@fujitsu.com> # RDMA
> 

Thanks.
As you asked for a procedure to inject memory errors into a running VM,
I've attached to this email the source code (mce_process_react.c) of a
program that will help to target the error injection in the VM.

(Be careful that error injection is currently not working on AMD
platforms -- this is a work in progress in a separate qemu thread)


The general idea:
We are going to target a memory page of a process running inside a VM to
see what happens when we inject an error on the underlying physical page
at the platform (hypervisor) level.
To have a better view of what's going on, we'll use a process made for
this: its goal is to allocate a memory page and install a SIGBUS handler
to report when it receives this signal. It will also wait before touching
this page, so we can see what happens next.
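
For reference, here is a minimal illustrative sketch of such a tester
(the real mce_process_react.c attached to this mail is more complete;
the -e/-w option handling and the physical-address lookup through
/proc/self/pagemap are omitted here):

/* sigbus_demo.c - illustrative sketch; build: gcc -o sigbus_demo sigbus_demo.c */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>

/* printf() is not async-signal-safe, but good enough for a demo. */
static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
    printf("Signal %d received\n", sig);
    if (si->si_code == BUS_MCEERR_AO) {        /* asynchronous report */
        printf("BUS_MCEERR_AO on vaddr: %p\n", si->si_addr);
    } else if (si->si_code == BUS_MCEERR_AR) { /* raised on access */
        printf("BUS_MCEERR_AR on vaddr: %p\n", si->si_addr);
        printf("Exit from the signal handler on BUS_MCEERR_AR\n");
        _exit(1);
    }
}

int main(void)
{
    struct sigaction sa = { .sa_sigaction = sigbus_handler,
                            .sa_flags = SA_SIGINFO };
    long psz = sysconf(_SC_PAGESIZE);
    char *page;

    sigaction(SIGBUS, &sa, NULL);
    /* "early kill": deliver BUS_MCEERR_AO as soon as a page is poisoned */
    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);

    page = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(page, 0x55, psz);   /* make sure the page is really backed */
    printf("Data page at %p\n", (void *)page);

    printf("Press ENTER to continue with page reading\n");
    getchar();
    printf("Read back: 0x%x\n", page[0]); /* SIGBUS here if poisoned */
    return 0;
}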

     Compiling this tool:
     $ gcc -o mce_process_react_x86 mce_process_react.c


Let's try that:
This procedure shows the best case scenario, where an error injected at
the platform level is reported up to the guest process using it.
Note that qemu should be started with root privilege.

     1. Choose a process running in the VM (identify a memory page you
want to target and get its physical address – crash(8) vtop can help
with that), or run the attached mce_process_react example (compiled for
your platform as mce_process_react_[x86|arm]) with the options to be
informed early of an _AO error (-e) and to wait for ENTER before reading
the allocated page (-w 0):

[root@VM ]# ./mce_process_react_x86 -e -w 0
Setting Early kill... Ok

Data pages at 0x7fa0f9b25000  physically 0x200f2fa000

Press ENTER to continue with page reading


     2. Go into the VM monitor to get the translation from "Guest
Physical Address to Host Physical Address" or "Host Virtual Address":

  (qemu) gpa2hpa 0x200f2fa000
Host physical address for 0x200f2fa000 (ram-node1) is 0x46f12fa000


     3. Before we inject the error, we want to keep track of the VM
console output (in a separate window).
If you are using libvirt: # virsh console myvm


     4. We now prepare the error injection at the platform level on the
address we found. To do so, we'll need to use the hwpoison-inject
module (x86).
Be careful, as hwpoison takes Page Frame Numbers, and the PFN is not the
physical address – you need to remove the last 12 bits (the last 3 hex
digits, zeros here, of the above address)!
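
As a sanity check, the PFN is just the physical address shifted right by
12 bits; a tiny (hypothetical) snippet using the addresses from this run:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t hpa = 0x46f12fa000ULL;  /* host physical address from gpa2hpa */
    printf("pfn: 0x%lx\n", (unsigned long)(hpa >> 12)); /* prints 0x46f12fa */
    return 0;
}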

[root@hv ]# modprobe hwpoison-inject
[root@hv ]# echo 0x46f12fa > /sys/kernel/debug/hwpoison/corrupt-pfn

        If you see an "Operation not permitted" error when writing to
corrupt-pfn as root, you may be facing kernel_lockdown(7), which is
enabled on SecureBoot systems (this can be verified with
"mokutil --sb-state"). In this case, turn SecureBoot off (at the UEFI
level, for example).

     5. Look at the qemu output (either on the terminal where qemu was
started or, if you are using libvirt: tail /var/log/libvirt/qemu/myvm):

2022-08-31T13:52:25.645398Z qemu-system-x86_64: warning: Guest MCE Memory Error at QEMU addr 0x7eeeace00000 and GUEST addr 0x200f200 of type BUS_MCEERR_AO injected

     6. On the guest console:
We'll see the VM reaction to the injected error:

[  155.805149] Disabling lock debugging due to kernel taint
[  155.806174] mce: [Hardware Error]: Machine check events logged
[  155.807120] Memory failure: 0x200f200: Killing mce_process_rea:3548 due to hardware memory corruption
[  155.808877] Memory failure: 0x200f200: recovery action for dirty LRU page: Recovered

     7. The Guest process that we started at the first step gives:

Signal 7 received
BUS_MCEERR_AO on vaddr: 0x7fa0f9b25000

At this stage, the VM has a poisoned page, and migrating this VM
requires the fix in order to avoid accessing the poisoned page.

     8. The process continues to run (as it handled the SIGBUS).
Now if you press ENTER on this process terminal, it will try to read the
page, which will generate a new (synchronous) MCE at the VM level that
will be delivered to this process:

Signal 7 received
BUS_MCEERR_AR on vaddr: 0x7fa0f9b25000
Exit from the signal handler on BUS_MCEERR_AR

     9. The VM console shows:
[ 2520.895263] MCE: Killing mce_process_rea:3548 due to hardware memory corruption fault at 7f45e5265000

     10. The VM continues to run, with a poisoned page in its address
space.

HTH,
William.
Zhijian Li (Fujitsu) Sept. 18, 2023, 3:47 a.m. UTC | #3
On 15/09/2023 19:31, William Roche wrote:
> On 9/15/23 05:13, Zhijian Li (Fujitsu) wrote:
>>
>>
>> I'm okay with "RDMA isn't touched".
>> BTW, could you share your reproducing program/hack to poison the page, so that
>> I am able to take a look at the RDMA part later when I'm free?
>>
>> Not sure it's suitable to acknowledge an untouched part. Anyway
>> Acked-by: Li Zhijian <lizhijian@fujitsu.com> # RDMA
>>
> 
> Thanks.
> As you asked for a procedure to inject memory errors into a running VM,
> I've attached to this email the source code (mce_process_react.c) of a
> program that will help to target the error injection in the VM.
> 


Thank you very much for your details, I'll mark it :)

Thanks
Zhijian



> [... rest of William's injection procedure snipped (shown in full above) ...]
Zhijian Li (Fujitsu) Sept. 20, 2023, 10:04 a.m. UTC | #4
On 15/09/2023 19:31, William Roche wrote:
> On 9/15/23 05:13, Zhijian Li (Fujitsu) wrote:
>>
>>
>> I'm okay with "RDMA isn't touched".
>> BTW, could you share your reproducing program/hack to poison the page, so that
>> I am able to take a look at the RDMA part later when I'm free?
>>
>> Not sure it's suitable to acknowledge an untouched part. Anyway
>> Acked-by: Li Zhijian <lizhijian@fujitsu.com> # RDMA
>>
> 
> Thanks.
> As you asked for a procedure to inject memory errors into a running VM,
> I've attached to this email the source code (mce_process_react.c) of a
> program that will help to target the error injection in the VM.


I just tried your hwpoison program and did an RDMA migration. The migration failed, but fortunately
the source side is still alive :).

(qemu) Failed to register chunk!: Bad address
Chunk details: block: 0 chunk index 671 start 139955096518656 end 139955097567232 host 139955096518656 local 139954392924160 registrations: 636
qemu-system-x86_64: cannot get lkey
qemu-system-x86_64: rdma migration: write error! -22
qemu-system-x86_64: RDMA is in an error state waiting migration to abort!
qemu-system-x86_64: failed to save SaveStateEntry with id(name): 2(ram): -22
qemu-system-x86_64: Early error. Sending error.


Since the current RDMA migration transfers guest memory in chunks (1MB by default), we may need to:

option 1: reduce the chunk size to 1 page
option 2: handle the hwpoisoned chunk specially (see the sketch below)

However, since there may be a chance to use another protocol, it's also possible to leave this issue unfixed for now.
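
For what it's worth, a rough sketch of option 2 could reuse the
kvm_hwpoisoned_page() helper introduced by this patch. The
chunk_has_hwpoison() name and its exact integration point in the RDMA
chunk-registration path are hypothetical:

/* Hypothetical helper for migration/rdma.c (assumes QEMU's
 * "exec/ramblock.h" and "sysemu/kvm.h"): walk a chunk page by page and
 * report whether any page in it is hwpoisoned, so the caller can skip
 * the chunk registration and send it as zero pages instead of reading
 * the poisoned memory. */
static bool chunk_has_hwpoison(RAMBlock *block, ram_addr_t chunk_start,
                               size_t chunk_len)
{
    ram_addr_t offset;

    for (offset = chunk_start; offset < chunk_start + chunk_len;
         offset += block->page_size) {
        if (kvm_enabled() && kvm_hwpoisoned_page(block, (void *)offset)) {
            return true;
        }
    }
    return false;
}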

Tested-by: Li Zhijian <lizhijian@fujitsu.com>

Thanks
Zhijian




> [... William's injection procedure snipped (shown in full above) ...]
William Roche Sept. 20, 2023, 12:11 p.m. UTC | #5
Thank you Zhijian for your feedback.

So I'll try to push this change today.

Cheers,
William.


On 9/20/23 12:04, Zhijian Li (Fujitsu) wrote:
> [... Zhijian's test report snipped (shown in full above) ...]

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ff1578bb32..7fb13c8a56 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1152,6 +1152,20 @@  static void kvm_unpoison_all(void *param)
     }
 }
 
+bool kvm_hwpoisoned_page(RAMBlock *block, void *offset)
+{
+    HWPoisonPage *pg;
+    ram_addr_t ram_addr = (ram_addr_t) offset;
+
+    QLIST_FOREACH(pg, &hwpoison_page_list, list) {
+        if ((ram_addr >= pg->ram_addr) &&
+            (ram_addr - pg->ram_addr < block->page_size)) {
+            return true;
+        }
+    }
+    return false;
+}
+
 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
 {
     HWPoisonPage *page;
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 235dc661bc..c0a31611df 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -133,3 +133,8 @@  uint32_t kvm_dirty_ring_size(void)
 {
     return 0;
 }
+
+bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr)
+{
+    return false;
+}
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index ee9025f8e9..858688227a 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -570,4 +570,14 @@  bool kvm_arch_cpu_check_are_resettable(void);
 bool kvm_dirty_ring_enabled(void);
 
 uint32_t kvm_dirty_ring_size(void);
+
+/**
+ * kvm_hwpoisoned_page - indicate if the given page is poisoned
+ * @block: memory block of the given page
+ * @ram_addr: offset of the page
+ *
+ * Returns: true: page is poisoned
+ *          false: page not yet poisoned
+ */
+bool kvm_hwpoisoned_page(RAMBlock *block, void *ram_addr);
 #endif
diff --git a/migration/ram-compress.c b/migration/ram-compress.c
index 06254d8c69..1916ce709d 100644
--- a/migration/ram-compress.c
+++ b/migration/ram-compress.c
@@ -34,6 +34,7 @@ 
 #include "qemu/error-report.h"
 #include "migration.h"
 #include "options.h"
+#include "ram.h"
 #include "io/channel-null.h"
 #include "exec/target_page.h"
 #include "exec/ramblock.h"
@@ -198,7 +199,7 @@  static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream,
 
     assert(qemu_file_buffer_empty(f));
 
-    if (buffer_is_zero(p, page_size)) {
+    if (migration_buffer_is_zero(block, offset, page_size)) {
         return RES_ZEROPAGE;
     }
 
diff --git a/migration/ram.c b/migration/ram.c
index 9040d66e61..fd337f7e65 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1129,6 +1129,26 @@  void ram_release_page(const char *rbname, uint64_t offset)
     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
 }
 
+/**
+ * migration_buffer_is_zero: indicate if the page at the given
+ * location is entirely filled with zero, or is a poisoned page.
+ *
+ * @block: block that contains the page
+ * @offset: offset inside the block for the page
+ * @len: size to consider
+ */
+bool migration_buffer_is_zero(RAMBlock *block, ram_addr_t offset,
+                                     size_t len)
+{
+    uint8_t *p = block->host + offset;
+
+    if (kvm_enabled() && kvm_hwpoisoned_page(block, (void *)offset)) {
+        return true;
+    }
+
+    return buffer_is_zero(p, len);
+}
+
 /**
  * save_zero_page_to_file: send the zero page to the file
  *
@@ -1142,10 +1162,9 @@  void ram_release_page(const char *rbname, uint64_t offset)
 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
                                   RAMBlock *block, ram_addr_t offset)
 {
-    uint8_t *p = block->host + offset;
     int len = 0;
 
-    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
+    if (migration_buffer_is_zero(block, offset, TARGET_PAGE_SIZE)) {
         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
         qemu_put_byte(file, 0);
         len += 1;
diff --git a/migration/ram.h b/migration/ram.h
index 145c915ca7..805ea2a211 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -65,6 +65,8 @@  void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
 void ram_transferred_add(uint64_t bytes);
 void ram_release_page(const char *rbname, uint64_t offset);
 
+bool migration_buffer_is_zero(RAMBlock *block, ram_addr_t offset, size_t len);
+
 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr);
 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset);
 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr);