diff mbox series

[3/3] migration: Pre-fault memory before starting background snasphot

Message ID 20210318174611.293520-4-andrey.gruzdev@virtuozzo.com (mailing list archive)
State New, archived
Headers show
Series migration: Fixes to the 'background-snapshot' code | expand

Commit Message

Andrey Gruzdev March 18, 2021, 5:46 p.m. UTC
This commit solves the issue with userfault_fd WP feature that
background snapshot is based on. For any never poluated or discarded
memory page, the UFFDIO_WRITEPROTECT ioctl() would skip updating
PTE for that page, thereby loosing WP setting for it.

So we need to pre-fault pages for each RAM block to be protected
before making a userfault_fd wr-protect ioctl().

Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
---
 migration/migration.c |  6 +++++
 migration/ram.c       | 51 +++++++++++++++++++++++++++++++++++++++++++
 migration/ram.h       |  1 +
 3 files changed, 58 insertions(+)

Comments

David Hildenbrand March 19, 2021, 9:28 a.m. UTC | #1
> +/*
> + * ram_block_populate_pages: populate memory in the RAM block by reading
> + *   an integer from the beginning of each page.
> + *
> + * Since it's solely used for userfault_fd WP feature, here we just
> + *   hardcode page size to TARGET_PAGE_SIZE.
> + *
> + * @bs: RAM block to populate
> + */
> +volatile int ram_block_populate_pages__tmp;
> +static void ram_block_populate_pages(RAMBlock *bs)
> +{
> +    ram_addr_t offset = 0;
> +    int tmp = 0;
> +
> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {

You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE

> +        /* Try to do it without memory writes */
> +        tmp += *(volatile int *) ptr;
> +    }


The following is slightly simpler and doesn't rely on volatile semantics [1].
Should work on any arch I guess.

static void ram_block_populate_pages(RAMBlock *bs)
{
     char *ptr = (char *) bs->host;
     ram_addr_t offset;

     for (offset = 0; offset < bs->used_length;
          offset += qemu_real_host_page_size) {
	char tmp = *(volatile char *)(ptr + offset)

	/* Don't optimize the read out. */
	asm volatile ("" : "+r" (tmp));
}

Compiles to

     for (offset = 0; offset < bs->used_length;
     316d:       48 8b 4b 30             mov    0x30(%rbx),%rcx
     char *ptr = (char *) bs->host;
     3171:       48 8b 73 18             mov    0x18(%rbx),%rsi
     for (offset = 0; offset < bs->used_length;
     3175:       48 85 c9                test   %rcx,%rcx
     3178:       74 ce                   je     3148 <ram_write_tracking_prepare+0x58>
          offset += qemu_real_host_page_size) {
     317a:       48 8b 05 00 00 00 00    mov    0x0(%rip),%rax        # 3181 <ram_write_tracking_prepare+0x91>
     3181:       48 8b 38                mov    (%rax),%rdi
     3184:       0f 1f 40 00             nopl   0x0(%rax)
         char tmp = *(volatile char *)(ptr + offset);
     3188:       48 8d 04 16             lea    (%rsi,%rdx,1),%rax
     318c:       0f b6 00                movzbl (%rax),%eax
          offset += qemu_real_host_page_size) {
     318f:       48 01 fa                add    %rdi,%rdx
     for (offset = 0; offset < bs->used_length;
     3192:       48 39 ca                cmp    %rcx,%rdx
     3195:       72 f1                   jb     3188 <ram_write_tracking_prepare+0x98>


[1] https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/


I'll send patches soon to take care of virtio-mem via RamDiscardManager -
to skip populating the parts that are supposed to remain discarded and not migrated.
Unfortunately, the RamDiscardManager patches are still stuck waiting for
acks ... and now we're in soft-freeze.
David Hildenbrand March 19, 2021, 9:32 a.m. UTC | #2
On 19.03.21 10:28, David Hildenbrand wrote:
>> +/*
>> + * ram_block_populate_pages: populate memory in the RAM block by reading
>> + *   an integer from the beginning of each page.
>> + *
>> + * Since it's solely used for userfault_fd WP feature, here we just
>> + *   hardcode page size to TARGET_PAGE_SIZE.
>> + *
>> + * @bs: RAM block to populate
>> + */
>> +volatile int ram_block_populate_pages__tmp;
>> +static void ram_block_populate_pages(RAMBlock *bs)
>> +{
>> +    ram_addr_t offset = 0;
>> +    int tmp = 0;
>> +
>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
> 
> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
> 
>> +        /* Try to do it without memory writes */
>> +        tmp += *(volatile int *) ptr;
>> +    }
> 
> 
> The following is slightly simpler and doesn't rely on volatile semantics [1].
> Should work on any arch I guess.
> 
> static void ram_block_populate_pages(RAMBlock *bs)
> {
>       char *ptr = (char *) bs->host;
>       ram_addr_t offset;
> 
>       for (offset = 0; offset < bs->used_length;
>            offset += qemu_real_host_page_size) {
> 	char tmp = *(volatile char *)(ptr + offset)

I wanted to do a "= *(ptr + offset)" here.

> 
> 	/* Don't optimize the read out. */
> 	asm volatile ("" : "+r" (tmp));

So this is the only volatile thing that the compiler must guarantee to 
not optimize away.
Andrey Gruzdev March 19, 2021, 11:05 a.m. UTC | #3
On 19.03.2021 12:28, David Hildenbrand wrote:
>> +/*
>> + * ram_block_populate_pages: populate memory in the RAM block by 
>> reading
>> + *   an integer from the beginning of each page.
>> + *
>> + * Since it's solely used for userfault_fd WP feature, here we just
>> + *   hardcode page size to TARGET_PAGE_SIZE.
>> + *
>> + * @bs: RAM block to populate
>> + */
>> +volatile int ram_block_populate_pages__tmp;
>> +static void ram_block_populate_pages(RAMBlock *bs)
>> +{
>> +    ram_addr_t offset = 0;
>> +    int tmp = 0;
>> +
>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
>
> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
>
Ok.
>> +        /* Try to do it without memory writes */
>> +        tmp += *(volatile int *) ptr;
>> +    }
>
>
> The following is slightly simpler and doesn't rely on volatile 
> semantics [1].
> Should work on any arch I guess.
>
> static void ram_block_populate_pages(RAMBlock *bs)
> {
>     char *ptr = (char *) bs->host;
>     ram_addr_t offset;
>
>     for (offset = 0; offset < bs->used_length;
>          offset += qemu_real_host_page_size) {
>     char tmp = *(volatile char *)(ptr + offset)
>
>     /* Don't optimize the read out. */
>     asm volatile ("" : "+r" (tmp));
> }
>
Thanks, good option, I'll change the code.

> Compiles to
>
>     for (offset = 0; offset < bs->used_length;
>     316d:       48 8b 4b 30             mov    0x30(%rbx),%rcx
>     char *ptr = (char *) bs->host;
>     3171:       48 8b 73 18             mov    0x18(%rbx),%rsi
>     for (offset = 0; offset < bs->used_length;
>     3175:       48 85 c9                test   %rcx,%rcx
>     3178:       74 ce                   je     3148 
> <ram_write_tracking_prepare+0x58>
>          offset += qemu_real_host_page_size) {
>     317a:       48 8b 05 00 00 00 00    mov 0x0(%rip),%rax        # 
> 3181 <ram_write_tracking_prepare+0x91>
>     3181:       48 8b 38                mov    (%rax),%rdi
>     3184:       0f 1f 40 00             nopl   0x0(%rax)
>         char tmp = *(volatile char *)(ptr + offset);
>     3188:       48 8d 04 16             lea    (%rsi,%rdx,1),%rax
>     318c:       0f b6 00                movzbl (%rax),%eax
>          offset += qemu_real_host_page_size) {
>     318f:       48 01 fa                add    %rdi,%rdx
>     for (offset = 0; offset < bs->used_length;
>     3192:       48 39 ca                cmp    %rcx,%rdx
>     3195:       72 f1                   jb     3188 
> <ram_write_tracking_prepare+0x98>
>
>
> [1] 
> https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/
>
>
> I'll send patches soon to take care of virtio-mem via RamDiscardManager -
> to skip populating the parts that are supposed to remain discarded and 
> not migrated.
> Unfortunately, the RamDiscardManager patches are still stuck waiting for
> acks ... and now we're in soft-freeze.
>
RamDiscardManager patches - do they also modify migration code?
I mean which part is responsible of not migrating discarded ranges.
Andrey Gruzdev March 19, 2021, 11:09 a.m. UTC | #4
On 19.03.2021 12:32, David Hildenbrand wrote:
> On 19.03.21 10:28, David Hildenbrand wrote:
>>> +/*
>>> + * ram_block_populate_pages: populate memory in the RAM block by 
>>> reading
>>> + *   an integer from the beginning of each page.
>>> + *
>>> + * Since it's solely used for userfault_fd WP feature, here we just
>>> + *   hardcode page size to TARGET_PAGE_SIZE.
>>> + *
>>> + * @bs: RAM block to populate
>>> + */
>>> +volatile int ram_block_populate_pages__tmp;
>>> +static void ram_block_populate_pages(RAMBlock *bs)
>>> +{
>>> +    ram_addr_t offset = 0;
>>> +    int tmp = 0;
>>> +
>>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
>>
>> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
>>
>>> +        /* Try to do it without memory writes */
>>> +        tmp += *(volatile int *) ptr;
>>> +    }
>>
>>
>> The following is slightly simpler and doesn't rely on volatile 
>> semantics [1].
>> Should work on any arch I guess.
>>
>> static void ram_block_populate_pages(RAMBlock *bs)
>> {
>>       char *ptr = (char *) bs->host;
>>       ram_addr_t offset;
>>
>>       for (offset = 0; offset < bs->used_length;
>>            offset += qemu_real_host_page_size) {
>>     char tmp = *(volatile char *)(ptr + offset)
>
> I wanted to do a "= *(ptr + offset)" here.
>
Yep
>>
>>     /* Don't optimize the read out. */
>>     asm volatile ("" : "+r" (tmp));
>
> So this is the only volatile thing that the compiler must guarantee to 
> not optimize away.
>
>
David Hildenbrand March 19, 2021, 11:27 a.m. UTC | #5
On 19.03.21 12:05, Andrey Gruzdev wrote:
> On 19.03.2021 12:28, David Hildenbrand wrote:
>>> +/*
>>> + * ram_block_populate_pages: populate memory in the RAM block by
>>> reading
>>> + *   an integer from the beginning of each page.
>>> + *
>>> + * Since it's solely used for userfault_fd WP feature, here we just
>>> + *   hardcode page size to TARGET_PAGE_SIZE.
>>> + *
>>> + * @bs: RAM block to populate
>>> + */
>>> +volatile int ram_block_populate_pages__tmp;
>>> +static void ram_block_populate_pages(RAMBlock *bs)
>>> +{
>>> +    ram_addr_t offset = 0;
>>> +    int tmp = 0;
>>> +
>>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
>>
>> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
>>
> Ok.
>>> +        /* Try to do it without memory writes */
>>> +        tmp += *(volatile int *) ptr;
>>> +    }
>>
>>
>> The following is slightly simpler and doesn't rely on volatile
>> semantics [1].
>> Should work on any arch I guess.
>>
>> static void ram_block_populate_pages(RAMBlock *bs)
>> {
>>      char *ptr = (char *) bs->host;
>>      ram_addr_t offset;
>>
>>      for (offset = 0; offset < bs->used_length;
>>           offset += qemu_real_host_page_size) {
>>      char tmp = *(volatile char *)(ptr + offset)
>>
>>      /* Don't optimize the read out. */
>>      asm volatile ("" : "+r" (tmp));
>> }
>>
> Thanks, good option, I'll change the code.
> 
>> Compiles to
>>
>>      for (offset = 0; offset < bs->used_length;
>>      316d:       48 8b 4b 30             mov    0x30(%rbx),%rcx
>>      char *ptr = (char *) bs->host;
>>      3171:       48 8b 73 18             mov    0x18(%rbx),%rsi
>>      for (offset = 0; offset < bs->used_length;
>>      3175:       48 85 c9                test   %rcx,%rcx
>>      3178:       74 ce                   je     3148
>> <ram_write_tracking_prepare+0x58>
>>           offset += qemu_real_host_page_size) {
>>      317a:       48 8b 05 00 00 00 00    mov 0x0(%rip),%rax        #
>> 3181 <ram_write_tracking_prepare+0x91>
>>      3181:       48 8b 38                mov    (%rax),%rdi
>>      3184:       0f 1f 40 00             nopl   0x0(%rax)
>>          char tmp = *(volatile char *)(ptr + offset);
>>      3188:       48 8d 04 16             lea    (%rsi,%rdx,1),%rax
>>      318c:       0f b6 00                movzbl (%rax),%eax
>>           offset += qemu_real_host_page_size) {
>>      318f:       48 01 fa                add    %rdi,%rdx
>>      for (offset = 0; offset < bs->used_length;
>>      3192:       48 39 ca                cmp    %rcx,%rdx
>>      3195:       72 f1                   jb     3188
>> <ram_write_tracking_prepare+0x98>
>>
>>
>> [1]
>> https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/
>>
>>
>> I'll send patches soon to take care of virtio-mem via RamDiscardManager -
>> to skip populating the parts that are supposed to remain discarded and
>> not migrated.
>> Unfortunately, the RamDiscardManager patches are still stuck waiting for
>> acks ... and now we're in soft-freeze.
>>
> RamDiscardManager patches - do they also modify migration code?
> I mean which part is responsible of not migrating discarded ranges.

I haven't shared relevant patches yet that touch migration code. I'm 
planning on doing that once the generic RamDiscardManager has all 
relevant acks. I'll put you on cc.
Andrey Gruzdev March 19, 2021, 12:37 p.m. UTC | #6
On 19.03.2021 14:27, David Hildenbrand wrote:
> On 19.03.21 12:05, Andrey Gruzdev wrote:
>> On 19.03.2021 12:28, David Hildenbrand wrote:
>>>> +/*
>>>> + * ram_block_populate_pages: populate memory in the RAM block by
>>>> reading
>>>> + *   an integer from the beginning of each page.
>>>> + *
>>>> + * Since it's solely used for userfault_fd WP feature, here we just
>>>> + *   hardcode page size to TARGET_PAGE_SIZE.
>>>> + *
>>>> + * @bs: RAM block to populate
>>>> + */
>>>> +volatile int ram_block_populate_pages__tmp;
>>>> +static void ram_block_populate_pages(RAMBlock *bs)
>>>> +{
>>>> +    ram_addr_t offset = 0;
>>>> +    int tmp = 0;
>>>> +
>>>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>>>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
>>>
>>> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
>>>
>> Ok.
>>>> +        /* Try to do it without memory writes */
>>>> +        tmp += *(volatile int *) ptr;
>>>> +    }
>>>
>>>
>>> The following is slightly simpler and doesn't rely on volatile
>>> semantics [1].
>>> Should work on any arch I guess.
>>>
>>> static void ram_block_populate_pages(RAMBlock *bs)
>>> {
>>>      char *ptr = (char *) bs->host;
>>>      ram_addr_t offset;
>>>
>>>      for (offset = 0; offset < bs->used_length;
>>>           offset += qemu_real_host_page_size) {
>>>      char tmp = *(volatile char *)(ptr + offset)
>>>
>>>      /* Don't optimize the read out. */
>>>      asm volatile ("" : "+r" (tmp));
>>> }
>>>
>> Thanks, good option, I'll change the code.
>>
>>> Compiles to
>>>
>>>      for (offset = 0; offset < bs->used_length;
>>>      316d:       48 8b 4b 30             mov 0x30(%rbx),%rcx
>>>      char *ptr = (char *) bs->host;
>>>      3171:       48 8b 73 18             mov 0x18(%rbx),%rsi
>>>      for (offset = 0; offset < bs->used_length;
>>>      3175:       48 85 c9                test   %rcx,%rcx
>>>      3178:       74 ce                   je     3148
>>> <ram_write_tracking_prepare+0x58>
>>>           offset += qemu_real_host_page_size) {
>>>      317a:       48 8b 05 00 00 00 00    mov 0x0(%rip),%rax        #
>>> 3181 <ram_write_tracking_prepare+0x91>
>>>      3181:       48 8b 38                mov    (%rax),%rdi
>>>      3184:       0f 1f 40 00             nopl   0x0(%rax)
>>>          char tmp = *(volatile char *)(ptr + offset);
>>>      3188:       48 8d 04 16             lea (%rsi,%rdx,1),%rax
>>>      318c:       0f b6 00                movzbl (%rax),%eax
>>>           offset += qemu_real_host_page_size) {
>>>      318f:       48 01 fa                add    %rdi,%rdx
>>>      for (offset = 0; offset < bs->used_length;
>>>      3192:       48 39 ca                cmp    %rcx,%rdx
>>>      3195:       72 f1                   jb     3188
>>> <ram_write_tracking_prepare+0x98>
>>>
>>>
>>> [1]
>>> https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/ 
>>>
>>>
>>>
>>> I'll send patches soon to take care of virtio-mem via 
>>> RamDiscardManager -
>>> to skip populating the parts that are supposed to remain discarded and
>>> not migrated.
>>> Unfortunately, the RamDiscardManager patches are still stuck waiting 
>>> for
>>> acks ... and now we're in soft-freeze.
>>>
>> RamDiscardManager patches - do they also modify migration code?
>> I mean which part is responsible of not migrating discarded ranges.
>
> I haven't shared relevant patches yet that touch migration code. I'm 
> planning on doing that once the generic RamDiscardManager has all 
> relevant acks. I'll put you on cc.
>
Got it, thanks.
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index 656d6249a6..496e88cbda 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3872,6 +3872,12 @@  static void *bg_migration_thread(void *opaque)
 
     update_iteration_initial_status(s);
 
+    /*
+     * Prepare for tracking memory writes with UFFD-WP - populate
+     * RAM pages before protecting.
+     */
+    ram_write_tracking_prepare();
+
     qemu_savevm_state_header(s->to_dst_file);
     qemu_savevm_state_setup(s->to_dst_file);
 
diff --git a/migration/ram.c b/migration/ram.c
index 52537f14ac..825eb80030 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1560,6 +1560,57 @@  out:
     return ret;
 }
 
+/*
+ * ram_block_populate_pages: populate memory in the RAM block by reading
+ *   an integer from the beginning of each page.
+ *
+ * Since it's solely used for userfault_fd WP feature, here we just
+ *   hardcode page size to TARGET_PAGE_SIZE.
+ *
+ * @bs: RAM block to populate
+ */
+volatile int ram_block_populate_pages__tmp;
+static void ram_block_populate_pages(RAMBlock *bs)
+{
+    ram_addr_t offset = 0;
+    int tmp = 0;
+
+    for (char *ptr = (char *) bs->host; offset < bs->used_length;
+            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
+        /* Try to do it without memory writes */
+        tmp += *(volatile int *) ptr;
+    }
+    /* Create dependency on 'extern volatile int' to avoid optimizing out */
+    ram_block_populate_pages__tmp += tmp;
+}
+
+/*
+ * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
+ */
+void ram_write_tracking_prepare(void)
+{
+    RAMBlock *bs;
+
+    RCU_READ_LOCK_GUARD();
+
+    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+        /* Nothing to do with read-only and MMIO-writable regions */
+        if (bs->mr->readonly || bs->mr->rom_device) {
+            continue;
+        }
+
+        /*
+         * Populate pages of the RAM block before enabling userfault_fd
+         * write protection.
+         *
+         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
+         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
+         * pages with pte_none() entries in page table.
+         */
+        ram_block_populate_pages(bs);
+    }
+}
+
 /*
  * ram_write_tracking_start: start UFFD-WP memory tracking
  *
diff --git a/migration/ram.h b/migration/ram.h
index 6378bb3ebc..4833e9fd5b 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -82,6 +82,7 @@  void colo_incoming_start_dirty_log(void);
 /* Background snapshot */
 bool ram_write_tracking_available(void);
 bool ram_write_tracking_compatible(void);
+void ram_write_tracking_prepare(void);
 int ram_write_tracking_start(void);
 void ram_write_tracking_stop(void);