Message ID | 20250228121749.553184-6-ppandit@redhat.com (mailing list archive)
---|---
State | New, archived
Series | Allow to enable multifd and postcopy migration together
On Fri, Feb 28, 2025 at 05:47:49PM +0530, Prasad Pandit wrote: > From: Prasad Pandit <pjp@fedoraproject.org> > > When Multifd and Postcopy migration is enabled together, > before starting Postcopy phase, Multifd threads are shutdown. > > But before this shutdown, we need to flush the Multifd channels > on the source side and notify the destination migration thread > to synchronise with the Multifd 'recv_x' threads, so that all > the Multifd data is received and processed on the destination > side, before Multifd threads are shutdown. > > This synchronisation happens when the main migration thread > waits for all the Multifd threads to receive their data. > > Add 'MULTIFD_RECV_SYNC' migration command to notify the > destination side to synchronise with Multifd threads. > > Suggested-by: Peter Xu <peterx@redhat.com> Thanks for trying to grant me the credit, but.. I suggest not having a new message at all. We should be able to do multifd's flush and sync before VM stopped in postcopy_start().. So we can drop this line. And whatever come up with, it needs to be with patch 2 because patch 2 currently is broken itself. > Signed-off-by: Prasad Pandit <pjp@fedoraproject.org> > --- > migration/migration.c | 3 +++ > migration/multifd-nocomp.c | 21 +++++++++++++++------ > migration/multifd.c | 1 + > migration/multifd.h | 1 + > migration/savevm.c | 13 +++++++++++++ > migration/savevm.h | 1 + > 6 files changed, 34 insertions(+), 6 deletions(-) > > v6: New patch, not from earlier versions. > - https://lore.kernel.org/qemu-devel/20250215123119.814345-1-ppandit@redhat.com/T/#t > > diff --git a/migration/migration.c b/migration/migration.c > index 5db9e18272..65fc4f5eed 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -3401,6 +3401,9 @@ static MigIterateState migration_iteration_run(MigrationState *s) > if (!in_postcopy && must_precopy <= s->threshold_size > && can_switchover && qatomic_read(&s->start_postcopy)) { > if (migrate_multifd()) { > + multifd_send_flush(); > + multifd_send_sync_main(MULTIFD_SYNC_LOCAL); > + qemu_savevm_send_multifd_recv_sync(s->to_dst_file); And.. this is not what I meant.. So again, there're more than one way to make sure no multifd pages will arrive during postcopy. What I actually think the easiest is to do flush and sync once in postcopy_start() as mentioned above. I think that'll serialize with postcopy messages later on the main channel, making sure all channels are flushed before moving to postcopy work. If you want to stick with shutdown channels, I'm OK. But then we at least need one message to say "the recv side finished joining all recv threads". That needs to be an ACK sent from dest to src, not src to dest. > multifd_send_shutdown(); > } > if (postcopy_start(s, &local_err)) { > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c > index d0edec7cd1..bbe07e4f7e 100644 > --- a/migration/multifd-nocomp.c > +++ b/migration/multifd-nocomp.c > @@ -334,7 +334,7 @@ retry: > * After flush, always retry. 
> */ > if (pages->block != block || multifd_queue_full(pages)) { > - if (!multifd_send(&multifd_ram_send)) { > + if (multifd_send_flush() < 0) { > return false; > } > goto retry; > @@ -387,6 +387,18 @@ bool multifd_ram_sync_per_round(void) > return !migrate_multifd_flush_after_each_section(); > } > > +int multifd_send_flush(void) > +{ > + if (!multifd_payload_empty(multifd_ram_send)) { > + if (!multifd_send(&multifd_ram_send)) { > + error_report("%s: multifd_send fail", __func__); > + return -1; > + } > + } > + > + return 0; > +} > + > int multifd_ram_flush_and_sync(QEMUFile *f) > { > MultiFDSyncReq req; > @@ -396,11 +408,8 @@ int multifd_ram_flush_and_sync(QEMUFile *f) > return 0; > } > > - if (!multifd_payload_empty(multifd_ram_send)) { > - if (!multifd_send(&multifd_ram_send)) { > - error_report("%s: multifd_send fail", __func__); > - return -1; > - } > + if ((ret = multifd_send_flush()) < 0) { > + return ret; > } > > /* File migrations only need to sync with threads */ > diff --git a/migration/multifd.c b/migration/multifd.c > index 2bd8604ca1..8928ca2611 100644 > --- a/migration/multifd.c > +++ b/migration/multifd.c > @@ -1265,6 +1265,7 @@ static void *multifd_recv_thread(void *opaque) > > rcu_unregister_thread(); > trace_multifd_recv_thread_end(p->id, p->packets_recved); > + qemu_sem_post(&multifd_recv_state->sem_sync); > > return NULL; > } > diff --git a/migration/multifd.h b/migration/multifd.h > index bff867ca6b..242b923633 100644 > --- a/migration/multifd.h > +++ b/migration/multifd.h > @@ -361,6 +361,7 @@ static inline uint32_t multifd_ram_page_count(void) > > void multifd_ram_save_setup(void); > void multifd_ram_save_cleanup(void); > +int multifd_send_flush(void); > int multifd_ram_flush_and_sync(QEMUFile *f); > bool multifd_ram_sync_per_round(void); > bool multifd_ram_sync_per_section(void); > diff --git a/migration/savevm.c b/migration/savevm.c > index 4046faf009..0b71e988ba 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -37,6 +37,7 @@ > #include "migration/register.h" > #include "migration/global_state.h" > #include "migration/channel-block.h" > +#include "multifd.h" > #include "ram.h" > #include "qemu-file.h" > #include "savevm.h" > @@ -90,6 +91,7 @@ enum qemu_vm_cmd { > MIG_CMD_ENABLE_COLO, /* Enable COLO */ > MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ > MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ > + MIG_CMD_MULTIFD_RECV_SYNC, /* Sync multifd recv_x and main threads */ > MIG_CMD_MAX > }; > > @@ -109,6 +111,7 @@ static struct mig_cmd_args { > [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" }, > [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, > [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, > + [MIG_CMD_MULTIFD_RECV_SYNC] = { .len = 0, .name = "MULTIFD_RECV_SYNC" }, > [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, > }; > > @@ -1201,6 +1204,12 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name) > qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf); > } > > +void qemu_savevm_send_multifd_recv_sync(QEMUFile *f) > +{ > + /* TBD: trace_savevm_send_multifd_recv_sync(); */ > + qemu_savevm_command_send(f, MIG_CMD_MULTIFD_RECV_SYNC, 0, NULL); > +} > + > bool qemu_savevm_state_blocked(Error **errp) > { > SaveStateEntry *se; > @@ -2479,6 +2488,10 @@ static int loadvm_process_command(QEMUFile *f) > case MIG_CMD_RECV_BITMAP: > return loadvm_handle_recv_bitmap(mis, len); > > + case MIG_CMD_MULTIFD_RECV_SYNC: > + multifd_recv_sync_main(); > + break; > + > case 
MIG_CMD_ENABLE_COLO: > return loadvm_process_enable_colo(mis); > } > diff --git a/migration/savevm.h b/migration/savevm.h > index 7957460062..91ae703925 100644 > --- a/migration/savevm.h > +++ b/migration/savevm.h > @@ -53,6 +53,7 @@ void qemu_savevm_send_postcopy_listen(QEMUFile *f); > void qemu_savevm_send_postcopy_run(QEMUFile *f); > void qemu_savevm_send_postcopy_resume(QEMUFile *f); > void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name); > +void qemu_savevm_send_multifd_recv_sync(QEMUFile *f); > > void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, > uint16_t len, > -- > 2.48.1 >
Hello Peter, On Fri, 28 Feb 2025 at 19:13, Peter Xu <peterx@redhat.com> wrote: > We should be able to do multifd's flush and sync before VM > stopped in postcopy_start().. > > What I actually think the easiest is to do flush and sync once in > postcopy_start() as mentioned above. I think that'll serialize with > postcopy messages later on the main channel, making sure all channels are > flushed before moving to postcopy work. * Is there some specific benefit to calling 'multifd_ram_flush_and_sync()' from OR before postcopy_start()? Maybe that is missing on me. * I'll try to explain why adding a migration command makes sense: - I did call 'multifd_ram_flush_and_sync()' before calling postcopy_start() in migration_iteration_run(). After flushing the 'multifd_send' queue, all it does is send 'RAM_SAVE_FLAG_MULTIFD_FLUSH' flag on the main channel. On the destination side the 'qemu_loadvm_state_main()' function does not understand that message, because it looks for 'section_type'; And when it is not able to identify the section, it leads to an error. "Expected vmdescription section, but got %d", section_type(=0)" - ie. multifd_ram_flush_and_sync() is not reusable by itself. To make it work, we need to add a (RAM) section header to the message. Because then it'll go to ram_load_precopy() and call -> multifd_recv_sync_main(). - But sending a lone RAM section header from migration_iteration_run() OR even in postcopy_start() does not seem right. That is out-of-place, because both migration_iteration_run() and postcopy_start() have nothing to do with RAM sections. - I think 'flush' and 'sync' ought to be small individual functions/operations that are section agnostic. We should be able to call 'flush' and 'sync' from anywhere in the code without side-effects. So tying 'flush' and 'sync' with RAM sections does not seem right, because we need to be able to call 'flush' and 'sync' from other parts too, like before calling postcopy_start() OR maybe some other code parts. - Another observation is: when multifd and postcopy are enabled together and the guest VM is writing to its RAM, multifd_ram_flush_and_sync() is called only during setup, not after that. ===== 2025-03-02 18:13:26.425+0000: initiating migration 2025-03-02T18:13:26.508809Z qemu-system-x86_64: multifd_ram_flush_and_sync: called 2025-03-02 18:15:20.070+0000: shutting down, reason=migrated 2025-03-03 11:26:41.340+0000: initiating migration 2025-03-03T11:26:41.415625Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-03 11:30:22.766+0000: shutting down, reason=migrated ===== - If we untie the 'flush and sync' from RAM sections and make it a general migration command, we shall be able to call it from anywhere else. > If you want to stick with shutdown channels, I'm OK. But then we at least > need one message to say "the recv side finished joining all recv threads". > That needs to be an ACK sent from dest to src, not src to dest. * But earlier we discussed 'flush and sync' is enough for that, no? Because 'multifd_ram_flush_and_sync' only notifies the destination to sync multifd_recv threads. It does not receive any acknowledgement back. * And multifd_recv_sync_main() function on the destination blocks the 'main' thread until all multfd_recv_threads (mig/dst/recv_x) have exited, only then it proceeds to accept the incoming new postcopy connection. Thank you. --- - Prasad
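The failure Prasad describes above comes from how the destination's main loop dispatches: it reads a section-type byte first, and RAM_SAVE_FLAG_* values only have meaning once control is inside the RAM section handler. A rough, self-contained model of that dispatch follows; all names and constants in it are illustrative stand-ins, not QEMU's actual code or wire encoding.
===
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Illustrative values only -- not QEMU's real wire encoding. */
enum { VM_SECTION_PART = 2, VM_COMMAND = 8 };
enum { RAM_FLAG_MULTIFD_FLUSH = 0x200, RAM_FLAG_EOS = 0x10 };

/* Stand-in for multifd_recv_sync_main() on the destination. */
static void multifd_recv_sync(void)
{
    puts("  ram loader: sync with multifd recv threads");
}

/* Stand-in for ram_load_precopy(): RAM_SAVE_FLAG_* values only mean
 * something once we are inside the "ram" section handler. */
static void ram_load(const uint64_t *flags, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        if (flags[i] == RAM_FLAG_MULTIFD_FLUSH) {
            multifd_recv_sync();
        } else if (flags[i] == RAM_FLAG_EOS) {
            return;
        }
    }
}

/* Stand-in for qemu_loadvm_state_main(): the first byte of anything on the
 * main channel must be a known section type. */
static int loadvm_main(uint8_t section_type, const uint64_t *payload, size_t n)
{
    switch (section_type) {
    case VM_SECTION_PART:
        puts("main loop: SECTION_PART -> route payload to the 'ram' handler");
        ram_load(payload, n);
        return 0;
    case VM_COMMAND:
        puts("main loop: VM_COMMAND -> loadvm_process_command()");
        return 0;
    default:
        printf("main loop: 'Expected vmdescription section, but got %d'\n",
               section_type);
        return -1;
    }
}

int main(void)
{
    uint64_t ram_payload[] = { RAM_FLAG_MULTIFD_FLUSH, RAM_FLAG_EOS };

    /* Bare flush flag on the main channel: read as a section type, rejected. */
    loadvm_main(0, NULL, 0);

    /* Same flag wrapped in a RAM section header: reaches the ram loader. */
    loadvm_main(VM_SECTION_PART, ram_payload, 2);
    return 0;
}
===
Sending the bare flush flag corresponds to the first call; wrapping it in a RAM section header corresponds to the second, which is why the header question keeps coming back in the rest of the thread.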
On Mon, Mar 03, 2025 at 05:13:56PM +0530, Prasad Pandit wrote: > Hello Peter, > > On Fri, 28 Feb 2025 at 19:13, Peter Xu <peterx@redhat.com> wrote: > > We should be able to do multifd's flush and sync before VM > > stopped in postcopy_start().. > > > > What I actually think the easiest is to do flush and sync once in > > postcopy_start() as mentioned above. I think that'll serialize with > > postcopy messages later on the main channel, making sure all channels are > > flushed before moving to postcopy work. > > * Is there some specific benefit to calling > 'multifd_ram_flush_and_sync()' from OR before postcopy_start()? Maybe > that is missing on me. > > * I'll try to explain why adding a migration command makes sense: > > - I did call 'multifd_ram_flush_and_sync()' before calling > postcopy_start() in migration_iteration_run(). After flushing the > 'multifd_send' queue, all it does is send > 'RAM_SAVE_FLAG_MULTIFD_FLUSH' flag on the main channel. On the > destination side the 'qemu_loadvm_state_main()' function does not > understand that message, because it looks for 'section_type'; And when > it is not able to identify the section, it leads to an error. > > "Expected vmdescription section, but got %d", section_type(=0)" > > - ie. multifd_ram_flush_and_sync() is not reusable by itself. To > make it work, we need to add a (RAM) section header to the message. > Because then it'll go to ram_load_precopy() and call -> > multifd_recv_sync_main(). > > - But sending a lone RAM section header from > migration_iteration_run() OR even in postcopy_start() does not seem > right. That is out-of-place, because both migration_iteration_run() > and postcopy_start() have nothing to do with RAM sections. > > - I think 'flush' and 'sync' ought to be small individual > functions/operations that are section agnostic. We should be able to > call 'flush' and 'sync' from anywhere in the code without > side-effects. So tying 'flush' and 'sync' with RAM sections does not > seem right, because we need to be able to call 'flush' and 'sync' from > other parts too, like before calling postcopy_start() OR maybe some > other code parts. We need the header. Maybe the easiest as of now is one more hook like qemu_savevm_state_complete_precopy_prepare(), and only use it in RAM as of now. > > - Another observation is: when multifd and postcopy are enabled > together and the guest VM is writing to its RAM, > multifd_ram_flush_and_sync() is called only during setup, not after > that. > ===== > 2025-03-02 18:13:26.425+0000: initiating migration > 2025-03-02T18:13:26.508809Z qemu-system-x86_64: > multifd_ram_flush_and_sync: called > 2025-03-02 18:15:20.070+0000: shutting down, reason=migrated > > 2025-03-03 11:26:41.340+0000: initiating migration > 2025-03-03T11:26:41.415625Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-03 11:30:22.766+0000: shutting down, reason=migrated > ===== > > - If we untie the 'flush and sync' from RAM sections and make it a > general migration command, we shall be able to call it from anywhere > else. > > > If you want to stick with shutdown channels, I'm OK. But then we at least > > need one message to say "the recv side finished joining all recv threads". > > That needs to be an ACK sent from dest to src, not src to dest. > > * But earlier we discussed 'flush and sync' is enough for that, no? Yes it's ok I think, but this patch didn't do that. 
+ multifd_send_flush(); + multifd_send_sync_main(MULTIFD_SYNC_LOCAL); + qemu_savevm_send_multifd_recv_sync(s->to_dst_file); I don't think it sent RAM_SAVE_FLAG_MULTIFD_FLUSH. IIUC you need the complete multifd_ram_flush_and_sync(), and the new message not needed. > Because 'multifd_ram_flush_and_sync' only notifies the destination to > sync multifd_recv threads. It does not receive any acknowledgement > back. If my understanding is correct, we don't need ACK. It will make sure when postcopy starts all pages flushed and consumed on dest. Instead of I prepare the patch and whole commit message, please take your time and think about it, justify it, and if you also think it works put explanation into commit message and then we can go with it. > > * And multifd_recv_sync_main() function on the destination blocks the > 'main' thread until all multfd_recv_threads (mig/dst/recv_x) have > exited, only then it proceeds to accept the incoming new postcopy > connection. I don't think it makes sure threads have exited, AFAIU it does not join the threads, but sync with threads on the per-thread sync messages. Thanks, > > > Thank you. > --- > - Prasad >
Hello Peter, On Mon, 3 Mar 2025 at 20:20, Peter Xu <peterx@redhat.com> wrote: > We need the header. * We need a section type, which is sent by qemu_savevm_command_send() as 'QEMU_VM_COMMAND'. > Maybe the easiest as of now is one more hook like > qemu_savevm_state_complete_precopy_prepare(), and only use it in RAM as of > now. * What will this helper do? > > * But earlier we discussed 'flush and sync' is enough for that, no? > > Yes it's ok I think, but this patch didn't do that. > > + multifd_send_flush(); > + multifd_send_sync_main(MULTIFD_SYNC_LOCAL); > + qemu_savevm_send_multifd_recv_sync(s->to_dst_file); > > I don't think it sent RAM_SAVE_FLAG_MULTIFD_FLUSH. IIUC you need the > complete multifd_ram_flush_and_sync(), and the new message not needed. * If we look at multifd_ram_flush_and_sync(), it does: 1. multifd_send() <= this patch does it via multifd_send_flush() 2. multifd_send_sync_main() <= this patch also calls it above 3. send RAM_SAVE_FLAG_MULTIFD_FLUSH <= this patch sends MIG_CMD_MULTIFD_RECV_SYNC * What is missing? > Instead of I prepare the patch and whole commit message, please take your > time and think about it, justify it, and if you also think it works put > explanation into commit message and then we can go with it. * The commit message does explain about flush and sync and how the migration command helps. What else do we need to add? > > * And multifd_recv_sync_main() function on the destination blocks the > > 'main' thread until all multfd_recv_threads (mig/dst/recv_x) have > > exited, only then it proceeds to accept the incoming new postcopy > > connection. > > I don't think it makes sure threads have exited, * 'multifd_recv_sync_main()' blocks the main thread on 'multifd_recv_state->sem_sync' semaphore. It is increased when multifd_recv threads exit due to the shutdown message. ie. the 'main' thread unblocks when all 'mig/dst/recv_x' threads have exited. Thank you. --- - Prasad
On Tue, Mar 04, 2025 at 01:40:02PM +0530, Prasad Pandit wrote: > Hello Peter, > > On Mon, 3 Mar 2025 at 20:20, Peter Xu <peterx@redhat.com> wrote: > > We need the header. > > * We need a section type, which is sent by qemu_savevm_command_send() > as 'QEMU_VM_COMMAND'. I think we need the header, the ram is a module. > > > Maybe the easiest as of now is one more hook like > > qemu_savevm_state_complete_precopy_prepare(), and only use it in RAM as of > > now. > > * What will this helper do? Do similarly like qemu_savevm_state_complete_precopy_iterable() but do whatever a vmstate hander wants, so it'll be with a header. > > > > * But earlier we discussed 'flush and sync' is enough for that, no? > > > > Yes it's ok I think, but this patch didn't do that. > > > > + multifd_send_flush(); > > + multifd_send_sync_main(MULTIFD_SYNC_LOCAL); > > + qemu_savevm_send_multifd_recv_sync(s->to_dst_file); > > > > I don't think it sent RAM_SAVE_FLAG_MULTIFD_FLUSH. IIUC you need the > > complete multifd_ram_flush_and_sync(), and the new message not needed. > > * If we look at multifd_ram_flush_and_sync(), it does: > 1. multifd_send() <= this patch does it via > multifd_send_flush() > 2. multifd_send_sync_main() <= this patch also calls it above MULTIFD_SYNC_LOCAL will not invoke MULTIFD_FLAG_SYNC, which we need. > 3. send RAM_SAVE_FLAG_MULTIFD_FLUSH <= this patch sends > MIG_CMD_MULTIFD_RECV_SYNC IMO we shouldn't make a global cmd for multifd. > > * What is missing? > > > Instead of I prepare the patch and whole commit message, please take your > > time and think about it, justify it, and if you also think it works put > > explanation into commit message and then we can go with it. > > * The commit message does explain about flush and sync and how the > migration command helps. What else do we need to add? Please consider adding details like "we need message AAA on BBB channel to serialize with CCC" and details. Not asking that as required to merge, but my understanding is that that's what is missing and that's why none of yet versions can make sure of it in code. Maybe that'll help you to understand how that was serialized. > > > > * And multifd_recv_sync_main() function on the destination blocks the > > > 'main' thread until all multfd_recv_threads (mig/dst/recv_x) have > > > exited, only then it proceeds to accept the incoming new postcopy > > > connection. > > > > I don't think it makes sure threads have exited, > > * 'multifd_recv_sync_main()' blocks the main thread on > 'multifd_recv_state->sem_sync' semaphore. It is increased when > multifd_recv threads exit due to the shutdown message. ie. the 'main' > thread unblocks when all 'mig/dst/recv_x' threads have exited. So is it your intention to not send MULTIFD_FLAG_SYNC above? In all cases, I still think that's not the right way to do. Thanks,
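To illustrate Peter's distinction between joining the recv threads and syncing with them on per-thread sync messages, here is a minimal pthread model. It is a sketch of the synchronisation pattern only, not QEMU code: a plain POSIX semaphore stands in for multifd_recv_state->sem_sync and the per-channel work is reduced to a printf.
===
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NUM_CHANNELS 4

/* Counterpart of multifd_recv_state->sem_sync in this toy. */
static sem_t sem_sync;

static void *recv_channel(void *arg)
{
    long id = (long)arg;

    /* ... apply all pages queued before the sync point ... */
    printf("channel %ld: pages before the sync point applied\n", id);

    /* MULTIFD_FLAG_SYNC equivalent: tell main "I am past the sync point".
     * In QEMU the thread then goes back to waiting for more packets; here it
     * simply returns and is joined at cleanup. */
    sem_post(&sem_sync);
    return NULL;
}

int main(void)
{
    pthread_t th[NUM_CHANNELS];

    sem_init(&sem_sync, 0, 0);
    for (long i = 0; i < NUM_CHANNELS; i++) {
        pthread_create(&th[i], NULL, recv_channel, (void *)i);
    }

    /* multifd_recv_sync_main() equivalent: one sem_wait per channel.  The
     * threads are synchronised with, not joined. */
    for (int i = 0; i < NUM_CHANNELS; i++) {
        sem_wait(&sem_sync);
    }
    puts("main: every channel has reached its sync point, safe to proceed");

    for (int i = 0; i < NUM_CHANNELS; i++) {   /* cleanup only */
        pthread_join(th[i], NULL);
    }
    return 0;
}
===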
Hi, On Tue, 4 Mar 2025 at 20:05, Peter Xu <peterx@redhat.com> wrote: > I think we need the header, the ram is a module. > Do similarly like qemu_savevm_state_complete_precopy_iterable() but do > whatever a vmstate hander wants, so it'll be with a header. * I don't fully see yet how this shall work. > Please consider adding details like "we need message AAA on BBB channel to > serialize with CCC" and details. Not asking that as required to merge, but > my understanding is that that's what is missing and that's why none of yet > versions can make sure of it in code. Maybe that'll help you to understand > how that was serialized. * Okay, will try. > > MULTIFD_SYNC_LOCAL will not invoke MULTIFD_FLAG_SYNC, which we need. ... > So is it your intention to not send MULTIFD_FLAG_SYNC above? > In all cases, I still think that's not the right way to do. * It makes little difference; MULTIFD_FLAG_SYNC is also used to increase 'multifd_recv_state->sem_sync' semaphore on the destination side, which then unblocks the 'main' thread waiting on it. === diff --git a/migration/migration.c b/migration/migration.c index 65fc4f5eed..d8c4ea0ad1 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3402,7 +3402,7 @@ static MigIterateState migration_iteration_run(MigrationState *s) && can_switchover && qatomic_read(&s->start_postcopy)) { if (migrate_multifd()) { multifd_send_flush(); - multifd_send_sync_main(MULTIFD_SYNC_LOCAL); + multifd_send_sync_main(MULTIFD_SYNC_ALL); qemu_savevm_send_multifd_recv_sync(s->to_dst_file); multifd_send_shutdown(); } diff --git a/migration/multifd.c b/migration/multifd.c index 8928ca2611..2b5bc2d478 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1265,7 +1265,7 @@ static void *multifd_recv_thread(void *opaque) rcu_unregister_thread(); trace_multifd_recv_thread_end(p->id, p->packets_recved); - qemu_sem_post(&multifd_recv_state->sem_sync); +// qemu_sem_post(&multifd_recv_state->sem_sync); return NULL; } === host-1] 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test OK 159.46s 79 subtests passed host-2] 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test OK 164.55s 79 subtests passed === * I tried the above patch and it also works the same. I'll use this, no issues. Thank you. --- - Prasad
On Wed, Mar 05, 2025 at 04:51:00PM +0530, Prasad Pandit wrote: > Hi, > > On Tue, 4 Mar 2025 at 20:05, Peter Xu <peterx@redhat.com> wrote: > > I think we need the header, the ram is a module. > > Do similarly like qemu_savevm_state_complete_precopy_iterable() but do > > whatever a vmstate hander wants, so it'll be with a header. > > * I don't fully see yet how this shall work. Another option is add another event for precopy_notifier_list, it can be PRECOPY_NOTIFY_SWITCH_POSTCOPY, then make RAM register to it, send the header itself, and do the flush and sync. Let me know if you want me to write the patches. That's almost the only thing left to make it clearer.. > > > Please consider adding details like "we need message AAA on BBB channel to > > serialize with CCC" and details. Not asking that as required to merge, but > > my understanding is that that's what is missing and that's why none of yet > > versions can make sure of it in code. Maybe that'll help you to understand > > how that was serialized. > > * Okay, will try. > > > > > MULTIFD_SYNC_LOCAL will not invoke MULTIFD_FLAG_SYNC, which we need. > ... > > So is it your intention to not send MULTIFD_FLAG_SYNC above? > > In all cases, I still think that's not the right way to do. > > * It makes little difference; MULTIFD_FLAG_SYNC is also used to > increase 'multifd_recv_state->sem_sync' semaphore on the destination > side, which then unblocks the 'main' thread waiting on it. AFAIU that's the whole difference and whole point of doing such.. > === > diff --git a/migration/migration.c b/migration/migration.c > index 65fc4f5eed..d8c4ea0ad1 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -3402,7 +3402,7 @@ static MigIterateState > migration_iteration_run(MigrationState *s) > && can_switchover && qatomic_read(&s->start_postcopy)) { > if (migrate_multifd()) { > multifd_send_flush(); > - multifd_send_sync_main(MULTIFD_SYNC_LOCAL); > + multifd_send_sync_main(MULTIFD_SYNC_ALL); > qemu_savevm_send_multifd_recv_sync(s->to_dst_file); > multifd_send_shutdown(); > } > diff --git a/migration/multifd.c b/migration/multifd.c > index 8928ca2611..2b5bc2d478 100644 > --- a/migration/multifd.c > +++ b/migration/multifd.c > @@ -1265,7 +1265,7 @@ static void *multifd_recv_thread(void *opaque) > > rcu_unregister_thread(); > trace_multifd_recv_thread_end(p->id, p->packets_recved); > - qemu_sem_post(&multifd_recv_state->sem_sync); > +// qemu_sem_post(&multifd_recv_state->sem_sync); > > return NULL; > } > === > host-1] 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test > OK 159.46s 79 subtests passed > host-2] 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test > OK 164.55s 79 subtests passed > === > > * I tried the above patch and it also works the same. I'll use this, no issues. We can't introduce a migration global cmd just to work the same as RAM_SAVE_FLAG_MULTIFD_FLUSH, which is a sub-cmd for a module. > > Thank you. > --- > - Prasad >
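Peter's notifier alternative was not implemented in this thread; the following self-contained sketch only shows the shape of the idea, with a hand-rolled notifier list and made-up names (NOTIFY_SWITCH_POSTCOPY, ram_switch_notify) rather than QEMU's precopy notifier API.
===
#include <stdio.h>

/* Toy notifier list: fire a "switch to postcopy" event and let the RAM
 * module's callback write its own section header and do the multifd
 * flush+sync.  All names here are illustrative. */
enum precopy_event { NOTIFY_SETUP, NOTIFY_SWITCH_POSTCOPY, NOTIFY_CLEANUP };

typedef void (*notifier_fn)(enum precopy_event ev);

#define MAX_NOTIFIERS 8
static notifier_fn notifiers[MAX_NOTIFIERS];
static int num_notifiers;

static void notifier_add(notifier_fn fn)
{
    notifiers[num_notifiers++] = fn;
}

static void notifier_fire(enum precopy_event ev)
{
    for (int i = 0; i < num_notifiers; i++) {
        notifiers[i](ev);
    }
}

/* RAM module: only reacts to the switch-to-postcopy event. */
static void ram_switch_notify(enum precopy_event ev)
{
    if (ev != NOTIFY_SWITCH_POSTCOPY) {
        return;
    }
    puts("ram: write RAM section header on the main channel");
    puts("ram: multifd flush and sync (RAM_SAVE_FLAG_MULTIFD_FLUSH)");
}

int main(void)
{
    notifier_add(ram_switch_notify);        /* done once at RAM setup */

    /* migration core, right before postcopy_start(): */
    notifier_fire(NOTIFY_SWITCH_POSTCOPY);
    puts("migration: proceed with postcopy_start()");
    return 0;
}
===
The thread later settles on a per-handler savevm hook instead, but the two designs solve the same problem: let the RAM module, not the migration core, emit its own header plus flush and sync at the switch point.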
Hello Peter, On Wed, 5 Mar 2025 at 18:24, Peter Xu <peterx@redhat.com> wrote: > > On Tue, 4 Mar 2025 at 20:05, Peter Xu <peterx@redhat.com> wrote: > > > I think we need the header, the ram is a module. > > > Do similarly like qemu_savevm_state_complete_precopy_iterable() but do > > > whatever a vmstate hander wants, so it'll be with a header. > > === diff --git a/migration/migration.c b/migration/migration.c index 65fc4f5eed..da2c49c303 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3401,9 +3401,10 @@ static MigIterateState migration_iteration_run(MigrationState *s) if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover && qatomic_read(&s->start_postcopy)) { if (migrate_multifd()) { - multifd_send_flush(); - multifd_send_sync_main(MULTIFD_SYNC_LOCAL); - qemu_savevm_send_multifd_recv_sync(s->to_dst_file); +/* multifd_send_flush(); + * multifd_send_sync_main(MULTIFD_SYNC_ALL); + * qemu_savevm_send_multifd_recv_sync(s->to_dst_file); + */ + qemu_savevm_state_complete_multifd(s->to_dst_file); multifd_send_shutdown(); } if (postcopy_start(s, &local_err)) { diff --git a/migration/savevm.c b/migration/savevm.c index 0b71e988ba..c2b181b0cc 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1602,6 +1602,37 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) return qemu_fflush(f); } +int qemu_savevm_state_complete_multifd(QEMUFile *f) +{ + int ret; + SaveStateEntry *se; + int64_t start_ts_each, end_ts_each; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (strcmp(se->idstr, "ram")) { + continue; + } + + start_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); + trace_savevm_section_start(se->idstr, se->section_id); + + save_section_header(f, se, QEMU_VM_SECTION_END); + + ret = se->ops->save_live_complete_precopy(f, se->opaque); + trace_savevm_section_end(se->idstr, se->section_id, ret); + save_section_footer(f, se); + if (ret < 0) { + qemu_file_set_error(f, ret); + return -1; + } + end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); + trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id, + end_ts_each - start_ts_each); + } + + return ret; +} + /* Give an estimate of the amount left to be transferred, * the result is split into the amount for units that can and * for units that can't do postcopy. diff --git a/migration/savevm.h b/migration/savevm.h index 91ae703925..e3789984a1 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -70,5 +70,6 @@ int qemu_load_device_state(QEMUFile *f); int qemu_loadvm_approve_switchover(void); int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy); +int qemu_savevm_state_complete_multifd(QEMUFile *f); #endif ==== 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test OK 164.02s 79 subtests passed 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test OK 164.99s 79 subtests passed ==== * Does the above patch look okay? 
==== Guest not dirtying RAM: =================== 2025-03-07 10:57:28.740+0000: initiating migration 2025-03-07T10:57:28.823166Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07T10:58:04.450758Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07T10:58:04.711523Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07 10:58:05.078+0000: shutting down, reason=migrated 2025-03-07 10:59:44.322+0000: initiating migration 2025-03-07T10:59:44.394714Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07T11:00:20.198360Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07T11:00:20.279135Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07 11:00:20.571+0000: shutting down, reason=migrated ==== Guest dirtying RAM: $ dd if=/dev/urandom of=/tmp/random.bin bs=256M count=32 ... ================ 2025-03-07 11:04:00.281+0000: initiating migration 2025-03-07T11:04:00.359426Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07T11:05:51.406988Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07T11:06:56.899920Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07 11:08:02.376+0000: shutting down, reason=migrated 2025-03-07 11:09:19.295+0000: initiating migration 2025-03-07T11:09:19.372012Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07T11:11:24.217610Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07T11:12:35.342709Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called 2025-03-07 11:13:48.947+0000: shutting down, reason=migrated ==== * When a guest is running some workload (dirtying memory), it seems to take a little more time before switching to the Postcopy phase. > Let me know if you want me to write the patches. That's almost the only > thing left to make it clearer.. * If the above patch is not okay, it'll help if you share your version of it. Meanwhile I'll split the patch-2 and re-arrange other patches. Thank you. --- - Prasad
On Fri, Mar 07, 2025 at 05:15:03PM +0530, Prasad Pandit wrote: > Hello Peter, > > On Wed, 5 Mar 2025 at 18:24, Peter Xu <peterx@redhat.com> wrote: > > > On Tue, 4 Mar 2025 at 20:05, Peter Xu <peterx@redhat.com> wrote: > > > > I think we need the header, the ram is a module. > > > > Do similarly like qemu_savevm_state_complete_precopy_iterable() but do [1] > > > > whatever a vmstate hander wants, so it'll be with a header. > > > > === > diff --git a/migration/migration.c b/migration/migration.c > index 65fc4f5eed..da2c49c303 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -3401,9 +3401,10 @@ static MigIterateState > migration_iteration_run(MigrationState *s) > if (!in_postcopy && must_precopy <= s->threshold_size > && can_switchover && qatomic_read(&s->start_postcopy)) { > if (migrate_multifd()) { > - multifd_send_flush(); > - multifd_send_sync_main(MULTIFD_SYNC_LOCAL); > - qemu_savevm_send_multifd_recv_sync(s->to_dst_file); > +/* multifd_send_flush(); > + * multifd_send_sync_main(MULTIFD_SYNC_ALL); > + * qemu_savevm_send_multifd_recv_sync(s->to_dst_file); > + */ > + qemu_savevm_state_complete_multifd(s->to_dst_file); > multifd_send_shutdown(); > } Please move all of them at the entry of postcopy_start(). > if (postcopy_start(s, &local_err)) { > diff --git a/migration/savevm.c b/migration/savevm.c > index 0b71e988ba..c2b181b0cc 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1602,6 +1602,37 @@ int qemu_savevm_state_complete_precopy(QEMUFile > *f, bool iterable_only) > return qemu_fflush(f); > } > > +int qemu_savevm_state_complete_multifd(QEMUFile *f) I still like the name I provided because it's more generic, above [1]. > +{ > + int ret; > + SaveStateEntry *se; > + int64_t start_ts_each, end_ts_each; > + > + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { > + if (strcmp(se->idstr, "ram")) { > + continue; > + } > + > + start_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); > + trace_savevm_section_start(se->idstr, se->section_id); > + > + save_section_header(f, se, QEMU_VM_SECTION_END); Maybe it should be SECTION_PART. So we provide all iterator a chance to do something right before switching to postcopy but before VM stop. RAM can register. > + > + ret = se->ops->save_live_complete_precopy(f, se->opaque); Maybe we don't want to run the whole iteration but only flush. I think for simplicity we can make a new hook. > + trace_savevm_section_end(se->idstr, se->section_id, ret); > + save_section_footer(f, se); > + if (ret < 0) { > + qemu_file_set_error(f, ret); > + return -1; > + } > + end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); > + trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id, > + end_ts_each - start_ts_each); We do not need to account > + } > + > + return ret; > +} > + > /* Give an estimate of the amount left to be transferred, > * the result is split into the amount for units that can and > * for units that can't do postcopy. 
> diff --git a/migration/savevm.h b/migration/savevm.h > index 91ae703925..e3789984a1 100644 > --- a/migration/savevm.h > +++ b/migration/savevm.h > @@ -70,5 +70,6 @@ int qemu_load_device_state(QEMUFile *f); > int qemu_loadvm_approve_switchover(void); > int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > bool in_postcopy); > +int qemu_savevm_state_complete_multifd(QEMUFile *f); > > #endif > ==== > > 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test > OK 164.02s 79 subtests passed > 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test > OK 164.99s 79 subtests passed > ==== > > * Does the above patch look okay? > > ==== > Guest not dirtying RAM: > =================== > 2025-03-07 10:57:28.740+0000: initiating migration > 2025-03-07T10:57:28.823166Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07T10:58:04.450758Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07T10:58:04.711523Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07 10:58:05.078+0000: shutting down, reason=migrated > > 2025-03-07 10:59:44.322+0000: initiating migration > 2025-03-07T10:59:44.394714Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07T11:00:20.198360Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07T11:00:20.279135Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07 11:00:20.571+0000: shutting down, reason=migrated > ==== > > Guest dirtying RAM: $ dd if=/dev/urandom of=/tmp/random.bin bs=256M count=32 ... > ================ > 2025-03-07 11:04:00.281+0000: initiating migration > 2025-03-07T11:04:00.359426Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07T11:05:51.406988Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07T11:06:56.899920Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07 11:08:02.376+0000: shutting down, reason=migrated > > 2025-03-07 11:09:19.295+0000: initiating migration > 2025-03-07T11:09:19.372012Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07T11:11:24.217610Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07T11:12:35.342709Z qemu-system-x86_64: info: > multifd_ram_flush_and_sync: called > 2025-03-07 11:13:48.947+0000: shutting down, reason=migrated > ==== > > * When a guest is running some workload (dirtying memory), it seems to > take a little more time before switching to the Postcopy phase. > > > Let me know if you want me to write the patches. That's almost the only > > thing left to make it clearer.. > > * If the above patch is not okay, it'll help if you share your > version of it. Meanwhile I'll split the patch-2 and re-arrange other > patches. Please see above, if that doesn't help you come up with a final version, I'll write it. Thanks,
On Fri, Mar 07, 2025 at 05:15:03PM +0530, Prasad Pandit wrote: > diff --git a/migration/migration.c b/migration/migration.c > index 65fc4f5eed..da2c49c303 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -3401,9 +3401,10 @@ static MigIterateState > migration_iteration_run(MigrationState *s) > if (!in_postcopy && must_precopy <= s->threshold_size > && can_switchover && qatomic_read(&s->start_postcopy)) { > if (migrate_multifd()) { > - multifd_send_flush(); > - multifd_send_sync_main(MULTIFD_SYNC_LOCAL); > - qemu_savevm_send_multifd_recv_sync(s->to_dst_file); > +/* multifd_send_flush(); > + * multifd_send_sync_main(MULTIFD_SYNC_ALL); > + * qemu_savevm_send_multifd_recv_sync(s->to_dst_file); > + */ > + qemu_savevm_state_complete_multifd(s->to_dst_file); > multifd_send_shutdown(); Forgot to mention one thing: If you do flush and sync, IMHO we can keep the threads there and remove this shutdown, as long as we are sure it'll be properly shutdown when cleanup. With the assertion in dest threads, I think it should be OK.
Hi, On Sat, 8 Mar 2025 at 04:18, Peter Xu <peterx@redhat.com> wrote: > Please move all of them at the entry of postcopy_start(). * Okay. > [1] > > > +int qemu_savevm_state_complete_multifd(QEMUFile *f) > I still like the name I provided because it's more generic, above [1]. === > Maybe the easiest as of now is one more hook like > qemu_savevm_state_complete_precopy_prepare(), === * ie. use -> 'qemu_savevm_state_complete_precopy_prepare' name? Should it be _postcopy_prepare? >> + if (strcmp(se->idstr, "ram")) { >> + continue; >> + } >> + save_section_header(f, se, QEMU_VM_SECTION_END); > > Maybe it should be SECTION_PART. * Will try with SECTION_PART. > So we provide all iterator a chance to > do something right before switching to postcopy but before VM stop. RAM > can register. * ...all iterators a chance to do something? Above function only processes "ram" entries -> strcmp(se->idstr, "ram"). * ...RAM can register? > > + > > + ret = se->ops->save_live_complete_precopy(f, se->opaque); > > Maybe we don't want to run the whole iteration but only flush. I think for > simplicity we can make a new hook. * ..new hook? I tried calling multifd_ram_flush_and_sync() in place of ->save_live_complete_precopy(), but it crashes QEMU. > > + end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); > > + trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id, > > + end_ts_each - start_ts_each); > > We do not need to account > If you do flush and sync, IMHO we can keep the threads there and remove > this shutdown, as long as we are sure it'll be properly shutdown when > cleanup. With the assertion in dest threads, I think it should be OK. * Okay. Thank you. --- - Prasad
Peter Xu <peterx@redhat.com> writes: > On Fri, Mar 07, 2025 at 05:15:03PM +0530, Prasad Pandit wrote: >> diff --git a/migration/migration.c b/migration/migration.c >> index 65fc4f5eed..da2c49c303 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -3401,9 +3401,10 @@ static MigIterateState >> migration_iteration_run(MigrationState *s) >> if (!in_postcopy && must_precopy <= s->threshold_size >> && can_switchover && qatomic_read(&s->start_postcopy)) { >> if (migrate_multifd()) { >> - multifd_send_flush(); >> - multifd_send_sync_main(MULTIFD_SYNC_LOCAL); >> - qemu_savevm_send_multifd_recv_sync(s->to_dst_file); >> +/* multifd_send_flush(); >> + * multifd_send_sync_main(MULTIFD_SYNC_ALL); >> + * qemu_savevm_send_multifd_recv_sync(s->to_dst_file); >> + */ >> + qemu_savevm_state_complete_multifd(s->to_dst_file); >> multifd_send_shutdown(); > > Forgot to mention one thing: > > If you do flush and sync, IMHO we can keep the threads there and remove > this shutdown, as long as we are sure it'll be properly shutdown when > cleanup. > > With the assertion in dest threads, I think it should be OK. Good point. Shutdown at random places makes it difficult to protect against cleanup races.
On Mon, 10 Mar 2025 at 20:09, Fabiano Rosas <farosas@suse.de> wrote: > Good point. Shutdown at random places makes it difficult to protect > against cleanup races. * Just to understand, how and where do these races occur? Thank you. --- - Prasad
Prasad Pandit <ppandit@redhat.com> writes: > On Mon, 10 Mar 2025 at 20:09, Fabiano Rosas <farosas@suse.de> wrote: >> Good point. Shutdown at random places makes it difficult to protect >> against cleanup races. > > * Just to understand, how and where do these races occur? > They occur when cleanup code is allowed to run when resources have not yet been allocated or while the resources are still being accessed. Having the shutdown routine at a single point when it's clear everything else is ready for shutdown helps not only to avoid those issues, but also when investigating them. Otherwise you'll have the same code running at (potentially) completely different points in time and one of those times the system might be in an unexpected state.
Hi, On Tue, 11 Mar 2025 at 01:28, Fabiano Rosas <farosas@suse.de> wrote: > They occur when cleanup code is allowed to run when resources have not > yet been allocated or while the resources are still being accessed. > > Having the shutdown routine at a single point when it's clear everything > else is ready for shutdown helps not only to avoid those issues, but > also when investigating them. Otherwise you'll have the same code > running at (potentially) completely different points in time and one of > those times the system might be in an unexpected state. * I see. It's not clear when this would happen in a production deployment. === if (migrate_multifd()) { multifd_send_shutdown(); <= [1] } postcopy_start(...) <= [2] === * There was another scenario regarding multifd shutdown as: the EOF or shutdown message sent via [1] on each multifd connection reaches the destination _later_ than the Postcopy phase start via [2]. And this may result in multifd_recv_threads on the destination overwriting memory pages written by postcopy thread, thus corrupting the guest state. * Do we have any bugs/issues where these above things happened? To see the real circumstances under which it happened? * There seems to be some disconnect between the kind of scenarios we are considering and the minimal requirements for live migrations: a stable network with real good bandwidth. If we test live migration with guest dirtying RAM to the tune of 64M/128M/256M/512M bytes, that assumes/implies that the network bandwidth is much more than 512Mbps, no? Thank you. --- - Prasad
Prasad Pandit <ppandit@redhat.com> writes: > Hi, > > On Tue, 11 Mar 2025 at 01:28, Fabiano Rosas <farosas@suse.de> wrote: >> They occur when cleanup code is allowed to run when resources have not >> yet been allocated or while the resources are still being accessed. >> >> Having the shutdown routine at a single point when it's clear everything >> else is ready for shutdown helps not only to avoid those issues, but >> also when investigating them. Otherwise you'll have the same code >> running at (potentially) completely different points in time and one of >> those times the system might be in an unexpected state. > > * I see. It's not clear when this would happen in a production deployment. > === > if (migrate_multifd()) { > multifd_send_shutdown(); <= [1] > } > > postcopy_start(...) <= [2] > === > > * There was another scenario regarding multifd shutdown as: the EOF or > shutdown message sent via [1] on each multifd connection reaches the > destination _later_ than the Postcopy phase start via [2]. And this > may result in multifd_recv_threads on the destination overwriting > memory pages written by postcopy thread, thus corrupting the guest > state. Isn't that the point? To add a sync for this which would allow the shutdown to not be added? > > * Do we have any bugs/issues where these above things happened? To see > the real circumstances under which it happened? > We do. They don't come with a description of the circumstances. You're lucky if you get a coredump. You can peruse `git log migration/multifd`, I'd say most of the work in the recent years has been solving concurrency issues. > * There seems to be some disconnect between the kind of scenarios we > are considering and the minimal requirements for live migrations: a > stable network with real good bandwidth. There's no such requirement. Besides, the topic is not failed migrations due to lack of resources. We're talking about correctness issues that are hard to spot. Those should always be fixed when found, independently of what the production environment is expected to be.
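As a generic illustration of the kind of cleanup race being discussed (not an actual QEMU bug), the toy below has a teardown path that could free a buffer a recv worker is still writing; gating teardown on a single, well-ordered completion point removes the window.
===
#include <pthread.h>
#include <semaphore.h>
#include <stdlib.h>
#include <stdio.h>

static char *shared_buf;
static sem_t worker_done;

static void *recv_worker(void *arg)
{
    (void)arg;
    for (int i = 0; i < 1000; i++) {
        shared_buf[i % 64] = (char)i;     /* still using the buffer */
    }
    sem_post(&worker_done);               /* announce "I am finished with it" */
    return NULL;
}

static void cleanup(void)
{
    /* If this ran at some random earlier point -- say from a second shutdown
     * path -- it could free shared_buf while recv_worker still writes to it. */
    sem_wait(&worker_done);               /* single ordered shutdown point */
    free(shared_buf);
    shared_buf = NULL;
    puts("cleanup: buffer released only after the worker finished");
}

int main(void)
{
    pthread_t th;

    shared_buf = malloc(64);
    sem_init(&worker_done, 0, 0);
    pthread_create(&th, NULL, recv_worker, NULL);
    cleanup();
    pthread_join(th, NULL);
    return 0;
}
===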
Hello Peter, On Sat, 8 Mar 2025 at 04:18, Peter Xu <peterx@redhat.com> wrote: > [1] > Please move all of them at the entry of postcopy_start(). > I still like the name I provided because it's more generic, above [1]. > Maybe it should be SECTION_PART. So we provide all iterator a chance to > do something right before switching to postcopy but before VM stop. RAM > can register. > Maybe we don't want to run the whole iteration but only flush. I think for > simplicity we can make a new hook. > We do not need to account > Please see above, if that doesn't help you come up with a final version, > I'll write it. === diff --git a/migration/migration.c b/migration/migration.c index f97bb2777f..9ef8e8ee1d 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2700,6 +2700,10 @@ static int postcopy_start(MigrationState *ms, Error **errp) QIOChannelBuffer *bioc; QEMUFile *fb; + if (migrate_multifd()) { + qemu_savevm_state_postcopy_prepare(ms->to_dst_file); + } + /* * Now we're 100% sure to switch to postcopy, so JSON writer won't be * useful anymore. Free the resources early if it is there. Clearing diff --git a/migration/ram.c b/migration/ram.c index 6fd88cbf2a..13a8c63660 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1143,6 +1143,15 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, return len; } +int ram_save_zero_page(QEMUFile *f, void *opaque) +{ + RAMState *rs = *(RAMState **)opaque; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + + return save_zero_page(rs, pss, offset); +} + /* * @pages: the number of pages written by the control path, * < 0 - error diff --git a/migration/ram.h b/migration/ram.h index 921c39a2c5..425c9ad9fd 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -122,4 +122,5 @@ void ram_write_tracking_prepare(void); int ram_write_tracking_start(void); void ram_write_tracking_stop(void); +int ram_save_zero_page(QEMUFile *f, void *opaque); #endif diff --git a/migration/savevm.c b/migration/savevm.c index ce158c3512..9bb0f03942 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1676,6 +1676,36 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) return qemu_fflush(f); } +int qemu_savevm_state_postcopy_prepare(QEMUFile *f) +{ + int ret = 0; + SaveStateEntry *se; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (strcmp(se->idstr, "ram")) { + continue; + } + + save_section_header(f, se, QEMU_VM_SECTION_PART); + + ram_save_zero_page(f, se->opaque); + if ((ret = multifd_ram_flush_and_sync(f)) < 0) { + return ret; + } + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + + trace_savevm_section_end(se->idstr, se->section_id, ret); + save_section_footer(f, se); + if (ret < 0) { + qemu_file_set_error(f, ret); + return -1; + } + } + + return ret; +} + + /* Give an estimate of the amount left to be transferred, * the result is split into the amount for units that can and * for units that can't do postcopy. 
diff --git a/migration/savevm.h b/migration/savevm.h index 138c39a7f9..7d4ff25ca3 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -74,4 +74,5 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id, char *buf, size_t len, Error **errp); +int qemu_savevm_state_postcopy_prepare(QEMUFile *f); #endif === host-1] 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test OK 139.87s 79 subtests passed host-2] 63/63 qemu:qtest+qtest-x86_64 / qtest-x86_64/migration-test OK 137.52s 79 subtests passed 2025-03-13 12:11:01.653+0000: initiating migration 2025-03-13T12:11:01.690247Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called: 1, 0 2025-03-13T12:12:05.342194Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called: 1, 0 2025-03-13 12:12:54.688+0000: shutting down, reason=migrated 2025-03-13 12:21:10.967+0000: initiating migration 2025-03-13T12:21:10.994878Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called: 1, 0 2025-03-13T12:22:15.955266Z qemu-system-x86_64: info: multifd_ram_flush_and_sync: called: 1, 0 2025-03-13 12:23:02.107+0000: shutting down, reason=migrated === * Does this patch look okay? Thank you. --- - Prasad
On Thu, Mar 13, 2025 at 06:13:24PM +0530, Prasad Pandit wrote: > +int qemu_savevm_state_postcopy_prepare(QEMUFile *f) > +{ > + int ret = 0; > + SaveStateEntry *se; > + > + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { > + if (strcmp(se->idstr, "ram")) { > + continue; > + } > + > + save_section_header(f, se, QEMU_VM_SECTION_PART); > + > + ram_save_zero_page(f, se->opaque); I'll stop requesting a why here... but I think this is another example that even if all the tests pass it may not be correct. > + if ((ret = multifd_ram_flush_and_sync(f)) < 0) { > + return ret; > + } > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); > + > + trace_savevm_section_end(se->idstr, se->section_id, ret); > + save_section_footer(f, se); > + if (ret < 0) { > + qemu_file_set_error(f, ret); > + return -1; > + } > + } > + > + return ret; > +} [...] > * Does this patch look okay? I've written the relevant patches below, please review and take them if you agree with the changes. Thanks, ===8<=== From f9343dfc777ef04168443e86a1fa3922296ea563 Mon Sep 17 00:00:00 2001 From: Peter Xu <peterx@redhat.com> Date: Thu, 13 Mar 2025 15:34:10 -0400 Subject: [PATCH 1/2] migration: Add save_postcopy_prepare() savevm handler Add a savevm handler for a module to opt-in sending extra sections right before postcopy starts, and before VM is stopped. RAM will start to use this new savevm handler in the next patch to do flush and sync for multifd pages. Note that we choose to do it before VM stopped because the current only potential user is not sensitive to VM status, so doing it before VM is stopped is preferred to enlarge any postcopy downtime. It is still a bit unfortunate that we need to introduce such a new savevm handler just for the only use case, however it's so far the cleanest. Signed-off-by: Peter Xu <peterx@redhat.com> --- include/migration/register.h | 15 +++++++++++++++ migration/savevm.h | 1 + migration/migration.c | 4 ++++ migration/savevm.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+) diff --git a/include/migration/register.h b/include/migration/register.h index c041ce32f2..b79dc81b8d 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -189,6 +189,21 @@ typedef struct SaveVMHandlers { /* This runs outside the BQL! */ + /** + * @save_postcopy_prepare + * + * This hook will be invoked on the source side right before switching + * to postcopy (before VM stopped). + * + * @f: QEMUFile where to send the data + * @opaque: Data pointer passed to register_savevm_live() + * @errp: Error** used to report error message + * + * Returns: true if succeeded, false if error occured. When false is + * returned, @errp must be set. 
+ */ + bool (*save_postcopy_prepare)(QEMUFile *f, void *opaque, Error **errp); + /** * @state_pending_estimate * diff --git a/migration/savevm.h b/migration/savevm.h index 138c39a7f9..2d5e9c7166 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -45,6 +45,7 @@ void qemu_savevm_state_pending_exact(uint64_t *must_precopy, void qemu_savevm_state_pending_estimate(uint64_t *must_precopy, uint64_t *can_postcopy); int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy); +bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp); void qemu_savevm_send_ping(QEMUFile *f, uint32_t value); void qemu_savevm_send_open_return_path(QEMUFile *f); int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len); diff --git a/migration/migration.c b/migration/migration.c index d46e776e24..212f6b4145 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2707,6 +2707,10 @@ static int postcopy_start(MigrationState *ms, Error **errp) } } + if (!qemu_savevm_state_postcopy_prepare(ms->to_dst_file, errp)) { + return -1; + } + trace_postcopy_start(); bql_lock(); trace_postcopy_start_set_run(); diff --git a/migration/savevm.c b/migration/savevm.c index ce158c3512..23ef4c7dc9 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1523,6 +1523,39 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f) qemu_fflush(f); } +bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp) +{ + SaveStateEntry *se; + bool ret; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (!se->ops || !se->ops->save_postcopy_prepare) { + continue; + } + + if (se->ops->is_active) { + if (!se->ops->is_active(se->opaque)) { + continue; + } + } + + trace_savevm_section_start(se->idstr, se->section_id); + + save_section_header(f, se, QEMU_VM_SECTION_PART); + ret = se->ops->save_postcopy_prepare(f, se->opaque, errp); + save_section_footer(f, se); + + trace_savevm_section_end(se->idstr, se->section_id, ret); + + if (!ret) { + assert(*errp); + return false; + } + } + + return true; +} + int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) { int64_t start_ts_each, end_ts_each;
Hi,

On Fri, 14 Mar 2025 at 01:40, Peter Xu <peterx@redhat.com> wrote:
> +        save_section_header(f, se, QEMU_VM_SECTION_PART);
> +        ram_save_zero_page(f, se->opaque);
> I'll stop requesting a why here...

* Earlier in this thread you mentioned 'We need a header'. I took it
as a 'RAM page' header, not save_section_header(). Section type
(QEMU_VM_COMMAND) was sent by qemu_savevm_command_send() as well.

> but I think this is another example that even if all the tests pass
> it may not be correct.

* This is also an example of: communication is hard.

> From f9343dfc777ef04168443e86a1fa3922296ea563 Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Thu, 13 Mar 2025 15:34:10 -0400
> Subject: [PATCH 1/2] migration: Add save_postcopy_prepare() savevm handler
>
> Add a savevm handler for a module to opt in to sending extra sections
> right before postcopy starts, and before the VM is stopped.
>
> RAM will start to use this new savevm handler in the next patch to do
> flush and sync for multifd pages.
>
> Note that we choose to do it before the VM is stopped because the only
> potential user so far is not sensitive to VM status, and doing it before
> the VM is stopped avoids enlarging the postcopy downtime.
>
> It is still a bit unfortunate that we need to introduce such a new savevm
> handler just for this one use case, however it is so far the cleanest.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/migration/register.h | 15 +++++++++++++++
>  migration/savevm.h           |  1 +
>  migration/migration.c        |  4 ++++
>  migration/savevm.c           | 33 +++++++++++++++++++++++++++++++++
>  4 files changed, 53 insertions(+)
>
> diff --git a/include/migration/register.h b/include/migration/register.h
> index c041ce32f2..b79dc81b8d 100644
> --- a/include/migration/register.h
> +++ b/include/migration/register.h
> @@ -189,6 +189,21 @@ typedef struct SaveVMHandlers {
>
>      /* This runs outside the BQL! */
>
> +    /**
> +     * @save_postcopy_prepare
> +     *
> +     * This hook will be invoked on the source side right before switching
> +     * to postcopy (before the VM is stopped).
> +     *
> +     * @f: QEMUFile where to send the data
> +     * @opaque: Data pointer passed to register_savevm_live()
> +     * @errp: Error** used to report error messages
> +     *
> +     * Returns: true if succeeded, false if an error occurred. When false
> +     * is returned, @errp must be set.
> +     */
> +    bool (*save_postcopy_prepare)(QEMUFile *f, void *opaque, Error **errp);
> +
>      /**
>       * @state_pending_estimate
>       *
> diff --git a/migration/savevm.h b/migration/savevm.h
> index 138c39a7f9..2d5e9c7166 100644
> --- a/migration/savevm.h
> +++ b/migration/savevm.h
> @@ -45,6 +45,7 @@ void qemu_savevm_state_pending_exact(uint64_t *must_precopy,
>  void qemu_savevm_state_pending_estimate(uint64_t *must_precopy,
>                                          uint64_t *can_postcopy);
>  int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
> +bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
>  void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
>  void qemu_savevm_send_open_return_path(QEMUFile *f);
>  int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
> diff --git a/migration/migration.c b/migration/migration.c
> index d46e776e24..212f6b4145 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2707,6 +2707,10 @@ static int postcopy_start(MigrationState *ms, Error **errp)
>          }
>      }
>
> +    if (!qemu_savevm_state_postcopy_prepare(ms->to_dst_file, errp)) {
> +        return -1;
> +    }
> +
>      trace_postcopy_start();
>      bql_lock();
>      trace_postcopy_start_set_run();
> diff --git a/migration/savevm.c b/migration/savevm.c
> index ce158c3512..23ef4c7dc9 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1523,6 +1523,39 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f)
>      qemu_fflush(f);
>  }
>
> +bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp)
> +{
> +    SaveStateEntry *se;
> +    bool ret;
> +
> +    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +        if (!se->ops || !se->ops->save_postcopy_prepare) {
> +            continue;
> +        }
> +
> +        if (se->ops->is_active) {
> +            if (!se->ops->is_active(se->opaque)) {
> +                continue;
> +            }
> +        }
> +
> +        trace_savevm_section_start(se->idstr, se->section_id);
> +
> +        save_section_header(f, se, QEMU_VM_SECTION_PART);
> +        ret = se->ops->save_postcopy_prepare(f, se->opaque, errp);
> +        save_section_footer(f, se);
> +
> +        trace_savevm_section_end(se->idstr, se->section_id, ret);
> +
> +        if (!ret) {
> +            assert(*errp);
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> +
>  int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
>  {
>      int64_t start_ts_each, end_ts_each;
> --
> 2.47.0
>
>
> From 299e1cdd9b28802f361ed012673825685e30f965 Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Thu, 13 Mar 2025 15:56:01 -0400
> Subject: [PATCH 2/2] migration/ram: Implement save_postcopy_prepare()
>
> Implement save_postcopy_prepare(), preparing for the enablement of both
> multifd and postcopy.
>
> Please see the rich comment for the rationale.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/ram.c | 37 +++++++++++++++++++++++++++++++++
>  1 file changed, 37 insertions(+)
>
> diff --git a/migration/ram.c b/migration/ram.c
> index 424df6d9f1..119e7d3ac2 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -4420,6 +4420,42 @@ static int ram_resume_prepare(MigrationState *s, void *opaque)
>      return 0;
>  }
>
> +static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp)
> +{
> +    int ret;
> +
> +    if (migrate_multifd()) {
> +        /*
> +         * When multifd is enabled, source QEMU needs to make sure all the
> +         * pages queued before postcopy starts are flushed.
> +         *
> +         * Meanwhile, the load of these pages must happen before switching
> +         * to postcopy.  This is because loading of guest pages (so far) in
> +         * multifd recv threads is still non-atomic, so the load cannot
> +         * happen while vCPUs are running on the destination side.
> +         *
> +         * This flush and sync will guarantee those pages are loaded _before_
> +         * postcopy starts on the destination.  The rationale is that this
> +         * happens before the VM stops (and before source QEMU sends all the
> +         * rest of the postcopy messages).  So when the destination QEMU
> +         * receives the postcopy messages, it must have received the sync
> +         * message on the main channel (either RAM_SAVE_FLAG_MULTIFD_FLUSH or
> +         * RAM_SAVE_FLAG_EOS), and such a message guarantees that all the
> +         * previous guest pages queued in the multifd channels have been
> +         * completely loaded.
> +         */
> +        ret = multifd_ram_flush_and_sync(f);
> +        if (ret < 0) {
> +            error_setg(errp, "%s: multifd flush and sync failed", __func__);
> +            return false;
> +        }
> +    }
> +
> +    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> +
> +    return true;
> +}
> +
>  void postcopy_preempt_shutdown_file(MigrationState *s)
>  {
>      qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
> @@ -4439,6 +4475,7 @@ static SaveVMHandlers savevm_ram_handlers = {
>      .load_setup = ram_load_setup,
>      .load_cleanup = ram_load_cleanup,
>      .resume_prepare = ram_resume_prepare,
> +    .save_postcopy_prepare = ram_save_postcopy_prepare,
>  };
>
>  static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
> --
> 2.47.0

* I follow the infrastructural changes: they make it possible to take
'section'-specific action before postcopy starts. It is not clear how
tying the flush and sync to a RAM section helps, because on the
destination side the 'section' is only used to call
se->ops->load_state() -> ram_load() -> ram_load_precopy() ->
multifd_recv_sync_main().

* To confirm: the benefit of this approach is that the 'flush and sync'
works via the vmstate_load -> se->ops->load_state() -> ram_load() ->
ram_load_precopy() sequence?

* Thank you for the patches, I'll send a revised patchset including them.

Thank you.
---
  - Prasad
On Mon, Mar 17, 2025 at 06:00:14PM +0530, Prasad Pandit wrote:
> Hi,
>
> On Fri, 14 Mar 2025 at 01:40, Peter Xu <peterx@redhat.com> wrote:
> > +        save_section_header(f, se, QEMU_VM_SECTION_PART);
> > +        ram_save_zero_page(f, se->opaque);
> > I'll stop requesting a why here...
>
> * Earlier in this thread you mentioned 'We need a header'. I took it
> as a 'RAM page' header, not save_section_header(). Section type

The question is not about the header, it's about the zero page, and why
we send this zero page out of the blue.  IIUC it can corrupt a page if it
uses whatever was cached in the previous round in the pss, and if that
page used to be non-zero.

> (QEMU_VM_COMMAND) was sent by qemu_savevm_command_send() as well.

But that command should be reserved for generic VM operations.  We have a
few outliers, but they are very special cases.  For this case we'd better
not add more outliers, because it is neither special nor necessary.

Thanks,
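As a side note on the corruption concern above, here is a minimal,
hypothetical sketch of what a zero-page record does on the destination.
The helper name and signature are made up for illustration (the real
handling sits in ram_load_precopy()); the point is that the page is
zero-filled unconditionally, so a stray record generated from stale pss
state would wipe a page whose latest content is non-zero.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Sketch of the destination-side handling of a zero-page record.  The
 * source sends only a filler byte for such a record; the destination
 * then zero-fills the whole page, no matter what it held before.
 */
static int handle_zero_record_sketch(void *host_page, size_t page_size,
                                     uint8_t filler)
{
    if (filler != 0) {
        return -1;                       /* stream error: only 0 is accepted */
    }
    memset(host_page, 0, page_size);     /* previous content is gone */
    return 0;
}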