Message ID | 1707418446-134863-11-git-send-email-steven.sistare@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | allow cpr-reboot for vfio | expand |
On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote: > When migration for cpr is initiated, stop the vm and set state > RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the > possibility of ram and device state being out of sync, and guarantees > that a guest in the suspended state remains suspended, because qmp_cont > rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > --- > include/migration/misc.h | 1 + > migration/migration.c | 32 +++++++++++++++++++++++++------- > 2 files changed, 26 insertions(+), 7 deletions(-) > > diff --git a/include/migration/misc.h b/include/migration/misc.h > index 6dc234b..54c99a3 100644 > --- a/include/migration/misc.h > +++ b/include/migration/misc.h > @@ -60,6 +60,7 @@ void migration_object_init(void); > void migration_shutdown(void); > bool migration_is_idle(void); > bool migration_is_active(MigrationState *); > +bool migrate_mode_is_cpr(MigrationState *); > > typedef enum MigrationEventType { > MIG_EVENT_PRECOPY_SETUP, > diff --git a/migration/migration.c b/migration/migration.c > index d1fce9e..fc5c587 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s) > s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > } > > +bool migrate_mode_is_cpr(MigrationState *s) > +{ > + return s->parameters.mode == MIG_MODE_CPR_REBOOT; > +} > + > int migrate_init(MigrationState *s, Error **errp) > { > int ret; > @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s, > bql_lock(); > migration_downtime_start(s); > > - s->vm_old_state = runstate_get(); > - global_state_store(); > - > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > - trace_migration_completion_vm_stop(ret); > - if (ret < 0) { > - goto out_unlock; > + if (!migrate_mode_is_cpr(s)) { > + s->vm_old_state = runstate_get(); > + global_state_store(); > + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > + trace_migration_completion_vm_stop(ret); > + if (ret < 0) { > + goto out_unlock; > + } > } > > ret = migration_maybe_pause(s, current_active_state, > @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > Error *local_err = NULL; > uint64_t rate_limit; > bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; > + int ret; > > /* > * If there's a previous error, free it and prepare for another one. > @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > goto fail; > } > > + if (migrate_mode_is_cpr(s)) { > + s->vm_old_state = runstate_get(); > + global_state_store(); > + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > + trace_migration_completion_vm_stop(ret); > + if (ret < 0) { > + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); > + goto fail; > + } > + } Could we have a helper function for the shared codes? How about postcopy? I know it's nonsense to enable postcopy for cpr.. but iiuc we don't yet forbid an user doing so. Maybe we should? > + > if (migrate_background_snapshot()) { > qemu_thread_create(&s->thread, "bg_snapshot", > bg_migration_thread, s, QEMU_THREAD_JOINABLE); > -- > 1.8.3.1 >
On 2/20/2024 2:33 AM, Peter Xu wrote: > On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote: >> When migration for cpr is initiated, stop the vm and set state >> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the >> possibility of ram and device state being out of sync, and guarantees >> that a guest in the suspended state remains suspended, because qmp_cont >> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. >> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >> --- >> include/migration/misc.h | 1 + >> migration/migration.c | 32 +++++++++++++++++++++++++------- >> 2 files changed, 26 insertions(+), 7 deletions(-) >> >> diff --git a/include/migration/misc.h b/include/migration/misc.h >> index 6dc234b..54c99a3 100644 >> --- a/include/migration/misc.h >> +++ b/include/migration/misc.h >> @@ -60,6 +60,7 @@ void migration_object_init(void); >> void migration_shutdown(void); >> bool migration_is_idle(void); >> bool migration_is_active(MigrationState *); >> +bool migrate_mode_is_cpr(MigrationState *); >> >> typedef enum MigrationEventType { >> MIG_EVENT_PRECOPY_SETUP, >> diff --git a/migration/migration.c b/migration/migration.c >> index d1fce9e..fc5c587 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s) >> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); >> } >> >> +bool migrate_mode_is_cpr(MigrationState *s) >> +{ >> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; >> +} >> + >> int migrate_init(MigrationState *s, Error **errp) >> { >> int ret; >> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s, >> bql_lock(); >> migration_downtime_start(s); >> >> - s->vm_old_state = runstate_get(); >> - global_state_store(); >> - >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> - trace_migration_completion_vm_stop(ret); >> - if (ret < 0) { >> - goto out_unlock; >> + if (!migrate_mode_is_cpr(s)) { >> + s->vm_old_state = runstate_get(); >> + global_state_store(); >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> + trace_migration_completion_vm_stop(ret); >> + if (ret < 0) { >> + goto out_unlock; >> + } >> } >> >> ret = migration_maybe_pause(s, current_active_state, >> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> Error *local_err = NULL; >> uint64_t rate_limit; >> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; >> + int ret; >> >> /* >> * If there's a previous error, free it and prepare for another one. >> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> goto fail; >> } >> >> + if (migrate_mode_is_cpr(s)) { >> + s->vm_old_state = runstate_get(); >> + global_state_store(); >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> + trace_migration_completion_vm_stop(ret); >> + if (ret < 0) { >> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); >> + goto fail; >> + } >> + } > > Could we have a helper function for the shared codes? Will do. > How about postcopy? I know it's nonsense to enable postcopy for cpr.. but > iiuc we don't yet forbid an user doing so. Maybe we should? I will assert that mode != cpr in the postcopy path instead, to prevent any nonsense. - Steve >> + >> if (migrate_background_snapshot()) { >> qemu_thread_create(&s->thread, "bg_snapshot", >> bg_migration_thread, s, QEMU_THREAD_JOINABLE); >> -- >> 1.8.3.1 >> >
On 2/20/2024 2:33 AM, Peter Xu wrote: > On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote: >> When migration for cpr is initiated, stop the vm and set state >> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the >> possibility of ram and device state being out of sync, and guarantees >> that a guest in the suspended state remains suspended, because qmp_cont >> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. >> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >> --- >> include/migration/misc.h | 1 + >> migration/migration.c | 32 +++++++++++++++++++++++++------- >> 2 files changed, 26 insertions(+), 7 deletions(-) >> >> diff --git a/include/migration/misc.h b/include/migration/misc.h >> index 6dc234b..54c99a3 100644 >> --- a/include/migration/misc.h >> +++ b/include/migration/misc.h >> @@ -60,6 +60,7 @@ void migration_object_init(void); >> void migration_shutdown(void); >> bool migration_is_idle(void); >> bool migration_is_active(MigrationState *); >> +bool migrate_mode_is_cpr(MigrationState *); >> >> typedef enum MigrationEventType { >> MIG_EVENT_PRECOPY_SETUP, >> diff --git a/migration/migration.c b/migration/migration.c >> index d1fce9e..fc5c587 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s) >> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); >> } >> >> +bool migrate_mode_is_cpr(MigrationState *s) >> +{ >> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; >> +} >> + >> int migrate_init(MigrationState *s, Error **errp) >> { >> int ret; >> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s, >> bql_lock(); >> migration_downtime_start(s); >> >> - s->vm_old_state = runstate_get(); >> - global_state_store(); >> - >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> - trace_migration_completion_vm_stop(ret); >> - if (ret < 0) { >> - goto out_unlock; >> + if (!migrate_mode_is_cpr(s)) { >> + s->vm_old_state = runstate_get(); >> + global_state_store(); >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> + trace_migration_completion_vm_stop(ret); >> + if (ret < 0) { >> + goto out_unlock; >> + } >> } >> >> ret = migration_maybe_pause(s, current_active_state, >> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> Error *local_err = NULL; >> uint64_t rate_limit; >> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; >> + int ret; >> >> /* >> * If there's a previous error, free it and prepare for another one. >> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> goto fail; >> } >> >> + if (migrate_mode_is_cpr(s)) { >> + s->vm_old_state = runstate_get(); >> + global_state_store(); >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> + trace_migration_completion_vm_stop(ret); >> + if (ret < 0) { >> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); >> + goto fail; >> + } >> + } > > Could we have a helper function for the shared codes? I propose to add code to migration_stop_vm to make it the helper. Some call sites emit more traces (via migration_stop_vm) as a result of my refactoring, and postcopy start sets vm_old_state, which is not used thereafter in that path. Those changes seem harmless to me. Tell me what you think: ------------------------------------------------------- diff --git a/migration/migration.c b/migration/migration.c index fc5c587..30d2b08 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -107,12 +107,6 @@ static int migration_maybe_pause(MigrationState *s, static void migrate_fd_cancel(MigrationState *s); static bool close_return_path_on_source(MigrationState *s); -static void migration_downtime_start(MigrationState *s) -{ - trace_vmstate_downtime_checkpoint("src-downtime-start"); - s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); -} - static void migration_downtime_end(MigrationState *s) { int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); @@ -161,11 +155,20 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpo return (a > b) - (a < b); } -int migration_stop_vm(RunState state) +static int migration_stop_vm(MigrationState *s, RunState state) { - int ret = vm_stop_force_state(state); + int ret; + + trace_vmstate_downtime_checkpoint("src-downtime-start"); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + s->vm_old_state = runstate_get(); + global_state_store(); + + ret = vm_stop_force_state(state); trace_vmstate_downtime_checkpoint("src-vm-stopped"); + trace_migration_completion_vm_stop(ret); return ret; } @@ -2454,10 +2457,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) bql_lock(); trace_postcopy_start_set_run(); - migration_downtime_start(ms); - - global_state_store(); - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE); if (ret < 0) { goto fail; } @@ -2654,13 +2654,9 @@ static int migration_completion_precopy(MigrationState *s, int ret; bql_lock(); - migration_downtime_start(s); if (!migrate_mode_is_cpr(s)) { - s->vm_old_state = runstate_get(); - global_state_store(); - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); - trace_migration_completion_vm_stop(ret); + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); if (ret < 0) { goto out_unlock; } @@ -3498,15 +3494,10 @@ static void *bg_migration_thread(void *opaque) s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; trace_migration_thread_setup_complete(); - migration_downtime_start(s); bql_lock(); - s->vm_old_state = runstate_get(); - - global_state_store(); - /* Forcibly stop VM before saving state of vCPUs and devices */ - if (migration_stop_vm(RUN_STATE_PAUSED)) { + if (migration_stop_vm(s, RUN_STATE_PAUSED)) { goto fail; } /* @@ -3659,10 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) } if (migrate_mode_is_cpr(s)) { - s->vm_old_state = runstate_get(); - global_state_store(); - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); - trace_migration_completion_vm_stop(ret); + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); if (ret < 0) { error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); goto fail; diff --git a/migration/migration.h b/migration/migration.h index aef8afb..65c0b61 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s); */ void migration_rp_kick(MigrationState *s); -int migration_stop_vm(RunState state); - #endif ------------------------------------------------------- - Steve
On 2/20/2024 2:33 AM, Peter Xu wrote: > On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote: >> When migration for cpr is initiated, stop the vm and set state >> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the >> possibility of ram and device state being out of sync, and guarantees >> that a guest in the suspended state remains suspended, because qmp_cont >> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. >> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >> --- >> include/migration/misc.h | 1 + >> migration/migration.c | 32 +++++++++++++++++++++++++------- >> 2 files changed, 26 insertions(+), 7 deletions(-) >> >> diff --git a/include/migration/misc.h b/include/migration/misc.h >> index 6dc234b..54c99a3 100644 >> --- a/include/migration/misc.h >> +++ b/include/migration/misc.h >> @@ -60,6 +60,7 @@ void migration_object_init(void); >> void migration_shutdown(void); >> bool migration_is_idle(void); >> bool migration_is_active(MigrationState *); >> +bool migrate_mode_is_cpr(MigrationState *); >> >> typedef enum MigrationEventType { >> MIG_EVENT_PRECOPY_SETUP, >> diff --git a/migration/migration.c b/migration/migration.c >> index d1fce9e..fc5c587 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s) >> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); >> } >> >> +bool migrate_mode_is_cpr(MigrationState *s) >> +{ >> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; >> +} >> + >> int migrate_init(MigrationState *s, Error **errp) >> { >> int ret; >> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s, >> bql_lock(); >> migration_downtime_start(s); >> >> - s->vm_old_state = runstate_get(); >> - global_state_store(); >> - >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> - trace_migration_completion_vm_stop(ret); >> - if (ret < 0) { >> - goto out_unlock; >> + if (!migrate_mode_is_cpr(s)) { >> + s->vm_old_state = runstate_get(); >> + global_state_store(); >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> + trace_migration_completion_vm_stop(ret); >> + if (ret < 0) { >> + goto out_unlock; >> + } >> } >> >> ret = migration_maybe_pause(s, current_active_state, >> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> Error *local_err = NULL; >> uint64_t rate_limit; >> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; >> + int ret; >> >> /* >> * If there's a previous error, free it and prepare for another one. >> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> goto fail; >> } >> >> + if (migrate_mode_is_cpr(s)) { >> + s->vm_old_state = runstate_get(); >> + global_state_store(); >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >> + trace_migration_completion_vm_stop(ret); >> + if (ret < 0) { >> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); >> + goto fail; >> + } >> + } > > Could we have a helper function for the shared codes? > > How about postcopy? I know it's nonsense to enable postcopy for cpr.. but > iiuc we don't yet forbid an user doing so. Maybe we should? How about this? ------------------------------------------- @@ -3600,6 +3600,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } + if (migrate_mode_is_cpr(s) && migrate_postcopy()) { + error_setg(&local_err, "cannot mix postcopy and cpr"); + goto fail; + } + if (resume) { /* This is a resumed migration */ rate_limit = migrate_max_postcopy_bandwidth(); ------------------------------------------------ - Steve
On Wed, Feb 21, 2024 at 04:23:07PM -0500, Steven Sistare wrote: > > How about postcopy? I know it's nonsense to enable postcopy for cpr.. but > > iiuc we don't yet forbid an user doing so. Maybe we should? > > How about this? > > ------------------------------------------- > @@ -3600,6 +3600,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > return; > } > > + if (migrate_mode_is_cpr(s) && migrate_postcopy()) { > + error_setg(&local_err, "cannot mix postcopy and cpr"); > + goto fail; > + } > + > if (resume) { > /* This is a resumed migration */ > rate_limit = migrate_max_postcopy_bandwidth(); > ------------------------------------------------ migrate_fd_connect() will be a bit late, the error won't be able to be attached in the "migrate" request. Perhaps, migrate_prepare()?
On Wed, Feb 21, 2024 at 04:20:07PM -0500, Steven Sistare wrote: > On 2/20/2024 2:33 AM, Peter Xu wrote: > > On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote: > >> When migration for cpr is initiated, stop the vm and set state > >> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the > >> possibility of ram and device state being out of sync, and guarantees > >> that a guest in the suspended state remains suspended, because qmp_cont > >> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. > >> > >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > >> --- > >> include/migration/misc.h | 1 + > >> migration/migration.c | 32 +++++++++++++++++++++++++------- > >> 2 files changed, 26 insertions(+), 7 deletions(-) > >> > >> diff --git a/include/migration/misc.h b/include/migration/misc.h > >> index 6dc234b..54c99a3 100644 > >> --- a/include/migration/misc.h > >> +++ b/include/migration/misc.h > >> @@ -60,6 +60,7 @@ void migration_object_init(void); > >> void migration_shutdown(void); > >> bool migration_is_idle(void); > >> bool migration_is_active(MigrationState *); > >> +bool migrate_mode_is_cpr(MigrationState *); > >> > >> typedef enum MigrationEventType { > >> MIG_EVENT_PRECOPY_SETUP, > >> diff --git a/migration/migration.c b/migration/migration.c > >> index d1fce9e..fc5c587 100644 > >> --- a/migration/migration.c > >> +++ b/migration/migration.c > >> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s) > >> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > >> } > >> > >> +bool migrate_mode_is_cpr(MigrationState *s) > >> +{ > >> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; > >> +} > >> + > >> int migrate_init(MigrationState *s, Error **errp) > >> { > >> int ret; > >> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s, > >> bql_lock(); > >> migration_downtime_start(s); > >> > >> - s->vm_old_state = runstate_get(); > >> - global_state_store(); > >> - > >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > >> - trace_migration_completion_vm_stop(ret); > >> - if (ret < 0) { > >> - goto out_unlock; > >> + if (!migrate_mode_is_cpr(s)) { > >> + s->vm_old_state = runstate_get(); > >> + global_state_store(); > >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > >> + trace_migration_completion_vm_stop(ret); > >> + if (ret < 0) { > >> + goto out_unlock; > >> + } > >> } > >> > >> ret = migration_maybe_pause(s, current_active_state, > >> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > >> Error *local_err = NULL; > >> uint64_t rate_limit; > >> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; > >> + int ret; > >> > >> /* > >> * If there's a previous error, free it and prepare for another one. > >> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > >> goto fail; > >> } > >> > >> + if (migrate_mode_is_cpr(s)) { > >> + s->vm_old_state = runstate_get(); > >> + global_state_store(); > >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > >> + trace_migration_completion_vm_stop(ret); > >> + if (ret < 0) { > >> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); > >> + goto fail; > >> + } > >> + } > > > > Could we have a helper function for the shared codes? > > I propose to add code to migration_stop_vm to make it the helper. Some call sites emit > more traces (via migration_stop_vm) as a result of my refactoring, This should be fine. > and postcopy start sets > vm_old_state, which is not used thereafter in that path. Those changes seem harmless to me. Not only harmless, I think it was a bug to not set vm_old_state in postcopy_start().. See: https://issues.redhat.com/browse/RHEL-18061 I suspect that was the cause. > Tell me what you think: I'll have a closer look later, but so far it looks all good. Thanks, > > ------------------------------------------------------- > diff --git a/migration/migration.c b/migration/migration.c > index fc5c587..30d2b08 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -107,12 +107,6 @@ static int migration_maybe_pause(MigrationState *s, > static void migrate_fd_cancel(MigrationState *s); > static bool close_return_path_on_source(MigrationState *s); > > -static void migration_downtime_start(MigrationState *s) > -{ > - trace_vmstate_downtime_checkpoint("src-downtime-start"); > - s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > -} > - > static void migration_downtime_end(MigrationState *s) > { > int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > @@ -161,11 +155,20 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpo > return (a > b) - (a < b); > } > > -int migration_stop_vm(RunState state) > +static int migration_stop_vm(MigrationState *s, RunState state) > { > - int ret = vm_stop_force_state(state); > + int ret; > + > + trace_vmstate_downtime_checkpoint("src-downtime-start"); > + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > + > + s->vm_old_state = runstate_get(); > + global_state_store(); > + > + ret = vm_stop_force_state(state); > > trace_vmstate_downtime_checkpoint("src-vm-stopped"); > + trace_migration_completion_vm_stop(ret); > > return ret; > } > @@ -2454,10 +2457,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > bql_lock(); > trace_postcopy_start_set_run(); > > - migration_downtime_start(ms); > - > - global_state_store(); > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE); > if (ret < 0) { > goto fail; > } > @@ -2654,13 +2654,9 @@ static int migration_completion_precopy(MigrationState *s, > int ret; > > bql_lock(); > - migration_downtime_start(s); > > if (!migrate_mode_is_cpr(s)) { > - s->vm_old_state = runstate_get(); > - global_state_store(); > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > - trace_migration_completion_vm_stop(ret); > + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); > if (ret < 0) { > goto out_unlock; > } > @@ -3498,15 +3494,10 @@ static void *bg_migration_thread(void *opaque) > s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; > > trace_migration_thread_setup_complete(); > - migration_downtime_start(s); > > bql_lock(); > > - s->vm_old_state = runstate_get(); > - > - global_state_store(); > - /* Forcibly stop VM before saving state of vCPUs and devices */ > - if (migration_stop_vm(RUN_STATE_PAUSED)) { > + if (migration_stop_vm(s, RUN_STATE_PAUSED)) { > goto fail; > } > /* > @@ -3659,10 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > } > > if (migrate_mode_is_cpr(s)) { > - s->vm_old_state = runstate_get(); > - global_state_store(); > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > - trace_migration_completion_vm_stop(ret); > + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); > if (ret < 0) { > error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); > goto fail; > diff --git a/migration/migration.h b/migration/migration.h > index aef8afb..65c0b61 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s); > */ > void migration_rp_kick(MigrationState *s); > > -int migration_stop_vm(RunState state); > - > #endif > ------------------------------------------------------- > > - Steve >
On Thu, Feb 22, 2024 at 05:12:53PM +0800, Peter Xu wrote: > On Wed, Feb 21, 2024 at 04:20:07PM -0500, Steven Sistare wrote: > > On 2/20/2024 2:33 AM, Peter Xu wrote: > > > On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote: > > >> When migration for cpr is initiated, stop the vm and set state > > >> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the > > >> possibility of ram and device state being out of sync, and guarantees > > >> that a guest in the suspended state remains suspended, because qmp_cont > > >> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. > > >> > > >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > > >> --- > > >> include/migration/misc.h | 1 + > > >> migration/migration.c | 32 +++++++++++++++++++++++++------- > > >> 2 files changed, 26 insertions(+), 7 deletions(-) > > >> > > >> diff --git a/include/migration/misc.h b/include/migration/misc.h > > >> index 6dc234b..54c99a3 100644 > > >> --- a/include/migration/misc.h > > >> +++ b/include/migration/misc.h > > >> @@ -60,6 +60,7 @@ void migration_object_init(void); > > >> void migration_shutdown(void); > > >> bool migration_is_idle(void); > > >> bool migration_is_active(MigrationState *); > > >> +bool migrate_mode_is_cpr(MigrationState *); > > >> > > >> typedef enum MigrationEventType { > > >> MIG_EVENT_PRECOPY_SETUP, > > >> diff --git a/migration/migration.c b/migration/migration.c > > >> index d1fce9e..fc5c587 100644 > > >> --- a/migration/migration.c > > >> +++ b/migration/migration.c > > >> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s) > > >> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > > >> } > > >> > > >> +bool migrate_mode_is_cpr(MigrationState *s) > > >> +{ > > >> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; > > >> +} > > >> + > > >> int migrate_init(MigrationState *s, Error **errp) > > >> { > > >> int ret; > > >> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s, > > >> bql_lock(); > > >> migration_downtime_start(s); > > >> > > >> - s->vm_old_state = runstate_get(); > > >> - global_state_store(); > > >> - > > >> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > > >> - trace_migration_completion_vm_stop(ret); > > >> - if (ret < 0) { > > >> - goto out_unlock; > > >> + if (!migrate_mode_is_cpr(s)) { > > >> + s->vm_old_state = runstate_get(); > > >> + global_state_store(); > > >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > > >> + trace_migration_completion_vm_stop(ret); > > >> + if (ret < 0) { > > >> + goto out_unlock; > > >> + } > > >> } > > >> > > >> ret = migration_maybe_pause(s, current_active_state, > > >> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > > >> Error *local_err = NULL; > > >> uint64_t rate_limit; > > >> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; > > >> + int ret; > > >> > > >> /* > > >> * If there's a previous error, free it and prepare for another one. > > >> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > > >> goto fail; > > >> } > > >> > > >> + if (migrate_mode_is_cpr(s)) { > > >> + s->vm_old_state = runstate_get(); > > >> + global_state_store(); > > >> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > > >> + trace_migration_completion_vm_stop(ret); > > >> + if (ret < 0) { > > >> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); > > >> + goto fail; > > >> + } > > >> + } > > > > > > Could we have a helper function for the shared codes? > > > > I propose to add code to migration_stop_vm to make it the helper. Some call sites emit > > more traces (via migration_stop_vm) as a result of my refactoring, > > This should be fine. > > > and postcopy start sets > > vm_old_state, which is not used thereafter in that path. Those changes seem harmless to me. > > Not only harmless, I think it was a bug to not set vm_old_state in > postcopy_start().. See: > > https://issues.redhat.com/browse/RHEL-18061 > > I suspect that was the cause. > > > Tell me what you think: > > I'll have a closer look later, but so far it looks all good. > > Thanks, > > > > > ------------------------------------------------------- > > diff --git a/migration/migration.c b/migration/migration.c > > index fc5c587..30d2b08 100644 > > --- a/migration/migration.c > > +++ b/migration/migration.c > > @@ -107,12 +107,6 @@ static int migration_maybe_pause(MigrationState *s, > > static void migrate_fd_cancel(MigrationState *s); > > static bool close_return_path_on_source(MigrationState *s); > > > > -static void migration_downtime_start(MigrationState *s) > > -{ > > - trace_vmstate_downtime_checkpoint("src-downtime-start"); > > - s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > > -} Ah.. one more thing: would you mind keep this helper even if it can be squashed when sending formal patch? I want to keep downtime start/end super clear and paired as they're important hook points. It should be inlined anyway by the compiler. > > - > > static void migration_downtime_end(MigrationState *s) > > { > > int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > > @@ -161,11 +155,20 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpo > > return (a > b) - (a < b); > > } > > > > -int migration_stop_vm(RunState state) > > +static int migration_stop_vm(MigrationState *s, RunState state) > > { > > - int ret = vm_stop_force_state(state); > > + int ret; > > + > > + trace_vmstate_downtime_checkpoint("src-downtime-start"); > > + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > > + > > + s->vm_old_state = runstate_get(); > > + global_state_store(); > > + > > + ret = vm_stop_force_state(state); > > > > trace_vmstate_downtime_checkpoint("src-vm-stopped"); > > + trace_migration_completion_vm_stop(ret); > > > > return ret; > > } > > @@ -2454,10 +2457,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > > bql_lock(); > > trace_postcopy_start_set_run(); > > > > - migration_downtime_start(ms); > > - > > - global_state_store(); > > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > > + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE); > > if (ret < 0) { > > goto fail; > > } > > @@ -2654,13 +2654,9 @@ static int migration_completion_precopy(MigrationState *s, > > int ret; > > > > bql_lock(); > > - migration_downtime_start(s); > > > > if (!migrate_mode_is_cpr(s)) { > > - s->vm_old_state = runstate_get(); > > - global_state_store(); > > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > > - trace_migration_completion_vm_stop(ret); > > + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); > > if (ret < 0) { > > goto out_unlock; > > } > > @@ -3498,15 +3494,10 @@ static void *bg_migration_thread(void *opaque) > > s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; > > > > trace_migration_thread_setup_complete(); > > - migration_downtime_start(s); > > > > bql_lock(); > > > > - s->vm_old_state = runstate_get(); > > - > > - global_state_store(); > > - /* Forcibly stop VM before saving state of vCPUs and devices */ > > - if (migration_stop_vm(RUN_STATE_PAUSED)) { > > + if (migration_stop_vm(s, RUN_STATE_PAUSED)) { > > goto fail; > > } > > /* > > @@ -3659,10 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > > } > > > > if (migrate_mode_is_cpr(s)) { > > - s->vm_old_state = runstate_get(); > > - global_state_store(); > > - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); > > - trace_migration_completion_vm_stop(ret); > > + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); > > if (ret < 0) { > > error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); > > goto fail; > > diff --git a/migration/migration.h b/migration/migration.h > > index aef8afb..65c0b61 100644 > > --- a/migration/migration.h > > +++ b/migration/migration.h > > @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s); > > */ > > void migration_rp_kick(MigrationState *s); > > > > -int migration_stop_vm(RunState state); > > - > > #endif > > ------------------------------------------------------- > > > > - Steve > > > > -- > Peter Xu
On 2/22/2024 4:03 AM, Peter Xu wrote: > On Wed, Feb 21, 2024 at 04:23:07PM -0500, Steven Sistare wrote: >>> How about postcopy? I know it's nonsense to enable postcopy for cpr.. but >>> iiuc we don't yet forbid an user doing so. Maybe we should? >> >> How about this? >> >> ------------------------------------------- >> @@ -3600,6 +3600,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> return; >> } >> >> + if (migrate_mode_is_cpr(s) && migrate_postcopy()) { >> + error_setg(&local_err, "cannot mix postcopy and cpr"); >> + goto fail; >> + } >> + >> if (resume) { >> /* This is a resumed migration */ >> rate_limit = migrate_max_postcopy_bandwidth(); >> ------------------------------------------------ > > migrate_fd_connect() will be a bit late, the error won't be able to be > attached in the "migrate" request. Perhaps, migrate_prepare()? Thank you, that is better - steve
On 2/22/2024 4:30 AM, Peter Xu wrote: > On Thu, Feb 22, 2024 at 05:12:53PM +0800, Peter Xu wrote: >> On Wed, Feb 21, 2024 at 04:20:07PM -0500, Steven Sistare wrote: >>> On 2/20/2024 2:33 AM, Peter Xu wrote: >>>> On Thu, Feb 08, 2024 at 10:54:03AM -0800, Steve Sistare wrote: >>>>> When migration for cpr is initiated, stop the vm and set state >>>>> RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the >>>>> possibility of ram and device state being out of sync, and guarantees >>>>> that a guest in the suspended state remains suspended, because qmp_cont >>>>> rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. >>>>> >>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >>>>> --- >>>>> include/migration/misc.h | 1 + >>>>> migration/migration.c | 32 +++++++++++++++++++++++++------- >>>>> 2 files changed, 26 insertions(+), 7 deletions(-) >>>>> >>>>> diff --git a/include/migration/misc.h b/include/migration/misc.h >>>>> index 6dc234b..54c99a3 100644 >>>>> --- a/include/migration/misc.h >>>>> +++ b/include/migration/misc.h >>>>> @@ -60,6 +60,7 @@ void migration_object_init(void); >>>>> void migration_shutdown(void); >>>>> bool migration_is_idle(void); >>>>> bool migration_is_active(MigrationState *); >>>>> +bool migrate_mode_is_cpr(MigrationState *); >>>>> >>>>> typedef enum MigrationEventType { >>>>> MIG_EVENT_PRECOPY_SETUP, >>>>> diff --git a/migration/migration.c b/migration/migration.c >>>>> index d1fce9e..fc5c587 100644 >>>>> --- a/migration/migration.c >>>>> +++ b/migration/migration.c >>>>> @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s) >>>>> s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); >>>>> } >>>>> >>>>> +bool migrate_mode_is_cpr(MigrationState *s) >>>>> +{ >>>>> + return s->parameters.mode == MIG_MODE_CPR_REBOOT; >>>>> +} >>>>> + >>>>> int migrate_init(MigrationState *s, Error **errp) >>>>> { >>>>> int ret; >>>>> @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s, >>>>> bql_lock(); >>>>> migration_downtime_start(s); >>>>> >>>>> - s->vm_old_state = runstate_get(); >>>>> - global_state_store(); >>>>> - >>>>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>>>> - trace_migration_completion_vm_stop(ret); >>>>> - if (ret < 0) { >>>>> - goto out_unlock; >>>>> + if (!migrate_mode_is_cpr(s)) { >>>>> + s->vm_old_state = runstate_get(); >>>>> + global_state_store(); >>>>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>>>> + trace_migration_completion_vm_stop(ret); >>>>> + if (ret < 0) { >>>>> + goto out_unlock; >>>>> + } >>>>> } >>>>> >>>>> ret = migration_maybe_pause(s, current_active_state, >>>>> @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >>>>> Error *local_err = NULL; >>>>> uint64_t rate_limit; >>>>> bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; >>>>> + int ret; >>>>> >>>>> /* >>>>> * If there's a previous error, free it and prepare for another one. >>>>> @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >>>>> goto fail; >>>>> } >>>>> >>>>> + if (migrate_mode_is_cpr(s)) { >>>>> + s->vm_old_state = runstate_get(); >>>>> + global_state_store(); >>>>> + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>>>> + trace_migration_completion_vm_stop(ret); >>>>> + if (ret < 0) { >>>>> + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); >>>>> + goto fail; >>>>> + } >>>>> + } >>>> >>>> Could we have a helper function for the shared codes? >>> >>> I propose to add code to migration_stop_vm to make it the helper. Some call sites emit >>> more traces (via migration_stop_vm) as a result of my refactoring, >> >> This should be fine. >> >>> and postcopy start sets >>> vm_old_state, which is not used thereafter in that path. Those changes seem harmless to me. >> >> Not only harmless, I think it was a bug to not set vm_old_state in >> postcopy_start().. See: >> >> https://issues.redhat.com/browse/RHEL-18061 >> >> I suspect that was the cause. >> >>> Tell me what you think: >> >> I'll have a closer look later, but so far it looks all good. >> >> Thanks, >> >>> >>> ------------------------------------------------------- >>> diff --git a/migration/migration.c b/migration/migration.c >>> index fc5c587..30d2b08 100644 >>> --- a/migration/migration.c >>> +++ b/migration/migration.c >>> @@ -107,12 +107,6 @@ static int migration_maybe_pause(MigrationState *s, >>> static void migrate_fd_cancel(MigrationState *s); >>> static bool close_return_path_on_source(MigrationState *s); >>> >>> -static void migration_downtime_start(MigrationState *s) >>> -{ >>> - trace_vmstate_downtime_checkpoint("src-downtime-start"); >>> - s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); >>> -} > > Ah.. one more thing: would you mind keep this helper even if it can be > squashed when sending formal patch? I want to keep downtime start/end > super clear and paired as they're important hook points. It should be > inlined anyway by the compiler. Will do - steve >>> - >>> static void migration_downtime_end(MigrationState *s) >>> { >>> int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); >>> @@ -161,11 +155,20 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpo >>> return (a > b) - (a < b); >>> } >>> >>> -int migration_stop_vm(RunState state) >>> +static int migration_stop_vm(MigrationState *s, RunState state) >>> { >>> - int ret = vm_stop_force_state(state); >>> + int ret; >>> + >>> + trace_vmstate_downtime_checkpoint("src-downtime-start"); >>> + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); >>> + >>> + s->vm_old_state = runstate_get(); >>> + global_state_store(); >>> + >>> + ret = vm_stop_force_state(state); >>> >>> trace_vmstate_downtime_checkpoint("src-vm-stopped"); >>> + trace_migration_completion_vm_stop(ret); >>> >>> return ret; >>> } >>> @@ -2454,10 +2457,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) >>> bql_lock(); >>> trace_postcopy_start_set_run(); >>> >>> - migration_downtime_start(ms); >>> - >>> - global_state_store(); >>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>> + ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE); >>> if (ret < 0) { >>> goto fail; >>> } >>> @@ -2654,13 +2654,9 @@ static int migration_completion_precopy(MigrationState *s, >>> int ret; >>> >>> bql_lock(); >>> - migration_downtime_start(s); >>> >>> if (!migrate_mode_is_cpr(s)) { >>> - s->vm_old_state = runstate_get(); >>> - global_state_store(); >>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>> - trace_migration_completion_vm_stop(ret); >>> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); >>> if (ret < 0) { >>> goto out_unlock; >>> } >>> @@ -3498,15 +3494,10 @@ static void *bg_migration_thread(void *opaque) >>> s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; >>> >>> trace_migration_thread_setup_complete(); >>> - migration_downtime_start(s); >>> >>> bql_lock(); >>> >>> - s->vm_old_state = runstate_get(); >>> - >>> - global_state_store(); >>> - /* Forcibly stop VM before saving state of vCPUs and devices */ >>> - if (migration_stop_vm(RUN_STATE_PAUSED)) { >>> + if (migration_stop_vm(s, RUN_STATE_PAUSED)) { >>> goto fail; >>> } >>> /* >>> @@ -3659,10 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >>> } >>> >>> if (migrate_mode_is_cpr(s)) { >>> - s->vm_old_state = runstate_get(); >>> - global_state_store(); >>> - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); >>> - trace_migration_completion_vm_stop(ret); >>> + ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE); >>> if (ret < 0) { >>> error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); >>> goto fail; >>> diff --git a/migration/migration.h b/migration/migration.h >>> index aef8afb..65c0b61 100644 >>> --- a/migration/migration.h >>> +++ b/migration/migration.h >>> @@ -541,6 +541,4 @@ int migration_rp_wait(MigrationState *s); >>> */ >>> void migration_rp_kick(MigrationState *s); >>> >>> -int migration_stop_vm(RunState state); >>> - >>> #endif >>> ------------------------------------------------------- >>> >>> - Steve >>> >> >> -- >> Peter Xu >
diff --git a/include/migration/misc.h b/include/migration/misc.h index 6dc234b..54c99a3 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -60,6 +60,7 @@ void migration_object_init(void); void migration_shutdown(void); bool migration_is_idle(void); bool migration_is_active(MigrationState *); +bool migrate_mode_is_cpr(MigrationState *); typedef enum MigrationEventType { MIG_EVENT_PRECOPY_SETUP, diff --git a/migration/migration.c b/migration/migration.c index d1fce9e..fc5c587 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1603,6 +1603,11 @@ bool migration_is_active(MigrationState *s) s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } +bool migrate_mode_is_cpr(MigrationState *s) +{ + return s->parameters.mode == MIG_MODE_CPR_REBOOT; +} + int migrate_init(MigrationState *s, Error **errp) { int ret; @@ -2651,13 +2656,14 @@ static int migration_completion_precopy(MigrationState *s, bql_lock(); migration_downtime_start(s); - s->vm_old_state = runstate_get(); - global_state_store(); - - ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); - trace_migration_completion_vm_stop(ret); - if (ret < 0) { - goto out_unlock; + if (!migrate_mode_is_cpr(s)) { + s->vm_old_state = runstate_get(); + global_state_store(); + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); + trace_migration_completion_vm_stop(ret); + if (ret < 0) { + goto out_unlock; + } } ret = migration_maybe_pause(s, current_active_state, @@ -3576,6 +3582,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) Error *local_err = NULL; uint64_t rate_limit; bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED; + int ret; /* * If there's a previous error, free it and prepare for another one. @@ -3651,6 +3658,17 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) goto fail; } + if (migrate_mode_is_cpr(s)) { + s->vm_old_state = runstate_get(); + global_state_store(); + ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE); + trace_migration_completion_vm_stop(ret); + if (ret < 0) { + error_setg(&local_err, "migration_stop_vm failed, error %d", -ret); + goto fail; + } + } + if (migrate_background_snapshot()) { qemu_thread_create(&s->thread, "bg_snapshot", bg_migration_thread, s, QEMU_THREAD_JOINABLE);
When migration for cpr is initiated, stop the vm and set state RUN_STATE_FINISH_MIGRATE before ram is saved. This eliminates the possibility of ram and device state being out of sync, and guarantees that a guest in the suspended state remains suspended, because qmp_cont rejects a cont command in the RUN_STATE_FINISH_MIGRATE state. Signed-off-by: Steve Sistare <steven.sistare@oracle.com> --- include/migration/misc.h | 1 + migration/migration.c | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-)