Message ID | 20250110100707.4805-1-shivam.kumar1@nutanix.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [RFC] Fix race in live migration failure path | expand |
Shivam Kumar <shivam.kumar1@nutanix.com> writes: > Even if a live migration fails due to some reason, migration status > should not be set to MIGRATION_STATUS_FAILED until migrate fd cleanup > is done, else the client can trigger another instance of migration > before the cleanup is complete (as it would assume no migration is > active) or reset migration capabilities affecting old migration's > cleanup. Hence, set the status to 'failing' when a migration failure > happens and once the cleanup is complete, set the migration status to > MIGRATION_STATUS_FAILED. > > Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com> > --- > migration/migration.c | 49 +++++++++++++++++++++---------------------- > migration/migration.h | 9 ++++++++ > migration/multifd.c | 6 ++---- > migration/savevm.c | 7 +++---- > 4 files changed, 38 insertions(+), 33 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index df61ca4e93..f084f54f6b 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1143,8 +1143,9 @@ static bool migration_is_active(void) migration_is_running() is the one that gates qmp_migrate() and qmp_migrate_set_capabilities(). > { > MigrationState *s = current_migration; > > - return (s->state == MIGRATION_STATUS_ACTIVE || > - s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > + return ((s->state == MIGRATION_STATUS_ACTIVE || > + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) && > + !qatomic_read(&s->failing)); > } > > static bool migrate_show_downtime(MigrationState *s) > @@ -1439,6 +1440,11 @@ static void migrate_fd_cleanup(MigrationState *s) > MIGRATION_STATUS_CANCELLED); > } > > + if (qatomic_xchg(&s->failing, 0)) { > + migrate_set_state(&s->state, s->state, > + MIGRATION_STATUS_FAILED); > + } I hope you've verified that sure every place that used to set FAILED will also reach migrate_fd_cleanup() eventually. Also, we probably still need the FAILING state. Otherwise, this will trip code that expects a state change on failure. 
Anything that does: if (state != MIGRATION_STATUS_FOO) { ... } So I think the change overall should be -migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); +migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILING); void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, MigrationStatus new_state) { assert(new_state < MIGRATION_STATUS__MAX); if (qatomic_cmpxchg(state, old_state, new_state) == old_state) { trace_migrate_set_state(MigrationStatus_str(new_state)); + if (new_state == MIGRATION_STATUS_FAILING) { + qatomic_set(&s->failing, 1); + } migrate_generate_event(new_state); } } And we should proably do the same for CANCELLING actually, but there the (preexisting) issue is actual concurrency, while here it's just inconsistency in the state. > + > if (s->error) { > /* It is used on info migrate. We can't free it */ > error_report_err(error_copy(s->error)); > @@ -1484,17 +1490,16 @@ static void migrate_error_free(MigrationState *s) > static void migrate_fd_error(MigrationState *s, const Error *error) > { > MigrationStatus current = s->state; > - MigrationStatus next; > > assert(s->to_dst_file == NULL); > > switch (current) { > case MIGRATION_STATUS_SETUP: > - next = MIGRATION_STATUS_FAILED; > + qatomic_set(&s->failing, 1); > break; > case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: > /* Never fail a postcopy migration; switch back to PAUSED instead */ > - next = MIGRATION_STATUS_POSTCOPY_PAUSED; > + migrate_set_state(&s->state, current, MIGRATION_STATUS_POSTCOPY_PAUSED); > break; > default: > /* > @@ -1506,7 +1511,6 @@ static void migrate_fd_error(MigrationState *s, const Error *error) > return; > } > > - migrate_set_state(&s->state, current, next); > migrate_set_error(s, error); > } > > @@ -2101,8 +2105,7 @@ void qmp_migrate(const char *uri, bool has_channels, > } else { > error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri", > "a valid migration protocol"); > - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, > - 
MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > } > > if (local_err) { > @@ -2498,7 +2501,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > if (migrate_postcopy_preempt()) { > migration_wait_main_channel(ms); > if (postcopy_preempt_establish_channel(ms)) { > - migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED); > + qatomic_set(&ms->failing, 1); > error_setg(errp, "%s: Failed to establish preempt channel", > __func__); > return -1; > @@ -2648,8 +2651,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) > fail_closefb: > qemu_fclose(fb); > fail: > - migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&ms->failing, 1); > if (restart_block) { > /* A failure happened early enough that we know the destination hasn't > * accessed block devices, so we're safe to recover. > @@ -2782,8 +2784,7 @@ static void migration_completion_failed(MigrationState *s, > bql_unlock(); > } > > - migrate_set_state(&s->state, current_active_state, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > } > > /** > @@ -2850,8 +2851,6 @@ fail: > */ > static void bg_migration_completion(MigrationState *s) > { > - int current_active_state = s->state; > - > if (s->state == MIGRATION_STATUS_ACTIVE) { > /* > * By this moment we have RAM content saved into the migration stream. > @@ -2874,8 +2873,7 @@ static void bg_migration_completion(MigrationState *s) > return; > > fail: > - migrate_set_state(&s->state, current_active_state, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > } > > typedef enum MigThrError { > @@ -3047,6 +3045,10 @@ static MigThrError migration_detect_error(MigrationState *s) > return MIG_THR_ERR_FATAL; > } > > + if (qatomic_read(&s->failing)) { > + return MIG_THR_ERR_FATAL; > + } > + > /* > * Try to detect any file errors. Note that postcopy_qemufile_src will > * be NULL when postcopy preempt is not enabled. 
> @@ -3077,7 +3079,7 @@ static MigThrError migration_detect_error(MigrationState *s) > * For precopy (or postcopy with error outside IO), we fail > * with no time. > */ > - migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > trace_migration_thread_file_err(); > > /* Time to stop the migration, now. */ > @@ -3492,8 +3494,7 @@ static void *migration_thread(void *opaque) > if (ret) { > migrate_set_error(s, local_err); > error_free(local_err); > - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > goto out; > } > > @@ -3617,8 +3618,7 @@ static void *bg_migration_thread(void *opaque) > if (ret) { > migrate_set_error(s, local_err); > error_free(local_err); > - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > goto fail_setup; > } > > @@ -3685,8 +3685,7 @@ static void *bg_migration_thread(void *opaque) > > fail: > if (early_fail) { > - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > bql_unlock(); > } > > @@ -3805,7 +3804,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > > fail: > migrate_set_error(s, local_err); > - migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > error_report_err(local_err); > migrate_fd_cleanup(s); > } > diff --git a/migration/migration.h b/migration/migration.h > index 7b6e718690..3b808d971f 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -471,6 +471,15 @@ struct MigrationState { > bool switchover_acked; > /* Is this a rdma migration */ > bool rdma_migration; > + /* > + * Is migration failing? Migration status should not be set to > + * MIGRATION_STATUS_FAILED until migrate fd cleanup is done, else > + * the client can trigger another instance of migration before > + * the cleanup is complete. 
Hence, set the status to 'failing' > + * when a migration failure happens and once the cleanup is done, > + * set it to MIGRATION_STATUS_FAILED. > + */ > + int failing; > }; > > void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, > diff --git a/migration/multifd.c b/migration/multifd.c > index 4f973d70e0..48ece2162c 100644 > --- a/migration/multifd.c > +++ b/migration/multifd.c > @@ -878,8 +878,7 @@ bool multifd_send_setup(void) > return true; > > err: > - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > return false; > } > > @@ -949,8 +948,7 @@ static void multifd_recv_terminate_threads(Error *err) > migrate_set_error(s, err); > if (s->state == MIGRATION_STATUS_SETUP || > s->state == MIGRATION_STATUS_ACTIVE) { > - migrate_set_state(&s->state, s->state, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > } > } > > diff --git a/migration/savevm.c b/migration/savevm.c > index 927b1146c0..4f0ef34f23 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1686,7 +1686,6 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) > { > int ret; > MigrationState *ms = migrate_get_current(); > - MigrationStatus status; > > if (migration_is_running()) { > error_setg(errp, "There's a migration process in progress"); > @@ -1723,11 +1722,11 @@ cleanup: > qemu_savevm_state_cleanup(); > > if (ret != 0) { > - status = MIGRATION_STATUS_FAILED; > + qatomic_set(&ms->failing, 1); > } else { > - status = MIGRATION_STATUS_COMPLETED; > + migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, > + MIGRATION_STATUS_COMPLETED); > } > - migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status); > > /* f is outer parameter, it should not stay in global migration state after > * this function finished */
On Fri, Jan 10, 2025 at 10:09:38AM -0300, Fabiano Rosas wrote: > Shivam Kumar <shivam.kumar1@nutanix.com> writes: > > > Even if a live migration fails due to some reason, migration status > > should not be set to MIGRATION_STATUS_FAILED until migrate fd cleanup > > is done, else the client can trigger another instance of migration > > before the cleanup is complete (as it would assume no migration is > > active) or reset migration capabilities affecting old migration's > > cleanup. Hence, set the status to 'failing' when a migration failure > > happens and once the cleanup is complete, set the migration status to > > MIGRATION_STATUS_FAILED. > > > > Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com> > > --- > > migration/migration.c | 49 +++++++++++++++++++++---------------------- > > migration/migration.h | 9 ++++++++ > > migration/multifd.c | 6 ++---- > > migration/savevm.c | 7 +++---- > > 4 files changed, 38 insertions(+), 33 deletions(-) > > > > diff --git a/migration/migration.c b/migration/migration.c > > index df61ca4e93..f084f54f6b 100644 > > --- a/migration/migration.c > > +++ b/migration/migration.c > > @@ -1143,8 +1143,9 @@ static bool migration_is_active(void) > > migration_is_running() is the one that gates qmp_migrate() and > qmp_migrate_set_capabilities(). 
> > > { > > MigrationState *s = current_migration; > > > > - return (s->state == MIGRATION_STATUS_ACTIVE || > > - s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > > + return ((s->state == MIGRATION_STATUS_ACTIVE || > > + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) && > > + !qatomic_read(&s->failing)); > > } > > > > static bool migrate_show_downtime(MigrationState *s) > > @@ -1439,6 +1440,11 @@ static void migrate_fd_cleanup(MigrationState *s) > > MIGRATION_STATUS_CANCELLED); > > } > > > > + if (qatomic_xchg(&s->failing, 0)) { > > + migrate_set_state(&s->state, s->state, > > + MIGRATION_STATUS_FAILED); > > + } > > I hope you've verified that sure every place that used to set FAILED > will also reach migrate_fd_cleanup() eventually. > > Also, we probably still need the FAILING state. Otherwise, this will > trip code that expects a state change on failure. Anything that does: > > if (state != MIGRATION_STATUS_FOO) { > ... > } > > So I think the change overall should be > > -migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); > +migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILING); > > void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, > MigrationStatus new_state) > { > assert(new_state < MIGRATION_STATUS__MAX); > if (qatomic_cmpxchg(state, old_state, new_state) == old_state) { > trace_migrate_set_state(MigrationStatus_str(new_state)); > > + if (new_state == MIGRATION_STATUS_FAILING) { > + qatomic_set(&s->failing, 1); > + } > migrate_generate_event(new_state); > } > } > > And we should proably do the same for CANCELLING actually, but there the > (preexisting) issue is actual concurrency, while here it's just > inconsistency in the state. Yes something like FAILING sounds reasonable. 
Though since we have s->error, I wonder whether that's a better place to represent a migration as "failing" in one place, because otherwise we need to set two places (both FAILING state, and the s->error) - whenever something fails, we'd better always update s->error so as to remember what failed, then reported via query-migrate. From that POV, s->failing is probably never gonna be needed (due to s->error being present anyway)? So far, such Error* looks like the best single point to say that the migration is failing - it also enforces the Error to be provided whoever wants to set it to failing state.
On 13 Jan 2025, at 9:59 PM, Peter Xu <peterx@redhat.com> wrote:
!-------------------------------------------------------------------|
CAUTION: External Email
|-------------------------------------------------------------------!
On Fri, Jan 10, 2025 at 10:09:38AM -0300, Fabiano Rosas wrote:
Shivam Kumar <shivam.kumar1@nutanix.com> writes:
Even if a live migration fails due to some reason, migration status
should not be set to MIGRATION_STATUS_FAILED until migrate fd cleanup
is done, else the client can trigger another instance of migration
before the cleanup is complete (as it would assume no migration is
active) or reset migration capabilities affecting old migration's
cleanup. Hence, set the status to 'failing' when a migration failure
happens and once the cleanup is complete, set the migration status to
MIGRATION_STATUS_FAILED.
Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com>
---
migration/migration.c | 49 +++++++++++++++++++++----------------------
migration/migration.h | 9 ++++++++
migration/multifd.c | 6 ++----
migration/savevm.c | 7 +++----
4 files changed, 38 insertions(+), 33 deletions(-)
diff --git a/migration/migration.c b/migration/migration.c
index df61ca4e93..f084f54f6b 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1143,8 +1143,9 @@ static bool migration_is_active(void)
migration_is_running() is the one that gates qmp_migrate() and
qmp_migrate_set_capabilities().
{
MigrationState *s = current_migration;
- return (s->state == MIGRATION_STATUS_ACTIVE ||
- s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
+ return ((s->state == MIGRATION_STATUS_ACTIVE ||
+ s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) &&
+ !qatomic_read(&s->failing));
}
static bool migrate_show_downtime(MigrationState *s)
@@ -1439,6 +1440,11 @@ static void migrate_fd_cleanup(MigrationState *s)
MIGRATION_STATUS_CANCELLED);
}
+ if (qatomic_xchg(&s->failing, 0)) {
+ migrate_set_state(&s->state, s->state,
+ MIGRATION_STATUS_FAILED);
+ }
I hope you've verified that every place that used to set FAILED
will also reach migrate_fd_cleanup() eventually.
Also, we probably still need the FAILING state. Otherwise, this will
trip code that expects a state change on failure. Anything that does:
if (state != MIGRATION_STATUS_FOO) {
...
}
So I think the change overall should be
-migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILING);
void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
MigrationStatus new_state)
{
assert(new_state < MIGRATION_STATUS__MAX);
if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
trace_migrate_set_state(MigrationStatus_str(new_state));
+ if (new_state == MIGRATION_STATUS_FAILING) {
+ qatomic_set(&s->failing, 1);
+ }
migrate_generate_event(new_state);
}
}
And we should probably do the same for CANCELLING actually, but there the
(preexisting) issue is actual concurrency, while here it's just
inconsistency in the state.
Yes something like FAILING sounds reasonable. Though since we have
s->error, I wonder whether that's a better place to represent a migration
as "failing" in one place, because otherwise we need to set two places
(both FAILING state, and the s->error) - whenever something fails, we'd
better always update s->error so as to remember what failed, then reported
via query-migrate.
From that POV, s->failing is probably never gonna be needed (due to
s->error being present anyway)? So far, such Error* looks like the best
single point to say that the migration is failing - it also enforces the
Error to be provided whoever wants to set it to failing state.
--
Peter Xu
There is one place where we set the migration status to FAILED but we don’t set
s->error, i.e. in save_snapshot workflow, please check qemu_savevm_state but
not sure setting s->error in this case is possible (or required), as it seems a
different workflow to me.
In addition, one potentially real problem that I see is this comment in
migration_detect_error:
/*
* For postcopy, we allow the network to be down for a
* while. After that, it can be continued by a
* recovery phase.
*/
Let's say if we set s->error at some place and there was a file error on either
source or destination (qemu_file_get_error_obj_any returns a positive value
when called by migration_detect_error). We expect migration to fail in this
case but migration will continue to run since post-copy migration is tolerant
to file errors?
Hi, Shivam, On Wed, Jan 22, 2025 at 10:54:17AM +0000, Shivam Kumar wrote: > There is one place where we set the migration status to FAILED but we don’t set > s->error, i.e. in save_snapshot workflow, please check qemu_savevm_state but > not sure setting s->error in this case is possible (or required), as it seems a > different workflow to me. Yes it's very different indeed. I may not yet fully get the challenge on how switching to s->error (implies FAILING) would affect this one, though. I mean, in qemu_savevm_state() it tries to fetch qemufile errors with: ret = qemu_file_get_error(f); IIUC we could also try to fetch s->error just like what migration thread does, and if it sets means it's failing? > > In addition, one potentially real problem that I see is this comment in > migration_detect_error: > /* > * For postcopy, we allow the network to be down for a > * while. After that, it can be continued by a > * recovery phase. > */ > Let's say if we set s->error at some place and there was a file error on either > source or destination (qemu_file_get_error_obj_any returns a positive value This is trivial, but I suppose you meant s/positive/negative/ here.. as qemufile's last_error should always be negative, iiuc. > when called by migration_detect_error). We expect migration to fail in this > case but migration will continue to run since post-copy migration is tolerant > to file errors? Yes it can halt at postcopy_pause(). I'm not yet understand why it's an issue to using s->error, though. 
In general, I'm thinking whether we could also check s->error in detect error path like this: ===8<=== diff --git a/migration/migration.c b/migration/migration.c index 2d1da917c7..fbd97395e0 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3015,17 +3015,17 @@ static MigThrError migration_detect_error(MigrationState *s) ret = qemu_file_get_error_obj_any(s->to_dst_file, s->postcopy_qemufile_src, &local_error); - if (!ret) { - /* Everything is fine */ - assert(!local_error); - return MIG_THR_ERR_NONE; - } - - if (local_error) { + if (ret) { + /* Passover qemufile errors to s->error */ + assert(local_error); migrate_set_error(s, local_error); error_free(local_error); } + if (!migrate_has_error(s)) { + return MIG_THR_ERR_NONE; + } + if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) { /* * For postcopy, we allow the network to be down for a @@ -3037,6 +3037,8 @@ static MigThrError migration_detect_error(MigrationState *s) /* * For precopy (or postcopy with error outside IO), we fail * with no time. + * + * TODO: update FAILED only until the end of migration in BH. */ migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); trace_migration_thread_file_err(); ===8<=== I kept a TODO above, I would hope if you reworked everything to route errors to s->error, then we can move this to the cleanup BH to avoid the race. Do you think that could work? Thanks,
> On 22 Jan 2025, at 10:10 PM, Peter Xu <peterx@redhat.com> wrote: > > !-------------------------------------------------------------------| > CAUTION: External Email > > |-------------------------------------------------------------------! > > Hi, Shivam, > > On Wed, Jan 22, 2025 at 10:54:17AM +0000, Shivam Kumar wrote: >> There is one place where we set the migration status to FAILED but we don’t set >> s->error, i.e. in save_snapshot workflow, please check qemu_savevm_state but >> not sure setting s->error in this case is possible (or required), as it seems a >> different workflow to me. > > Yes it's very different indeed. I may not yet fully get the challenge on > how switching to s->error (implies FAILING) would affect this one, though. > I mean, in qemu_savevm_state() it tries to fetch qemufile errors with: > > ret = qemu_file_get_error(f); > > IIUC we could also try to fetch s->error just like what migration thread > does, and if it sets means it's failing? Yes, I can just set s->error in qemu_savevm_state. @@ -1686,7 +1686,6 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) - status = MIGRATION_STATUS_FAILED; + s->error = *errp; But my question was: is migrate_fd_cleanup called (where we will set the status to FAILED if s->error is set) in the snapshot workflow? > >> >> In addition, one potentially real problem that I see is this comment in >> migration_detect_error: >> /* >> * For postcopy, we allow the network to be down for a >> * while. After that, it can be continued by a >> * recovery phase. >> */ >> Let's say if we set s->error at some place and there was a file error on either >> source or destination (qemu_file_get_error_obj_any returns a positive value > > This is trivial, but I suppose you meant s/positive/negative/ here.. as > qemufile's last_error should always be negative, iiuc. > >> when called by migration_detect_error). 
We expect migration to fail in this >> case but migration will continue to run since post-copy migration is tolerant >> to file errors? > > Yes it can halt at postcopy_pause(). I'm not yet understand why it's an > issue to using s->error, though. > > In general, I'm thinking whether we could also check s->error in detect > error path like this: > > ===8<=== > diff --git a/migration/migration.c b/migration/migration.c > index 2d1da917c7..fbd97395e0 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -3015,17 +3015,17 @@ static MigThrError migration_detect_error(MigrationState *s) > ret = qemu_file_get_error_obj_any(s->to_dst_file, > s->postcopy_qemufile_src, > &local_error); > - if (!ret) { > - /* Everything is fine */ > - assert(!local_error); > - return MIG_THR_ERR_NONE; > - } > - > - if (local_error) { > + if (ret) { > + /* Passover qemufile errors to s->error */ > + assert(local_error); > migrate_set_error(s, local_error); > error_free(local_error); > } > > + if (!migrate_has_error(s)) { > + return MIG_THR_ERR_NONE; > + } > + > if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) { > /* > * For postcopy, we allow the network to be down for a > @@ -3037,6 +3037,8 @@ static MigThrError migration_detect_error(MigrationState *s) > /* > * For precopy (or postcopy with error outside IO), we fail > * with no time. > + * > + * TODO: update FAILED only until the end of migration in BH. > */ > migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); > trace_migration_thread_file_err(); > ===8<=== > > I kept a TODO above, I would hope if you reworked everything to route > errors to s->error, then we can move this to the cleanup BH to avoid the > race. > > Do you think that could work? I meant: in case of post-copy, what if we have another error somewhere and s->error was set, but then we also saw a file error when we called qemu_file_get_error_obj_any. In this case, migration should fail IMO but it would be paused instead, right?
On Thu, Jan 23, 2025 at 09:53:16AM +0000, Shivam Kumar wrote: > > > > On 22 Jan 2025, at 10:10 PM, Peter Xu <peterx@redhat.com> wrote: > > > > !-------------------------------------------------------------------| > > CAUTION: External Email > > > > |-------------------------------------------------------------------! > > > > Hi, Shivam, > > > > On Wed, Jan 22, 2025 at 10:54:17AM +0000, Shivam Kumar wrote: > >> There is one place where we set the migration status to FAILED but we don’t set > >> s->error, i.e. in save_snapshot workflow, please check qemu_savevm_state but > >> not sure setting s->error in this case is possible (or required), as it seems a > >> different workflow to me. > > > > Yes it's very different indeed. I may not yet fully get the challenge on > > how switching to s->error (implies FAILING) would affect this one, though. > > I mean, in qemu_savevm_state() it tries to fetch qemufile errors with: > > > > ret = qemu_file_get_error(f); > > > > IIUC we could also try to fetch s->error just like what migration thread > > does, and if it sets means it's failing? > Yes, I can just set s->error in qemu_savevm_state. > @@ -1686,7 +1686,6 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) > - status = MIGRATION_STATUS_FAILED; > + s->error = *errp; > But my question was: is migrate_fd_cleanup called (where we will set the status > to FAILED if s->error is set) in the snapshot workflow? I see what you meant. It's not called indeed. We may need to process it the same as what migrate_fd_cleanup() does. So far the snapshot code reuses migration code in a partial way, so it's not crystal clear where the line is, e.g., obviously it still moves the migration state machine but it never shows "active" phase at all (even if it has a major chunk of duration that it's actively migrating the data to the snapshot files). Here the state machine only goes from SETUP to either FAILED or COMPLETED. 
From that POV looks like we should duplicate such s->error detection logic here on deciding whether to switch to FAILED, if we treat s->error as the internal "single failure point" for migration. > > > >> > >> In addition, one potentially real problem that I see is this comment in > >> migration_detect_error: > >> /* > >> * For postcopy, we allow the network to be down for a > >> * while. After that, it can be continued by a > >> * recovery phase. > >> */ > >> Let's say if we set s->error at some place and there was a file error on either > >> source or destination (qemu_file_get_error_obj_any returns a positive value > > > > This is trivial, but I suppose you meant s/positive/negative/ here.. as > > qemufile's last_error should always be negative, iiuc. > > > >> when called by migration_detect_error). We expect migration to fail in this > >> case but migration will continue to run since post-copy migration is tolerant > >> to file errors? > > > > Yes it can halt at postcopy_pause(). I'm not yet understand why it's an > > issue to using s->error, though. 
> > > > In general, I'm thinking whether we could also check s->error in detect > > error path like this: > > > > ===8<=== > > diff --git a/migration/migration.c b/migration/migration.c > > index 2d1da917c7..fbd97395e0 100644 > > --- a/migration/migration.c > > +++ b/migration/migration.c > > @@ -3015,17 +3015,17 @@ static MigThrError migration_detect_error(MigrationState *s) > > ret = qemu_file_get_error_obj_any(s->to_dst_file, > > s->postcopy_qemufile_src, > > &local_error); > > - if (!ret) { > > - /* Everything is fine */ > > - assert(!local_error); > > - return MIG_THR_ERR_NONE; > > - } > > - > > - if (local_error) { > > + if (ret) { > > + /* Passover qemufile errors to s->error */ > > + assert(local_error); > > migrate_set_error(s, local_error); > > error_free(local_error); > > } > > > > + if (!migrate_has_error(s)) { > > + return MIG_THR_ERR_NONE; > > + } > > + > > if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) { > > /* > > * For postcopy, we allow the network to be down for a > > @@ -3037,6 +3037,8 @@ static MigThrError migration_detect_error(MigrationState *s) > > /* > > * For precopy (or postcopy with error outside IO), we fail > > * with no time. > > + * > > + * TODO: update FAILED only until the end of migration in BH. > > */ > > migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); > > trace_migration_thread_file_err(); > > ===8<=== > > > > I kept a TODO above, I would hope if you reworked everything to route > > errors to s->error, then we can move this to the cleanup BH to avoid the > > race. > > > > Do you think that could work? > > I meant: in case of post-copy, what if we have another error somewhere and > s->error was set, but then we also saw a file error when we called > qemu_file_get_error_obj_any. In this case, migration should fail IMO but it > would be paused instead, right? 
Yeah you got a point, but I see no good reason to cancel any postcopy migration, no matter which error it is - either a qemufile error or another - simply because postcopy cancel means VM crash. There's nothing worse than that.. So IMHO we could treat it the same as EIO errors in this case as of now, and we always pause postcopy no matter which kind of error it hits. At least for non-recoverable errors we can have some active process to look at on src QEMU instance, OTOH there's no direct benefit for us to differentiate different error cases to crash VM earlier. Thanks,
> On 23 Jan 2025, at 9:57 PM, Peter Xu <peterx@redhat.com> wrote: > > !-------------------------------------------------------------------| > CAUTION: External Email > > |-------------------------------------------------------------------! > > On Thu, Jan 23, 2025 at 09:53:16AM +0000, Shivam Kumar wrote: >> >> >>> On 22 Jan 2025, at 10:10 PM, Peter Xu <peterx@redhat.com> wrote: >>> >>> !-------------------------------------------------------------------| >>> CAUTION: External Email >>> >>> |-------------------------------------------------------------------! >>> >>> Hi, Shivam, >>> >>> On Wed, Jan 22, 2025 at 10:54:17AM +0000, Shivam Kumar wrote: >>>> There is one place where we set the migration status to FAILED but we don’t set >>>> s->error, i.e. in save_snapshot workflow, please check qemu_savevm_state but >>>> not sure setting s->error in this case is possible (or required), as it seems a >>>> different workflow to me. >>> >>> Yes it's very different indeed. I may not yet fully get the challenge on >>> how switching to s->error (implies FAILING) would affect this one, though. >>> I mean, in qemu_savevm_state() it tries to fetch qemufile errors with: >>> >>> ret = qemu_file_get_error(f); >>> >>> IIUC we could also try to fetch s->error just like what migration thread >>> does, and if it sets means it's failing? >> Yes, I can just set s->error in qemu_savevm_state. >> @@ -1686,7 +1686,6 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) >> - status = MIGRATION_STATUS_FAILED; >> + s->error = *errp; >> But my question was: is migrate_fd_cleanup called (where we will set the status >> to FAILED if s->error is set) in the snapshot workflow? > > I see what you meant. It's not called indeed. We may need to process it > the same as what migrate_fd_cleanup() does. 
> > So far the snapshot code reuses migration code in a partial way, so it's > not crystal clear where the line is, e.g., obviously it still moves the > migration state machine but it never shows "active" phase at all (even if > it has a major chunk of duration that it's actively migrating the data to > the snapshot files). Here the state machine only goes from SETUP to either > FAILED or COMPLETED. > > From that POV looks like we should duplicate such s->error detection logic > here on deciding whether to switch to FAILED, if we treat s->error as the > internal "single failure point" for migration. I hope setting the state to FAILED at the end of save_snapshot might work: --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1722,7 +1722,7 @@ cleanup: qemu_savevm_state_cleanup(); if (ret != 0) { - qatomic_set(&ms->failing, 1); + migrate_set_error(ms, *errp); } else { migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_COMPLETED); @@ -3054,6 +3054,7 @@ bool save_snapshot(const char *name, bool overwrite, const char *vmstate, RunState saved_state = runstate_get(); uint64_t vm_state_size; g_autoptr(GDateTime) now = g_date_time_new_now_local(); + MigrationState *ms; GLOBAL_STATE_CODE(); @@ -3149,8 +3150,13 @@ bool save_snapshot(const char *name, bool overwrite, const char *vmstate, the_end: bdrv_drain_all_end(); - + ms = migrate_get_current(); vm_resume(saved_state); + if (migrate_has_error(ms)) { + migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + } + return ret == 0; } Does this make sense? >> >> I meant: in case of post-copy, what if we have another error somewhere and >> s->error was set, but then we also saw a file error when we called >> qemu_file_get_error_obj_any. In this case, migration should fail IMO but it >> would be paused instead, right? 
> > Yeah you got a point, but I see no good reason to cancel any postcopy > migration, no matter which error it is - either a qemufile error or another > - simply because postcopy cancel means VM crash. There's nothing worse > that that.. > > So IMHO we could treat it the same as EIO errors in this case as of now, > and we always pause postcopy no matter which kind of error it hits. At > least for non-recoverable errors we can have some active process to look > at on src QEMU instance, OTOH there's no direct benefit for us to > differenciate different error cases to crash VM earlier. > > Thanks, > > -- > Peter Xu > Yes, this makes sense. Thanks. Thanks, Shivam
On Thu, Jan 23, 2025 at 06:05:28PM +0000, Shivam Kumar wrote: > > > > On 23 Jan 2025, at 9:57 PM, Peter Xu <peterx@redhat.com> wrote: > > > > !-------------------------------------------------------------------| > > CAUTION: External Email > > > > |-------------------------------------------------------------------! > > > > On Thu, Jan 23, 2025 at 09:53:16AM +0000, Shivam Kumar wrote: > >> > >> > >>> On 22 Jan 2025, at 10:10 PM, Peter Xu <peterx@redhat.com> wrote: > >>> > >>> !-------------------------------------------------------------------| > >>> CAUTION: External Email > >>> > >>> |-------------------------------------------------------------------! > >>> > >>> Hi, Shivam, > >>> > >>> On Wed, Jan 22, 2025 at 10:54:17AM +0000, Shivam Kumar wrote: > >>>> There is one place where we set the migration status to FAILED but we don’t set > >>>> s->error, i.e. in save_snapshot workflow, please check qemu_savevm_state but > >>>> not sure setting s->error in this case is possible (or required), as it seems a > >>>> different workflow to me. > >>> > >>> Yes it's very different indeed. I may not yet fully get the challenge on > >>> how switching to s->error (implies FAILING) would affect this one, though. > >>> I mean, in qemu_savevm_state() it tries to fetch qemufile errors with: > >>> > >>> ret = qemu_file_get_error(f); > >>> > >>> IIUC we could also try to fetch s->error just like what migration thread > >>> does, and if it sets means it's failing? > >> Yes, I can just set s->error in qemu_savevm_state. > >> @@ -1686,7 +1686,6 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) > >> - status = MIGRATION_STATUS_FAILED; > >> + s->error = *errp; > >> But my question was: is migrate_fd_cleanup called (where we will set the status > >> to FAILED if s->error is set) in the snapshot workflow? > > > > I see what you meant. It's not called indeed. We may need to process it > > the same as what migrate_fd_cleanup() does. 
> > > > So far the snapshot code reuses migration code in a partial way, so it's > > not crystal clear where the line is, e.g., obviously it still moves the > > migration state machine but it never shows "active" phase at all (even if > > it has a major chunk of duration that it's actively migrating the data to > > the snapshot files). Here the state machine only goes from SETUP to either > > FAILED or COMPLETED. > > > > From that POV looks like we should duplicate such s->error detection logic > > here on deciding whether to switch to FAILED, if we treat s->error as the > > internal "single failure point" for migration. > I hope setting the state to FAILED at the end of save_snapshot might work: > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1722,7 +1722,7 @@ cleanup: > qemu_savevm_state_cleanup(); > > if (ret != 0) { > - qatomic_set(&ms->failing, 1); > + migrate_set_error(ms, *errp); > } else { > migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, > MIGRATION_STATUS_COMPLETED); > @@ -3054,6 +3054,7 @@ bool save_snapshot(const char *name, bool overwrite, const char *vmstate, > RunState saved_state = runstate_get(); > uint64_t vm_state_size; > g_autoptr(GDateTime) now = g_date_time_new_now_local(); > + MigrationState *ms; > > GLOBAL_STATE_CODE(); > > @@ -3149,8 +3150,13 @@ bool save_snapshot(const char *name, bool overwrite, const char *vmstate, > > the_end: > bdrv_drain_all_end(); > - > + ms = migrate_get_current(); > vm_resume(saved_state); > + if (migrate_has_error(ms)) { > + migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, > + MIGRATION_STATUS_FAILED); > + } > + I actually am not sure whether we need to postpone the FAILED update until here for save_snapshot. After all, merely the whole qemu_savevm_state() takes BQL, then there's no way to race it with another "migrate" (QMP command "migrate" needs BQL too) at this point. OTOH, it also feels weird to update state in qemu_savevm_state_cleanup().. 
Perhaps we can keep how qemu_savevm_state() would update FAILED state, then we only need to update FAILED for precopy in migrate_fd_cleanup()? We can still check for s->error in qemu_savevm_state(), though, because after switching to a heavier use of s->error maybe we can fail the save_snapshot() too with some s->error set (even if qemufile is still working). Then we may want to fail the save_snapshot() too. Thanks,
diff --git a/migration/migration.c b/migration/migration.c index df61ca4e93..f084f54f6b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1143,8 +1143,9 @@ static bool migration_is_active(void) { MigrationState *s = current_migration; - return (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); + return ((s->state == MIGRATION_STATUS_ACTIVE || + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) && + !qatomic_read(&s->failing)); } static bool migrate_show_downtime(MigrationState *s) @@ -1439,6 +1440,11 @@ static void migrate_fd_cleanup(MigrationState *s) MIGRATION_STATUS_CANCELLED); } + if (qatomic_xchg(&s->failing, 0)) { + migrate_set_state(&s->state, s->state, + MIGRATION_STATUS_FAILED); + } + if (s->error) { /* It is used on info migrate. We can't free it */ error_report_err(error_copy(s->error)); @@ -1484,17 +1490,16 @@ static void migrate_error_free(MigrationState *s) static void migrate_fd_error(MigrationState *s, const Error *error) { MigrationStatus current = s->state; - MigrationStatus next; assert(s->to_dst_file == NULL); switch (current) { case MIGRATION_STATUS_SETUP: - next = MIGRATION_STATUS_FAILED; + qatomic_set(&s->failing, 1); break; case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: /* Never fail a postcopy migration; switch back to PAUSED instead */ - next = MIGRATION_STATUS_POSTCOPY_PAUSED; + migrate_set_state(&s->state, current, MIGRATION_STATUS_POSTCOPY_PAUSED); break; default: /* @@ -1506,7 +1511,6 @@ static void migrate_fd_error(MigrationState *s, const Error *error) return; } - migrate_set_state(&s->state, current, next); migrate_set_error(s, error); } @@ -2101,8 +2105,7 @@ void qmp_migrate(const char *uri, bool has_channels, } else { error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri", "a valid migration protocol"); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); } if (local_err) { @@ -2498,7 +2501,7 @@ static int 
postcopy_start(MigrationState *ms, Error **errp) if (migrate_postcopy_preempt()) { migration_wait_main_channel(ms); if (postcopy_preempt_establish_channel(ms)) { - migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED); + qatomic_set(&ms->failing, 1); error_setg(errp, "%s: Failed to establish preempt channel", __func__); return -1; @@ -2648,8 +2651,7 @@ static int postcopy_start(MigrationState *ms, Error **errp) fail_closefb: qemu_fclose(fb); fail: - migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_FAILED); + qatomic_set(&ms->failing, 1); if (restart_block) { /* A failure happened early enough that we know the destination hasn't * accessed block devices, so we're safe to recover. @@ -2782,8 +2784,7 @@ static void migration_completion_failed(MigrationState *s, bql_unlock(); } - migrate_set_state(&s->state, current_active_state, - MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); } /** @@ -2850,8 +2851,6 @@ fail: */ static void bg_migration_completion(MigrationState *s) { - int current_active_state = s->state; - if (s->state == MIGRATION_STATUS_ACTIVE) { /* * By this moment we have RAM content saved into the migration stream. @@ -2874,8 +2873,7 @@ static void bg_migration_completion(MigrationState *s) return; fail: - migrate_set_state(&s->state, current_active_state, - MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); } typedef enum MigThrError { @@ -3047,6 +3045,10 @@ static MigThrError migration_detect_error(MigrationState *s) return MIG_THR_ERR_FATAL; } + if (qatomic_read(&s->failing)) { + return MIG_THR_ERR_FATAL; + } + /* * Try to detect any file errors. Note that postcopy_qemufile_src will * be NULL when postcopy preempt is not enabled. @@ -3077,7 +3079,7 @@ static MigThrError migration_detect_error(MigrationState *s) * For precopy (or postcopy with error outside IO), we fail * with no time. 
*/ - migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); trace_migration_thread_file_err(); /* Time to stop the migration, now. */ @@ -3492,8 +3494,7 @@ static void *migration_thread(void *opaque) if (ret) { migrate_set_error(s, local_err); error_free(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); goto out; } @@ -3617,8 +3618,7 @@ static void *bg_migration_thread(void *opaque) if (ret) { migrate_set_error(s, local_err); error_free(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); goto fail_setup; } @@ -3685,8 +3685,7 @@ static void *bg_migration_thread(void *opaque) fail: if (early_fail) { - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); bql_unlock(); } @@ -3805,7 +3804,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) fail: migrate_set_error(s, local_err); - migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); error_report_err(local_err); migrate_fd_cleanup(s); } diff --git a/migration/migration.h b/migration/migration.h index 7b6e718690..3b808d971f 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -471,6 +471,15 @@ struct MigrationState { bool switchover_acked; /* Is this a rdma migration */ bool rdma_migration; + /* + * Is migration failing? Migration status should not be set to + * MIGRATION_STATUS_FAILED until migrate fd cleanup is done, else + * the client can trigger another instance of migration before + * the cleanup is complete. Hence, set the status to 'failing' + * when a migration failure happens and once the cleanup is done, + * set it to MIGRATION_STATUS_FAILED. 
+ */ + int failing; }; void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, diff --git a/migration/multifd.c b/migration/multifd.c index 4f973d70e0..48ece2162c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -878,8 +878,7 @@ bool multifd_send_setup(void) return true; err: - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); return false; } @@ -949,8 +948,7 @@ static void multifd_recv_terminate_threads(Error *err) migrate_set_error(s, err); if (s->state == MIGRATION_STATUS_SETUP || s->state == MIGRATION_STATUS_ACTIVE) { - migrate_set_state(&s->state, s->state, - MIGRATION_STATUS_FAILED); + qatomic_set(&s->failing, 1); } } diff --git a/migration/savevm.c b/migration/savevm.c index 927b1146c0..4f0ef34f23 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1686,7 +1686,6 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) { int ret; MigrationState *ms = migrate_get_current(); - MigrationStatus status; if (migration_is_running()) { error_setg(errp, "There's a migration process in progress"); @@ -1723,11 +1722,11 @@ cleanup: qemu_savevm_state_cleanup(); if (ret != 0) { - status = MIGRATION_STATUS_FAILED; + qatomic_set(&ms->failing, 1); } else { - status = MIGRATION_STATUS_COMPLETED; + migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_COMPLETED); } - migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status); /* f is outer parameter, it should not stay in global migration state after * this function finished */
Even if a live migration fails due to some reason, migration status should not be set to MIGRATION_STATUS_FAILED until migrate fd cleanup is done, else the client can trigger another instance of migration before the cleanup is complete (as it would assume no migration is active) or reset migration capabilities affecting old migration's cleanup. Hence, set the status to 'failing' when a migration failure happens and once the cleanup is complete, set the migration status to MIGRATION_STATUS_FAILED. Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com> --- migration/migration.c | 49 +++++++++++++++++++++---------------------- migration/migration.h | 9 ++++++++ migration/multifd.c | 6 ++---- migration/savevm.c | 7 +++---- 4 files changed, 38 insertions(+), 33 deletions(-)