diff mbox series

[V3,09/13] migration: notifier error checking

Message ID 1707418446-134863-10-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New, archived
Headers show
Series allow cpr-reboot for vfio | expand

Commit Message

Steven Sistare Feb. 8, 2024, 6:54 p.m. UTC
Check the status returned by migration notifiers and report errors.
If notifiers fail, call the notifiers again so they can clean up.
None of the notifiers return an error status at this time.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 include/migration/misc.h |  3 ++-
 migration/migration.c    | 40 +++++++++++++++++++++++++++++-----------
 2 files changed, 31 insertions(+), 12 deletions(-)

Comments

David Hildenbrand Feb. 12, 2024, 9:24 a.m. UTC | #1
On 08.02.24 19:54, Steve Sistare wrote:
> Check the status returned by migration notifiers and report errors.
> If notifiers fail, call the notifiers again so they can clean up.

IIUC, if any of the notifiers will actually start to fail, say, during 
MIG_EVENT_PRECOPY_SETUP, you will call MIG_EVENT_PRECOPY_FAILED on all 
notifiers.

That will include notifiers that have never seen a 
MIG_EVENT_PRECOPY_SETUP call.

Is that what we expect notifiers to be able to deal with? Can we 
document that?
Steven Sistare Feb. 12, 2024, 3:37 p.m. UTC | #2
On 2/12/2024 4:24 AM, David Hildenbrand wrote:
> On 08.02.24 19:54, Steve Sistare wrote:
>> Check the status returned by migration notifiers and report errors.
>> If notifiers fail, call the notifiers again so they can clean up.
> 
> IIUC, if any of the notifiers will actually start to fail, say, during MIG_EVENT_PRECOPY_SETUP, you will call MIG_EVENT_PRECOPY_FAILED on all notifiers.
> 
> That will include notifiers that have never seen a MIG_EVENT_PRECOPY_SETUP call.

Correct.

> Is that what we expect notifiers to be able to deal with? Can we document that?

The notifiers have always needed to handle failure without knowing the previous migration
states, and are robust about unwinding their own internal state.  I will document it.

Thanks for all the RBs.

- Steve
Peter Xu Feb. 20, 2024, 7:12 a.m. UTC | #3
On Thu, Feb 08, 2024 at 10:54:02AM -0800, Steve Sistare wrote:
> Check the status returned by migration notifiers and report errors.
> If notifiers fail, call the notifiers again so they can clean up.
> None of the notifiers return an error status at this time.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>  include/migration/misc.h |  3 ++-
>  migration/migration.c    | 40 +++++++++++++++++++++++++++++-----------
>  2 files changed, 31 insertions(+), 12 deletions(-)
> 
> diff --git a/include/migration/misc.h b/include/migration/misc.h
> index 0ea1902..6dc234b 100644
> --- a/include/migration/misc.h
> +++ b/include/migration/misc.h
> @@ -82,7 +82,8 @@ void migration_add_notifier(NotifierWithReturn *notify,
>  void migration_add_notifier_mode(NotifierWithReturn *notify,
>                                   MigrationNotifyFunc func, MigMode mode);
>  void migration_remove_notifier(NotifierWithReturn *notify);
> -void migration_call_notifiers(MigrationState *s, MigrationEventType type);
> +int migration_call_notifiers(MigrationState *s, MigrationEventType type,
> +                             Error **errp);
>  bool migration_in_setup(MigrationState *);
>  bool migration_has_finished(MigrationState *);
>  bool migration_has_failed(MigrationState *);
> diff --git a/migration/migration.c b/migration/migration.c
> index 01d8867..d1fce9e 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1318,6 +1318,8 @@ void migrate_set_state(int *state, int old_state, int new_state)
>  
>  static void migrate_fd_cleanup(MigrationState *s)
>  {
> +    Error *local_err = NULL;
> +
>      g_free(s->hostname);
>      s->hostname = NULL;
>      json_writer_free(s->vmdesc);
> @@ -1362,13 +1364,23 @@ static void migrate_fd_cleanup(MigrationState *s)
>                            MIGRATION_STATUS_CANCELLED);
>      }
>  
> +    if (!migration_has_failed(s) &&
> +        migration_call_notifiers(s, MIG_EVENT_PRECOPY_DONE, &local_err)) {
> +
> +        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
> +        migrate_set_error(s, local_err);
> +        error_free(local_err);
> +    }
> +
>      if (s->error) {
>          /* It is used on info migrate.  We can't free it */
>          error_report_err(error_copy(s->error));
>      }
> -    migration_call_notifiers(s, s->state == MIGRATION_STATUS_COMPLETED ?
> -                             MIG_EVENT_PRECOPY_DONE :
> -                             MIG_EVENT_PRECOPY_FAILED);
> +
> +    if (migration_has_failed(s)) {
> +        migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
> +    }

AFAIU, the whole point of such split is, allowing DONE notifies to fail too
and then if that happens we can invoke FAIL notifiers again.

Perhaps we can avoid that complexity, but rather document only SETUP
notifiers can fail?

The problem is that failing a notifier at this stage (if migration already
finished) can already be too late; dest QEMU can already have started
running, so no way to roll back.  We can document that, check and assert
for !SETUP cases to make sure error is never hit?

> +
>      block_cleanup_parameters();
>      yank_unregister_instance(MIGRATION_YANK_INSTANCE);
>  }
> @@ -1481,13 +1493,15 @@ void migration_remove_notifier(NotifierWithReturn *notify)
>      }
>  }
>  
> -void migration_call_notifiers(MigrationState *s, MigrationEventType type)
> +int migration_call_notifiers(MigrationState *s, MigrationEventType type,
> +                             Error **errp)
>  {
>      MigMode mode = s->parameters.mode;
>      MigrationEvent e;
>  
>      e.type = type;
> -    notifier_with_return_list_notify(&migration_state_notifiers[mode], &e, 0);
> +    return notifier_with_return_list_notify(&migration_state_notifiers[mode],
> +                                            &e, errp);
>  }
>  
>  bool migration_in_setup(MigrationState *s)
> @@ -2535,7 +2549,9 @@ static int postcopy_start(MigrationState *ms, Error **errp)
>       * at the transition to postcopy and after the device state; in particular
>       * spice needs to trigger a transition now
>       */
> -    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE);
> +    if (migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE, errp)) {
> +        goto fail;
> +    }
>  
>      migration_downtime_end(ms);
>  
> @@ -2555,11 +2571,10 @@ static int postcopy_start(MigrationState *ms, Error **errp)
>  
>      ret = qemu_file_get_error(ms->to_dst_file);
>      if (ret) {
> -        error_setg(errp, "postcopy_start: Migration stream errored");
> -        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> -                              MIGRATION_STATUS_FAILED);
> +        error_setg_errno(errp, -ret, "postcopy_start: Migration stream error");
> +        bql_lock();
> +        goto fail;
>      }
> -
>      trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
>  
>      return ret;
> @@ -2580,6 +2595,7 @@ fail:
>              error_report_err(local_err);
>          }
>      }
> +    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
>      bql_unlock();
>      return -1;
>  }
> @@ -3594,7 +3610,9 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>          rate_limit = migrate_max_bandwidth();
>  
>          /* Notify before starting migration thread */
> -        migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP);
> +        if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP, &local_err)) {
> +            goto fail;
> +        }
>      }
>  
>      migration_rate_set(rate_limit);
> -- 
> 1.8.3.1
>
Steven Sistare Feb. 20, 2024, 10:12 p.m. UTC | #4
On 2/20/2024 2:12 AM, Peter Xu wrote:
> On Thu, Feb 08, 2024 at 10:54:02AM -0800, Steve Sistare wrote:
>> Check the status returned by migration notifiers and report errors.
>> If notifiers fail, call the notifiers again so they can clean up.
>> None of the notifiers return an error status at this time.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>>  include/migration/misc.h |  3 ++-
>>  migration/migration.c    | 40 +++++++++++++++++++++++++++++-----------
>>  2 files changed, 31 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/migration/misc.h b/include/migration/misc.h
>> index 0ea1902..6dc234b 100644
>> --- a/include/migration/misc.h
>> +++ b/include/migration/misc.h
>> @@ -82,7 +82,8 @@ void migration_add_notifier(NotifierWithReturn *notify,
>>  void migration_add_notifier_mode(NotifierWithReturn *notify,
>>                                   MigrationNotifyFunc func, MigMode mode);
>>  void migration_remove_notifier(NotifierWithReturn *notify);
>> -void migration_call_notifiers(MigrationState *s, MigrationEventType type);
>> +int migration_call_notifiers(MigrationState *s, MigrationEventType type,
>> +                             Error **errp);
>>  bool migration_in_setup(MigrationState *);
>>  bool migration_has_finished(MigrationState *);
>>  bool migration_has_failed(MigrationState *);
>> diff --git a/migration/migration.c b/migration/migration.c
>> index 01d8867..d1fce9e 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -1318,6 +1318,8 @@ void migrate_set_state(int *state, int old_state, int new_state)
>>  
>>  static void migrate_fd_cleanup(MigrationState *s)
>>  {
>> +    Error *local_err = NULL;
>> +
>>      g_free(s->hostname);
>>      s->hostname = NULL;
>>      json_writer_free(s->vmdesc);
>> @@ -1362,13 +1364,23 @@ static void migrate_fd_cleanup(MigrationState *s)
>>                            MIGRATION_STATUS_CANCELLED);
>>      }
>>  
>> +    if (!migration_has_failed(s) &&
>> +        migration_call_notifiers(s, MIG_EVENT_PRECOPY_DONE, &local_err)) {
>> +
>> +        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
>> +        migrate_set_error(s, local_err);
>> +        error_free(local_err);
>> +    }
>> +
>>      if (s->error) {
>>          /* It is used on info migrate.  We can't free it */
>>          error_report_err(error_copy(s->error));
>>      }
>> -    migration_call_notifiers(s, s->state == MIGRATION_STATUS_COMPLETED ?
>> -                             MIG_EVENT_PRECOPY_DONE :
>> -                             MIG_EVENT_PRECOPY_FAILED);
>> +
>> +    if (migration_has_failed(s)) {
>> +        migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
>> +    }
> 
> AFAIU, the whole point of such split is, allowing DONE notifies to fail too
> and then if that happens we can invoke FAIL notifiers again.

Correct.

> 
> Perhaps we can avoid that complexity, but rather document only SETUP
> notifiers can fail?
> 
> The problem is that failing a notifier at this stage (if migration already
> finished) can already be too late; dest QEMU can already have started
> running, so no way to roll back.  We can document that, check and assert
> for !SETUP cases to make sure error is never hit?

Makes sense. I will modify the patch as you suggest.

- Steve

>> +
>>      block_cleanup_parameters();
>>      yank_unregister_instance(MIGRATION_YANK_INSTANCE);
>>  }
>> @@ -1481,13 +1493,15 @@ void migration_remove_notifier(NotifierWithReturn *notify)
>>      }
>>  }
>>  
>> -void migration_call_notifiers(MigrationState *s, MigrationEventType type)
>> +int migration_call_notifiers(MigrationState *s, MigrationEventType type,
>> +                             Error **errp)
>>  {
>>      MigMode mode = s->parameters.mode;
>>      MigrationEvent e;
>>  
>>      e.type = type;
>> -    notifier_with_return_list_notify(&migration_state_notifiers[mode], &e, 0);
>> +    return notifier_with_return_list_notify(&migration_state_notifiers[mode],
>> +                                            &e, errp);
>>  }
>>  
>>  bool migration_in_setup(MigrationState *s)
>> @@ -2535,7 +2549,9 @@ static int postcopy_start(MigrationState *ms, Error **errp)
>>       * at the transition to postcopy and after the device state; in particular
>>       * spice needs to trigger a transition now
>>       */
>> -    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE);
>> +    if (migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE, errp)) {
>> +        goto fail;
>> +    }
>>  
>>      migration_downtime_end(ms);
>>  
>> @@ -2555,11 +2571,10 @@ static int postcopy_start(MigrationState *ms, Error **errp)
>>  
>>      ret = qemu_file_get_error(ms->to_dst_file);
>>      if (ret) {
>> -        error_setg(errp, "postcopy_start: Migration stream errored");
>> -        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
>> -                              MIGRATION_STATUS_FAILED);
>> +        error_setg_errno(errp, -ret, "postcopy_start: Migration stream error");
>> +        bql_lock();
>> +        goto fail;
>>      }
>> -
>>      trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
>>  
>>      return ret;
>> @@ -2580,6 +2595,7 @@ fail:
>>              error_report_err(local_err);
>>          }
>>      }
>> +    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
>>      bql_unlock();
>>      return -1;
>>  }
>> @@ -3594,7 +3610,9 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>          rate_limit = migrate_max_bandwidth();
>>  
>>          /* Notify before starting migration thread */
>> -        migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP);
>> +        if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP, &local_err)) {
>> +            goto fail;
>> +        }
>>      }
>>  
>>      migration_rate_set(rate_limit);
>> -- 
>> 1.8.3.1
>>
>
diff mbox series

Patch

diff --git a/include/migration/misc.h b/include/migration/misc.h
index 0ea1902..6dc234b 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -82,7 +82,8 @@  void migration_add_notifier(NotifierWithReturn *notify,
 void migration_add_notifier_mode(NotifierWithReturn *notify,
                                  MigrationNotifyFunc func, MigMode mode);
 void migration_remove_notifier(NotifierWithReturn *notify);
-void migration_call_notifiers(MigrationState *s, MigrationEventType type);
+int migration_call_notifiers(MigrationState *s, MigrationEventType type,
+                             Error **errp);
 bool migration_in_setup(MigrationState *);
 bool migration_has_finished(MigrationState *);
 bool migration_has_failed(MigrationState *);
diff --git a/migration/migration.c b/migration/migration.c
index 01d8867..d1fce9e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1318,6 +1318,8 @@  void migrate_set_state(int *state, int old_state, int new_state)
 
 static void migrate_fd_cleanup(MigrationState *s)
 {
+    Error *local_err = NULL;
+
     g_free(s->hostname);
     s->hostname = NULL;
     json_writer_free(s->vmdesc);
@@ -1362,13 +1364,23 @@  static void migrate_fd_cleanup(MigrationState *s)
                           MIGRATION_STATUS_CANCELLED);
     }
 
+    if (!migration_has_failed(s) &&
+        migration_call_notifiers(s, MIG_EVENT_PRECOPY_DONE, &local_err)) {
+
+        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+        migrate_set_error(s, local_err);
+        error_free(local_err);
+    }
+
     if (s->error) {
         /* It is used on info migrate.  We can't free it */
         error_report_err(error_copy(s->error));
     }
-    migration_call_notifiers(s, s->state == MIGRATION_STATUS_COMPLETED ?
-                             MIG_EVENT_PRECOPY_DONE :
-                             MIG_EVENT_PRECOPY_FAILED);
+
+    if (migration_has_failed(s)) {
+        migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
+    }
+
     block_cleanup_parameters();
     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
 }
@@ -1481,13 +1493,15 @@  void migration_remove_notifier(NotifierWithReturn *notify)
     }
 }
 
-void migration_call_notifiers(MigrationState *s, MigrationEventType type)
+int migration_call_notifiers(MigrationState *s, MigrationEventType type,
+                             Error **errp)
 {
     MigMode mode = s->parameters.mode;
     MigrationEvent e;
 
     e.type = type;
-    notifier_with_return_list_notify(&migration_state_notifiers[mode], &e, 0);
+    return notifier_with_return_list_notify(&migration_state_notifiers[mode],
+                                            &e, errp);
 }
 
 bool migration_in_setup(MigrationState *s)
@@ -2535,7 +2549,9 @@  static int postcopy_start(MigrationState *ms, Error **errp)
      * at the transition to postcopy and after the device state; in particular
      * spice needs to trigger a transition now
      */
-    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE);
+    if (migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE, errp)) {
+        goto fail;
+    }
 
     migration_downtime_end(ms);
 
@@ -2555,11 +2571,10 @@  static int postcopy_start(MigrationState *ms, Error **errp)
 
     ret = qemu_file_get_error(ms->to_dst_file);
     if (ret) {
-        error_setg(errp, "postcopy_start: Migration stream errored");
-        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
-                              MIGRATION_STATUS_FAILED);
+        error_setg_errno(errp, -ret, "postcopy_start: Migration stream error");
+        bql_lock();
+        goto fail;
     }
-
     trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
 
     return ret;
@@ -2580,6 +2595,7 @@  fail:
             error_report_err(local_err);
         }
     }
+    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
     bql_unlock();
     return -1;
 }
@@ -3594,7 +3610,9 @@  void migrate_fd_connect(MigrationState *s, Error *error_in)
         rate_limit = migrate_max_bandwidth();
 
         /* Notify before starting migration thread */
-        migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP);
+        if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP, &local_err)) {
+            goto fail;
+        }
     }
 
     migration_rate_set(rate_limit);