diff mbox series

[5/6] migration: Fix hang after error in destination setup phase

Message ID 20241202220137.32584-6-farosas@suse.de (mailing list archive)
State New
Headers show
Series migration: Fix issues during qmp_migrate_cancel | expand

Commit Message

Fabiano Rosas Dec. 2, 2024, 10:01 p.m. UTC
If the destination side fails at migration_ioc_process_incoming()
before starting the coroutine, it will report the error but QEMU will
not exit.

Set the migration state to FAILED and exit the process if
exit-on-error allows.

CC: Thomas Huth <thuth@redhat.com>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2633
Reported-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
 migration/channel.c   | 11 ++++++-----
 migration/migration.c | 31 ++++++++++++++++++-------------
 migration/migration.h |  2 +-
 3 files changed, 25 insertions(+), 19 deletions(-)

Comments

Peter Xu Dec. 4, 2024, 6:44 p.m. UTC | #1
On Mon, Dec 02, 2024 at 07:01:36PM -0300, Fabiano Rosas wrote:
> If the destination side fails at migration_ioc_process_incoming()
> before starting the coroutine, it will report the error but QEMU will
> not exit.
> 
> Set the migration state to FAILED and exit the process if
> exit-on-error allows.
> 
> CC: Thomas Huth <thuth@redhat.com>
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2633
> Reported-by: Daniel P. Berrangé <berrange@redhat.com>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>

(I skipped the postcopy patches as of now, until we finish the discussion
 in patch 2)

> ---
>  migration/channel.c   | 11 ++++++-----
>  migration/migration.c | 31 ++++++++++++++++++-------------
>  migration/migration.h |  2 +-
>  3 files changed, 25 insertions(+), 19 deletions(-)
> 
> diff --git a/migration/channel.c b/migration/channel.c
> index f9de064f3b..6d7f9172d8 100644
> --- a/migration/channel.c
> +++ b/migration/channel.c
> @@ -40,13 +40,14 @@ void migration_channel_process_incoming(QIOChannel *ioc)
>  
>      if (migrate_channel_requires_tls_upgrade(ioc)) {
>          migration_tls_channel_process_incoming(s, ioc, &local_err);
> +
> +        if (local_err) {
> +            error_report_err(local_err);
> +        }

What if tls processing failed here, do we have similar issue that qemu will
stall?  Do we want to cover that too?

> +
>      } else {
>          migration_ioc_register_yank(ioc);
> -        migration_ioc_process_incoming(ioc, &local_err);
> -    }
> -
> -    if (local_err) {
> -        error_report_err(local_err);
> +        migration_ioc_process_incoming(ioc);
>      }
>  }
>  
> diff --git a/migration/migration.c b/migration/migration.c
> index 8a61cc26d7..cd88ebc875 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -943,7 +943,7 @@ static bool migration_should_start_incoming(bool main_channel)
>      return true;
>  }
>  
> -void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
> +void migration_ioc_process_incoming(QIOChannel *ioc)
>  {
>      MigrationIncomingState *mis = migration_incoming_get_current();
>      Error *local_err = NULL;
> @@ -966,10 +966,9 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>           * issue is not possible.
>           */
>          ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
> -                                          sizeof(channel_magic), errp);
> -
> +                                          sizeof(channel_magic), &local_err);
>          if (ret != 0) {
> -            return;
> +            goto err;
>          }
>  
>          default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
> @@ -977,8 +976,8 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>          default_channel = !mis->from_src_file;
>      }
>  
> -    if (multifd_recv_setup(errp) != 0) {
> -        return;
> +    if (multifd_recv_setup(&local_err) != 0) {
> +        goto err;
>      }
>  
>      if (default_channel) {
> @@ -995,18 +994,24 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>              postcopy_preempt_new_channel(mis, f);
>          }
>          if (local_err) {
> -            error_propagate(errp, local_err);
> -            return;
> +            goto err;
>          }
>      }
>  
> -    if (migration_should_start_incoming(default_channel)) {
> -        /* If it's a recovery, we're done */
> -        if (postcopy_try_recover()) {
> -            return;
> -        }
> +    if (migration_should_start_incoming(default_channel) &&
> +        !postcopy_try_recover()) {
>          migration_incoming_process();
>      }
> +
> +    return;
> +
> +err:
> +    error_report_err(local_err);
> +    migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
> +                      MIGRATION_STATUS_FAILED);
> +    if (mis->exit_on_error) {
> +        exit(EXIT_FAILURE);
> +    }
>  }
>  
>  /**
> diff --git a/migration/migration.h b/migration/migration.h
> index 0956e9274b..c367e5ea40 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -477,7 +477,7 @@ void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
>                         MigrationStatus new_state);
>  
>  void migration_fd_process_incoming(QEMUFile *f);
> -void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
> +void migration_ioc_process_incoming(QIOChannel *ioc);
>  void migration_incoming_process(void);
>  
>  bool  migration_has_all_channels(void);
> -- 
> 2.35.3
>
Fabiano Rosas Dec. 4, 2024, 7:05 p.m. UTC | #2
Peter Xu <peterx@redhat.com> writes:

> On Mon, Dec 02, 2024 at 07:01:36PM -0300, Fabiano Rosas wrote:
>> If the destination side fails at migration_ioc_process_incoming()
>> before starting the coroutine, it will report the error but QEMU will
>> not exit.
>> 
>> Set the migration state to FAILED and exit the process if
>> exit-on-error allows.
>> 
>> CC: Thomas Huth <thuth@redhat.com>
>> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2633
>> Reported-by: Daniel P. Berrangé <berrange@redhat.com>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>
> (I skipped the postcopy patches as of now, until we finish the discussion
>  in patch 2)
>
>> ---
>>  migration/channel.c   | 11 ++++++-----
>>  migration/migration.c | 31 ++++++++++++++++++-------------
>>  migration/migration.h |  2 +-
>>  3 files changed, 25 insertions(+), 19 deletions(-)
>> 
>> diff --git a/migration/channel.c b/migration/channel.c
>> index f9de064f3b..6d7f9172d8 100644
>> --- a/migration/channel.c
>> +++ b/migration/channel.c
>> @@ -40,13 +40,14 @@ void migration_channel_process_incoming(QIOChannel *ioc)
>>  
>>      if (migrate_channel_requires_tls_upgrade(ioc)) {
>>          migration_tls_channel_process_incoming(s, ioc, &local_err);
>> +
>> +        if (local_err) {
>> +            error_report_err(local_err);
>> +        }
>
> What if tls processing failed here, do we have similar issue that qemu will
> stall?  Do we want to cover that too?
>

Hmm, I think I confused this with the multifd_channel_connect() stuff
where the code is reentrant and we take the "regular" path when called a
second time. I need to look closer into this part.

>> +
>>      } else {
>>          migration_ioc_register_yank(ioc);
>> -        migration_ioc_process_incoming(ioc, &local_err);
>> -    }
>> -
>> -    if (local_err) {
>> -        error_report_err(local_err);
>> +        migration_ioc_process_incoming(ioc);
>>      }
>>  }
>>  
>> diff --git a/migration/migration.c b/migration/migration.c
>> index 8a61cc26d7..cd88ebc875 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -943,7 +943,7 @@ static bool migration_should_start_incoming(bool main_channel)
>>      return true;
>>  }
>>  
>> -void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>> +void migration_ioc_process_incoming(QIOChannel *ioc)
>>  {
>>      MigrationIncomingState *mis = migration_incoming_get_current();
>>      Error *local_err = NULL;
>> @@ -966,10 +966,9 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>>           * issue is not possible.
>>           */
>>          ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
>> -                                          sizeof(channel_magic), errp);
>> -
>> +                                          sizeof(channel_magic), &local_err);
>>          if (ret != 0) {
>> -            return;
>> +            goto err;
>>          }
>>  
>>          default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
>> @@ -977,8 +976,8 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>>          default_channel = !mis->from_src_file;
>>      }
>>  
>> -    if (multifd_recv_setup(errp) != 0) {
>> -        return;
>> +    if (multifd_recv_setup(&local_err) != 0) {
>> +        goto err;
>>      }
>>  
>>      if (default_channel) {
>> @@ -995,18 +994,24 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
>>              postcopy_preempt_new_channel(mis, f);
>>          }
>>          if (local_err) {
>> -            error_propagate(errp, local_err);
>> -            return;
>> +            goto err;
>>          }
>>      }
>>  
>> -    if (migration_should_start_incoming(default_channel)) {
>> -        /* If it's a recovery, we're done */
>> -        if (postcopy_try_recover()) {
>> -            return;
>> -        }
>> +    if (migration_should_start_incoming(default_channel) &&
>> +        !postcopy_try_recover()) {
>>          migration_incoming_process();
>>      }
>> +
>> +    return;
>> +
>> +err:
>> +    error_report_err(local_err);
>> +    migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
>> +                      MIGRATION_STATUS_FAILED);
>> +    if (mis->exit_on_error) {
>> +        exit(EXIT_FAILURE);
>> +    }
>>  }
>>  
>>  /**
>> diff --git a/migration/migration.h b/migration/migration.h
>> index 0956e9274b..c367e5ea40 100644
>> --- a/migration/migration.h
>> +++ b/migration/migration.h
>> @@ -477,7 +477,7 @@ void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
>>                         MigrationStatus new_state);
>>  
>>  void migration_fd_process_incoming(QEMUFile *f);
>> -void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
>> +void migration_ioc_process_incoming(QIOChannel *ioc);
>>  void migration_incoming_process(void);
>>  
>>  bool  migration_has_all_channels(void);
>> -- 
>> 2.35.3
>>
diff mbox series

Patch

diff --git a/migration/channel.c b/migration/channel.c
index f9de064f3b..6d7f9172d8 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -40,13 +40,14 @@  void migration_channel_process_incoming(QIOChannel *ioc)
 
     if (migrate_channel_requires_tls_upgrade(ioc)) {
         migration_tls_channel_process_incoming(s, ioc, &local_err);
+
+        if (local_err) {
+            error_report_err(local_err);
+        }
+
     } else {
         migration_ioc_register_yank(ioc);
-        migration_ioc_process_incoming(ioc, &local_err);
-    }
-
-    if (local_err) {
-        error_report_err(local_err);
+        migration_ioc_process_incoming(ioc);
     }
 }
 
diff --git a/migration/migration.c b/migration/migration.c
index 8a61cc26d7..cd88ebc875 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -943,7 +943,7 @@  static bool migration_should_start_incoming(bool main_channel)
     return true;
 }
 
-void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
+void migration_ioc_process_incoming(QIOChannel *ioc)
 {
     MigrationIncomingState *mis = migration_incoming_get_current();
     Error *local_err = NULL;
@@ -966,10 +966,9 @@  void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
          * issue is not possible.
          */
         ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
-                                          sizeof(channel_magic), errp);
-
+                                          sizeof(channel_magic), &local_err);
         if (ret != 0) {
-            return;
+            goto err;
         }
 
         default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
@@ -977,8 +976,8 @@  void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
         default_channel = !mis->from_src_file;
     }
 
-    if (multifd_recv_setup(errp) != 0) {
-        return;
+    if (multifd_recv_setup(&local_err) != 0) {
+        goto err;
     }
 
     if (default_channel) {
@@ -995,18 +994,24 @@  void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
             postcopy_preempt_new_channel(mis, f);
         }
         if (local_err) {
-            error_propagate(errp, local_err);
-            return;
+            goto err;
         }
     }
 
-    if (migration_should_start_incoming(default_channel)) {
-        /* If it's a recovery, we're done */
-        if (postcopy_try_recover()) {
-            return;
-        }
+    if (migration_should_start_incoming(default_channel) &&
+        !postcopy_try_recover()) {
         migration_incoming_process();
     }
+
+    return;
+
+err:
+    error_report_err(local_err);
+    migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
+                      MIGRATION_STATUS_FAILED);
+    if (mis->exit_on_error) {
+        exit(EXIT_FAILURE);
+    }
 }
 
 /**
diff --git a/migration/migration.h b/migration/migration.h
index 0956e9274b..c367e5ea40 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -477,7 +477,7 @@  void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
                        MigrationStatus new_state);
 
 void migration_fd_process_incoming(QEMUFile *f);
-void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
+void migration_ioc_process_incoming(QIOChannel *ioc);
 void migration_incoming_process(void);
 
 bool  migration_has_all_channels(void);