diff mbox

[[RFC,WIP] v1] Keeping the Source side alive incase of network failure (Migration recover from network failure)

Message ID 1464721866-28275-1-git-send-email-haris.phnx@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Md Haris Iqbal May 31, 2016, 7:11 p.m. UTC
---
 include/migration/migration.h |  1 +
 migration/migration.c         | 41 ++++++++++++++++++++++++++++++++++++-----
 vl.c                          |  4 ++++
 3 files changed, 41 insertions(+), 5 deletions(-)

Comments

Dr. David Alan Gilbert June 1, 2016, 4:15 p.m. UTC | #1
* Md Haris Iqbal (haris.phnx@gmail.com) wrote:

Remember to add a more detailed comment about what the patch is doing.
(And possibly split it up a bit more)

> ---
>  include/migration/migration.h |  1 +
>  migration/migration.c         | 41 ++++++++++++++++++++++++++++++++++++-----
>  vl.c                          |  4 ++++
>  3 files changed, 41 insertions(+), 5 deletions(-)

> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index ac2c12c..33da695 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -325,6 +325,7 @@ void global_state_store_running(void);
>  void flush_page_queue(MigrationState *ms);
>  int ram_save_queue_pages(MigrationState *ms, const char *rbname,
>                           ram_addr_t start, ram_addr_t len);
> +int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
>  
>  PostcopyState postcopy_state_get(void);
>  /* Set the state and return the old state */
> diff --git a/migration/migration.c b/migration/migration.c
> index 991313a..ee0c2a8 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -539,6 +539,7 @@ static bool migration_is_setup_or_active(int state)
>      case MIGRATION_STATUS_ACTIVE:
>      case MIGRATION_STATUS_POSTCOPY_ACTIVE:
>      case MIGRATION_STATUS_SETUP:
> +    case MIGRATION_STATUS_POSTCOPY_RECOVERY:
>          return true;
>  
>      default:
> @@ -1634,6 +1635,8 @@ static void *migration_thread(void *opaque)
>      /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
>      enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
>  
> +    int32_t ret;

The return type of qemu_file_get_error is int not int32_t.

>      rcu_register_thread();
>  
>      qemu_savevm_state_header(s->to_dst_file);
> @@ -1700,11 +1703,26 @@ static void *migration_thread(void *opaque)
>              }
>          }
>  
> -        if (qemu_file_get_error(s->to_dst_file)) {
> -            migrate_set_state(&s->state, current_active_state,
> -                              MIGRATION_STATUS_FAILED);
> -            trace_migration_thread_file_err();
> -            break;
> +        if ((ret = qemu_file_get_error(s->to_dst_file))) {
> +            fprintf(stderr, "1 : Error %s %d\n", strerror(-ret), -ret);

Remember to clean those fprintf's out at the end; although it can be useful
to leave some trace_ calls in; so for example modify
the trace_migrate_thread_file_err to take an error number.

> +            if(ret != -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
> +                /* Network Failure during postcopy */

That probably needs commenting as to why it's not -EIO - since that's probably
what people might expect from a network error;  we'll have to keep an eye out
to see if that's realyl a reliable check.
(Also qemu normally puts a space after the 'if')

> +                current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;

That probably needs a migrate_set_state .

> +                runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
> +                fprintf(stderr, "1.1 : Error %s %d\n", strerror(-ret), -ret);
> +                ret = qemu_migrate_postcopy_outgoing_recovery(s);
> +                if(ret < 0) {
> +                    break;
> +                }
> +
> +            } else {
> +                migrate_set_state(&s->state, current_active_state,
> +                                             MIGRATION_STATUS_FAILED);
> +                fprintf(stderr, "1.2 : Error %s %d\n", strerror(-ret), -ret);
> +                trace_migration_thread_file_err();
> +                break;
> +            }
>          }
>          current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>          if (current_time >= initial_time + BUFFER_DELAY) {
> @@ -1797,6 +1815,19 @@ void migrate_fd_connect(MigrationState *s)
>      s->migration_thread_running = true;
>  }
>  
> +int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
> +{
> +    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> +                                  MIGRATION_STATUS_POSTCOPY_RECOVERY);
> +
> +    /* Code for network recovery to be added here */
> +    while(1) {
> +        fprintf(stderr, "Not letting it fail\n");
> +        sleep(2);
> +    }
> +
> +}
> +
>  PostcopyState  postcopy_state_get(void)
>  {
>      return atomic_mb_read(&incoming_postcopy_state);
> diff --git a/vl.c b/vl.c
> index 5fd22cb..c237140 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -618,6 +618,10 @@ static const RunStateTransition runstate_transitions_def[] = {
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
> +    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
> +
> +    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
> +    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
>  
>      { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
>      { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
> -- 
> 2.7.4

Try to keep your patches together in a patch series; I find
  git format-patch -n --cover-letter and then the range of patches to generate
produces the nices result and then use git send-email.

Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Md Haris Iqbal June 2, 2016, 8:57 a.m. UTC | #2
On Wed, Jun 1, 2016 at 9:45 PM, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
> * Md Haris Iqbal (haris.phnx@gmail.com) wrote:
>
> Remember to add a more detailed comment about what the patch is doing.

Sure. I will do that in for the upcoming patches.

> (And possibly split it up a bit more)

Done, I will split it according to files.

>
>> ---
>>  include/migration/migration.h |  1 +
>>  migration/migration.c         | 41 ++++++++++++++++++++++++++++++++++++-----
>>  vl.c                          |  4 ++++
>>  3 files changed, 41 insertions(+), 5 deletions(-)
>
>> diff --git a/include/migration/migration.h b/include/migration/migration.h
>> index ac2c12c..33da695 100644
>> --- a/include/migration/migration.h
>> +++ b/include/migration/migration.h
>> @@ -325,6 +325,7 @@ void global_state_store_running(void);
>>  void flush_page_queue(MigrationState *ms);
>>  int ram_save_queue_pages(MigrationState *ms, const char *rbname,
>>                           ram_addr_t start, ram_addr_t len);
>> +int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
>>
>>  PostcopyState postcopy_state_get(void);
>>  /* Set the state and return the old state */
>> diff --git a/migration/migration.c b/migration/migration.c
>> index 991313a..ee0c2a8 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -539,6 +539,7 @@ static bool migration_is_setup_or_active(int state)
>>      case MIGRATION_STATUS_ACTIVE:
>>      case MIGRATION_STATUS_POSTCOPY_ACTIVE:
>>      case MIGRATION_STATUS_SETUP:
>> +    case MIGRATION_STATUS_POSTCOPY_RECOVERY:
>>          return true;
>>
>>      default:
>> @@ -1634,6 +1635,8 @@ static void *migration_thread(void *opaque)
>>      /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
>>      enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
>>
>> +    int32_t ret;
>
> The return type of qemu_file_get_error is int not int32_t.

But, isn't signed int typedef'd to int32_t? And signed int is the same as int.

>
>>      rcu_register_thread();
>>
>>      qemu_savevm_state_header(s->to_dst_file);
>> @@ -1700,11 +1703,26 @@ static void *migration_thread(void *opaque)
>>              }
>>          }
>>
>> -        if (qemu_file_get_error(s->to_dst_file)) {
>> -            migrate_set_state(&s->state, current_active_state,
>> -                              MIGRATION_STATUS_FAILED);
>> -            trace_migration_thread_file_err();
>> -            break;
>> +        if ((ret = qemu_file_get_error(s->to_dst_file))) {
>> +            fprintf(stderr, "1 : Error %s %d\n", strerror(-ret), -ret);
>
> Remember to clean those fprintf's out at the end;
Yes, I will do that while preparing the final patch. These are good
for now, because I can see things happening when I run.

> although it can be useful
> to leave some trace_ calls in; so for example modify
> the trace_migrate_thread_file_err to take an error number.

Yes, I will add trace_ calls.

>
>> +            if(ret != -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
>> +                /* Network Failure during postcopy */
>
> That probably needs commenting as to why it's not -EIO - since that's probably
> what people might expect from a network error;  we'll have to keep an eye out
> to see if that's realyl a reliable check.
> (Also qemu normally puts a space after the 'if')

Done.

>
>> +                current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;
>
> That probably needs a migrate_set_state .
>
>> +                runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
>> +                fprintf(stderr, "1.1 : Error %s %d\n", strerror(-ret), -ret);
>> +                ret = qemu_migrate_postcopy_outgoing_recovery(s);
>> +                if(ret < 0) {
>> +                    break;
>> +                }
>> +
>> +            } else {
>> +                migrate_set_state(&s->state, current_active_state,
>> +                                             MIGRATION_STATUS_FAILED);
>> +                fprintf(stderr, "1.2 : Error %s %d\n", strerror(-ret), -ret);
>> +                trace_migration_thread_file_err();
>> +                break;
>> +            }
>>          }
>>          current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>>          if (current_time >= initial_time + BUFFER_DELAY) {
>> @@ -1797,6 +1815,19 @@ void migrate_fd_connect(MigrationState *s)
>>      s->migration_thread_running = true;
>>  }
>>
>> +int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
>> +{
>> +    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
>> +                                  MIGRATION_STATUS_POSTCOPY_RECOVERY);
>> +
>> +    /* Code for network recovery to be added here */
>> +    while(1) {
>> +        fprintf(stderr, "Not letting it fail\n");
>> +        sleep(2);
>> +    }
>> +
>> +}
>> +
>>  PostcopyState  postcopy_state_get(void)
>>  {
>>      return atomic_mb_read(&incoming_postcopy_state);
>> diff --git a/vl.c b/vl.c
>> index 5fd22cb..c237140 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -618,6 +618,10 @@ static const RunStateTransition runstate_transitions_def[] = {
>>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
>>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
>>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
>> +    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
>> +
>> +    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
>> +    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
>>
>>      { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
>>      { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
>> --
>> 2.7.4
>
> Try to keep your patches together in a patch series; I find
>   git format-patch -n --cover-letter and then the range of patches to generate
> produces the nices result and then use git send-email.

Ok. I will use this from next time.

>
> Dave
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox

Patch

diff --git a/include/migration/migration.h b/include/migration/migration.h
index ac2c12c..33da695 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -325,6 +325,7 @@  void global_state_store_running(void);
 void flush_page_queue(MigrationState *ms);
 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
                          ram_addr_t start, ram_addr_t len);
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
 
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
diff --git a/migration/migration.c b/migration/migration.c
index 991313a..ee0c2a8 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -539,6 +539,7 @@  static bool migration_is_setup_or_active(int state)
     case MIGRATION_STATUS_ACTIVE:
     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
     case MIGRATION_STATUS_SETUP:
+    case MIGRATION_STATUS_POSTCOPY_RECOVERY:
         return true;
 
     default:
@@ -1634,6 +1635,8 @@  static void *migration_thread(void *opaque)
     /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
     enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
 
+    int32_t ret;
+
     rcu_register_thread();
 
     qemu_savevm_state_header(s->to_dst_file);
@@ -1700,11 +1703,26 @@  static void *migration_thread(void *opaque)
             }
         }
 
-        if (qemu_file_get_error(s->to_dst_file)) {
-            migrate_set_state(&s->state, current_active_state,
-                              MIGRATION_STATUS_FAILED);
-            trace_migration_thread_file_err();
-            break;
+        if ((ret = qemu_file_get_error(s->to_dst_file))) {
+            fprintf(stderr, "1 : Error %s %d\n", strerror(-ret), -ret);
+            if(ret != -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+                /* Network Failure during postcopy */
+
+                current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;
+                runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
+                fprintf(stderr, "1.1 : Error %s %d\n", strerror(-ret), -ret);
+                ret = qemu_migrate_postcopy_outgoing_recovery(s);
+                if(ret < 0) {
+                    break;
+                }
+
+            } else {
+                migrate_set_state(&s->state, current_active_state,
+                                             MIGRATION_STATUS_FAILED);
+                fprintf(stderr, "1.2 : Error %s %d\n", strerror(-ret), -ret);
+                trace_migration_thread_file_err();
+                break;
+            }
         }
         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
         if (current_time >= initial_time + BUFFER_DELAY) {
@@ -1797,6 +1815,19 @@  void migrate_fd_connect(MigrationState *s)
     s->migration_thread_running = true;
 }
 
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
+{
+    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                                  MIGRATION_STATUS_POSTCOPY_RECOVERY);
+
+    /* Code for network recovery to be added here */
+    while(1) {
+        fprintf(stderr, "Not letting it fail\n");
+        sleep(2);
+    }
+
+}
+
 PostcopyState  postcopy_state_get(void)
 {
     return atomic_mb_read(&incoming_postcopy_state);
diff --git a/vl.c b/vl.c
index 5fd22cb..c237140 100644
--- a/vl.c
+++ b/vl.c
@@ -618,6 +618,10 @@  static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
+
+    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
+    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
 
     { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },