diff mbox series

[V2,04/11] migration: stop vm earlier for cpr

Message ID 1719776434-435013-5-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New, archived
Headers show
Series Live update: cpr-exec | expand

Commit Message

Steven Sistare June 30, 2024, 7:40 p.m. UTC
Stop the vm earlier for cpr, to guarantee consistent device state when
CPR state is saved.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 migration/migration.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

Comments

Fabiano Rosas July 17, 2024, 6:59 p.m. UTC | #1
Steve Sistare <steven.sistare@oracle.com> writes:

> Stop the vm earlier for cpr, to guarantee consistent device state when
> CPR state is saved.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>  migration/migration.c | 22 +++++++++++++---------
>  1 file changed, 13 insertions(+), 9 deletions(-)
>
> diff --git a/migration/migration.c b/migration/migration.c
> index 0f47765..8a8e927 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2077,6 +2077,7 @@ void qmp_migrate(const char *uri, bool has_channels,
>      MigrationState *s = migrate_get_current();
>      g_autoptr(MigrationChannel) channel = NULL;
>      MigrationAddress *addr = NULL;
> +    bool stopped = false;
>  
>      /*
>       * Having preliminary checks for uri and channel
> @@ -2120,6 +2121,15 @@ void qmp_migrate(const char *uri, bool has_channels,
>          }
>      }
>  
> +    if (migrate_mode_is_cpr(s)) {
> +        int ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
> +        if (ret < 0) {
> +            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
> +            goto out;
> +        }
> +        stopped = true;
> +    }
> +
>      if (cpr_state_save(&local_err)) {
>          goto out;
>      }
> @@ -2155,6 +2165,9 @@ out:
>          }
>          migrate_fd_error(s, local_err);
>          error_propagate(errp, local_err);
> +        if (stopped && runstate_is_live(s->vm_old_state)) {
> +            vm_start();
> +        }

What about non-live states? Shouldn't this be:

if (stopped) {
   vm_resume();
}

>          return;
>      }
>  }
> @@ -3738,7 +3751,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>      Error *local_err = NULL;
>      uint64_t rate_limit;
>      bool resume = (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP);
> -    int ret;
>  
>      /*
>       * If there's a previous error, free it and prepare for another one.
> @@ -3810,14 +3822,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>          return;
>      }
>  
> -    if (migrate_mode_is_cpr(s)) {
> -        ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
> -        if (ret < 0) {
> -            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
> -            goto fail;
> -        }
> -    }
> -
>      if (migrate_background_snapshot()) {
>          qemu_thread_create(&s->thread, "mig/snapshot",
>                  bg_migration_thread, s, QEMU_THREAD_JOINABLE);
Steven Sistare July 20, 2024, 8 p.m. UTC | #2
On 7/17/2024 2:59 PM, Fabiano Rosas wrote:
> Steve Sistare <steven.sistare@oracle.com> writes:
> 
>> Stop the vm earlier for cpr, to guarantee consistent device state when
>> CPR state is saved.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>>   migration/migration.c | 22 +++++++++++++---------
>>   1 file changed, 13 insertions(+), 9 deletions(-)
>>
>> diff --git a/migration/migration.c b/migration/migration.c
>> index 0f47765..8a8e927 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -2077,6 +2077,7 @@ void qmp_migrate(const char *uri, bool has_channels,
>>       MigrationState *s = migrate_get_current();
>>       g_autoptr(MigrationChannel) channel = NULL;
>>       MigrationAddress *addr = NULL;
>> +    bool stopped = false;
>>   
>>       /*
>>        * Having preliminary checks for uri and channel
>> @@ -2120,6 +2121,15 @@ void qmp_migrate(const char *uri, bool has_channels,
>>           }
>>       }
>>   
>> +    if (migrate_mode_is_cpr(s)) {
>> +        int ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>> +        if (ret < 0) {
>> +            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>> +            goto out;
>> +        }
>> +        stopped = true;
>> +    }
>> +
>>       if (cpr_state_save(&local_err)) {
>>           goto out;
>>       }
>> @@ -2155,6 +2165,9 @@ out:
>>           }
>>           migrate_fd_error(s, local_err);
>>           error_propagate(errp, local_err);
>> +        if (stopped && runstate_is_live(s->vm_old_state)) {
>> +            vm_start();
>> +        }
> 
> What about non-live states? Shouldn't this be:
> 
> if (stopped) {
>     vm_resume();
> }

Not quite.  vm_old_state may be a stopped state, so we don't want to resume.
However, I should probably restore the old stopped state here.  I'll try some more
error recovery scenarios.

- Steve

> 
>>           return;
>>       }
>>   }
>> @@ -3738,7 +3751,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>       Error *local_err = NULL;
>>       uint64_t rate_limit;
>>       bool resume = (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP);
>> -    int ret;
>>   
>>       /*
>>        * If there's a previous error, free it and prepare for another one.
>> @@ -3810,14 +3822,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>           return;
>>       }
>>   
>> -    if (migrate_mode_is_cpr(s)) {
>> -        ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>> -        if (ret < 0) {
>> -            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>> -            goto fail;
>> -        }
>> -    }
>> -
>>       if (migrate_background_snapshot()) {
>>           qemu_thread_create(&s->thread, "mig/snapshot",
>>                   bg_migration_thread, s, QEMU_THREAD_JOINABLE);
Fabiano Rosas July 22, 2024, 1:42 p.m. UTC | #3
Steven Sistare <steven.sistare@oracle.com> writes:

> On 7/17/2024 2:59 PM, Fabiano Rosas wrote:
>> Steve Sistare <steven.sistare@oracle.com> writes:
>> 
>>> Stop the vm earlier for cpr, to guarantee consistent device state when
>>> CPR state is saved.
>>>
>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>> ---
>>>   migration/migration.c | 22 +++++++++++++---------
>>>   1 file changed, 13 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/migration/migration.c b/migration/migration.c
>>> index 0f47765..8a8e927 100644
>>> --- a/migration/migration.c
>>> +++ b/migration/migration.c
>>> @@ -2077,6 +2077,7 @@ void qmp_migrate(const char *uri, bool has_channels,
>>>       MigrationState *s = migrate_get_current();
>>>       g_autoptr(MigrationChannel) channel = NULL;
>>>       MigrationAddress *addr = NULL;
>>> +    bool stopped = false;
>>>   
>>>       /*
>>>        * Having preliminary checks for uri and channel
>>> @@ -2120,6 +2121,15 @@ void qmp_migrate(const char *uri, bool has_channels,
>>>           }
>>>       }
>>>   
>>> +    if (migrate_mode_is_cpr(s)) {
>>> +        int ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>>> +        if (ret < 0) {
>>> +            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>>> +            goto out;
>>> +        }
>>> +        stopped = true;
>>> +    }
>>> +
>>>       if (cpr_state_save(&local_err)) {
>>>           goto out;
>>>       }
>>> @@ -2155,6 +2165,9 @@ out:
>>>           }
>>>           migrate_fd_error(s, local_err);
>>>           error_propagate(errp, local_err);
>>> +        if (stopped && runstate_is_live(s->vm_old_state)) {
>>> +            vm_start();
>>> +        }
>> 
>> What about non-live states? Shouldn't this be:
>> 
>> if (stopped) {
>>     vm_resume();
>> }
>
> Not quite.  vm_old_state may be a stopped state, so we don't want to resume.
> However, I should probably restore the old stopped state here.  I'll try some more
> error recovery scenarios.

AIUI vm_resume() does the right thing already:

void vm_resume(RunState state)
{
    if (runstate_is_live(state)) {
        vm_start();
    } else {
        runstate_set(state);
    }
}

>
> - Steve
>
>> 
>>>           return;
>>>       }
>>>   }
>>> @@ -3738,7 +3751,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>>       Error *local_err = NULL;
>>>       uint64_t rate_limit;
>>>       bool resume = (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP);
>>> -    int ret;
>>>   
>>>       /*
>>>        * If there's a previous error, free it and prepare for another one.
>>> @@ -3810,14 +3822,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>>           return;
>>>       }
>>>   
>>> -    if (migrate_mode_is_cpr(s)) {
>>> -        ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>>> -        if (ret < 0) {
>>> -            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>>> -            goto fail;
>>> -        }
>>> -    }
>>> -
>>>       if (migrate_background_snapshot()) {
>>>           qemu_thread_create(&s->thread, "mig/snapshot",
>>>                   bg_migration_thread, s, QEMU_THREAD_JOINABLE);
Steven Sistare Aug. 6, 2024, 8:52 p.m. UTC | #4
On 7/22/2024 9:42 AM, Fabiano Rosas wrote:
> Steven Sistare <steven.sistare@oracle.com> writes:
> 
>> On 7/17/2024 2:59 PM, Fabiano Rosas wrote:
>>> Steve Sistare <steven.sistare@oracle.com> writes:
>>>
>>>> Stop the vm earlier for cpr, to guarantee consistent device state when
>>>> CPR state is saved.
>>>>
>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>> ---
>>>>    migration/migration.c | 22 +++++++++++++---------
>>>>    1 file changed, 13 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>> index 0f47765..8a8e927 100644
>>>> --- a/migration/migration.c
>>>> +++ b/migration/migration.c
>>>> @@ -2077,6 +2077,7 @@ void qmp_migrate(const char *uri, bool has_channels,
>>>>        MigrationState *s = migrate_get_current();
>>>>        g_autoptr(MigrationChannel) channel = NULL;
>>>>        MigrationAddress *addr = NULL;
>>>> +    bool stopped = false;
>>>>    
>>>>        /*
>>>>         * Having preliminary checks for uri and channel
>>>> @@ -2120,6 +2121,15 @@ void qmp_migrate(const char *uri, bool has_channels,
>>>>            }
>>>>        }
>>>>    
>>>> +    if (migrate_mode_is_cpr(s)) {
>>>> +        int ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>>>> +        if (ret < 0) {
>>>> +            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>>>> +            goto out;
>>>> +        }
>>>> +        stopped = true;
>>>> +    }
>>>> +
>>>>        if (cpr_state_save(&local_err)) {
>>>>            goto out;
>>>>        }
>>>> @@ -2155,6 +2165,9 @@ out:
>>>>            }
>>>>            migrate_fd_error(s, local_err);
>>>>            error_propagate(errp, local_err);
>>>> +        if (stopped && runstate_is_live(s->vm_old_state)) {
>>>> +            vm_start();
>>>> +        }
>>>
>>> What about non-live states? Shouldn't this be:
>>>
>>> if (stopped) {
>>>      vm_resume();
>>> }
>>
>> Not quite.  vm_old_state may be a stopped state, so we don't want to resume.
>> However, I should probably restore the old stopped state here.  I'll try some more
>> error recovery scenarios.
> 
> AIUI vm_resume() does the right thing already:
> 
> void vm_resume(RunState state)
> {
>      if (runstate_is_live(state)) {
>          vm_start();
>      } else {
>          runstate_set(state);
>      }
> }

Yes, thanks, I do need to set vm_old_state if not live.  It should be:

out:
     ...
     if (stopped) {
         vm_resume(s->vm_old_state);
     }

- Steve

>>>>            return;
>>>>        }
>>>>    }
>>>> @@ -3738,7 +3751,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>>>        Error *local_err = NULL;
>>>>        uint64_t rate_limit;
>>>>        bool resume = (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP);
>>>> -    int ret;
>>>>    
>>>>        /*
>>>>         * If there's a previous error, free it and prepare for another one.
>>>> @@ -3810,14 +3822,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>>>            return;
>>>>        }
>>>>    
>>>> -    if (migrate_mode_is_cpr(s)) {
>>>> -        ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>>>> -        if (ret < 0) {
>>>> -            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
>>>> -            goto fail;
>>>> -        }
>>>> -    }
>>>> -
>>>>        if (migrate_background_snapshot()) {
>>>>            qemu_thread_create(&s->thread, "mig/snapshot",
>>>>                    bg_migration_thread, s, QEMU_THREAD_JOINABLE);
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index 0f47765..8a8e927 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2077,6 +2077,7 @@  void qmp_migrate(const char *uri, bool has_channels,
     MigrationState *s = migrate_get_current();
     g_autoptr(MigrationChannel) channel = NULL;
     MigrationAddress *addr = NULL;
+    bool stopped = false;
 
     /*
      * Having preliminary checks for uri and channel
@@ -2120,6 +2121,15 @@  void qmp_migrate(const char *uri, bool has_channels,
         }
     }
 
+    if (migrate_mode_is_cpr(s)) {
+        int ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
+        if (ret < 0) {
+            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
+            goto out;
+        }
+        stopped = true;
+    }
+
     if (cpr_state_save(&local_err)) {
         goto out;
     }
@@ -2155,6 +2165,9 @@  out:
         }
         migrate_fd_error(s, local_err);
         error_propagate(errp, local_err);
+        if (stopped && runstate_is_live(s->vm_old_state)) {
+            vm_start();
+        }
         return;
     }
 }
@@ -3738,7 +3751,6 @@  void migrate_fd_connect(MigrationState *s, Error *error_in)
     Error *local_err = NULL;
     uint64_t rate_limit;
     bool resume = (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP);
-    int ret;
 
     /*
      * If there's a previous error, free it and prepare for another one.
@@ -3810,14 +3822,6 @@  void migrate_fd_connect(MigrationState *s, Error *error_in)
         return;
     }
 
-    if (migrate_mode_is_cpr(s)) {
-        ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
-        if (ret < 0) {
-            error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
-            goto fail;
-        }
-    }
-
     if (migrate_background_snapshot()) {
         qemu_thread_create(&s->thread, "mig/snapshot",
                 bg_migration_thread, s, QEMU_THREAD_JOINABLE);