Message ID | 1501229198-30588-30-git-send-email-peterx@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
* Peter Xu (peterx@redhat.com) wrote: > Firstly, MigThrError enumeration is introduced to describe the error in > migration_detect_error() better. This gives the migration_thread() a > chance to know whether a recovery has happened. > > Then, if a recovery is detected, migration_thread() will reset its local > variables to prepare for that. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > migration/migration.c | 40 +++++++++++++++++++++++++++++----------- > 1 file changed, 29 insertions(+), 11 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index ecebe30..439bc22 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s) > return atomic_read(&s->start_postcopy) || s->start_postcopy_fast; > } > > +typedef enum MigThrError { > + /* No error detected */ > + MIG_THR_ERR_NONE = 0, > + /* Detected error, but resumed successfully */ > + MIG_THR_ERR_RECOVERED = 1, > + /* Detected fatal error, need to exit */ > + MIG_THR_ERR_FATAL = 2, > +} MigThrError; > + Could you move this patch earlier to when postcopy_pause is created so it's created with this enum? > static int postcopy_resume_handshake(MigrationState *s) > { > qemu_mutex_lock(&s->resume_lock); > @@ -2209,10 +2218,10 @@ static int postcopy_do_resume(MigrationState *s) > > /* > * We don't return until we are in a safe state to continue current > - * postcopy migration. Returns true to continue the migration, or > - * false to terminate current migration. > + * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or > + * MIG_THR_ERR_FATAL if unrecovery failure happened. > */ > -static bool postcopy_pause(MigrationState *s) > +static MigThrError postcopy_pause(MigrationState *s) > { > assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > > @@ -2247,7 +2256,7 @@ do_pause: > if (postcopy_do_resume(s) == 0) { > /* Let's continue! 
*/ > trace_postcopy_pause_continued(); > - return true; > + return MIG_THR_ERR_RECOVERED; > } else { > /* > * Something wrong happened during the recovery, let's > @@ -2258,12 +2267,11 @@ do_pause: > } > } else { > /* This is not right... Time to quit. */ > - return false; > + return MIG_THR_ERR_FATAL; > } > } > > -/* Return true if we want to stop the migration, otherwise false. */ > -static bool migration_detect_error(MigrationState *s) > +static MigThrError migration_detect_error(MigrationState *s) > { > int ret; > > @@ -2272,7 +2280,7 @@ static bool migration_detect_error(MigrationState *s) > > if (!ret) { > /* Everything is fine */ > - return false; > + return MIG_THR_ERR_NONE; > } > > if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) { > @@ -2281,7 +2289,7 @@ static bool migration_detect_error(MigrationState *s) > * while. After that, it can be continued by a > * recovery phase. > */ > - return !postcopy_pause(s); > + return postcopy_pause(s); > } else { > /* > * For precopy (or postcopy with error outside IO), we fail > @@ -2291,7 +2299,7 @@ static bool migration_detect_error(MigrationState *s) > trace_migration_thread_file_err(); > > /* Time to stop the migration, now. */ > - return true; > + return MIG_THR_ERR_FATAL; > } > } > > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque) > /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ > enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; > bool enable_colo = migrate_colo_enabled(); > + MigThrError thr_error; > > rcu_register_thread(); > > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque) > * Try to detect any kind of failures, and see whether we > * should stop the migration now. > */ > - if (migration_detect_error(s)) { > + thr_error = migration_detect_error(s); > + if (thr_error == MIG_THR_ERR_FATAL) { > + /* Stop migration */ > break; > + } else if (thr_error == MIG_THR_ERR_RECOVERED) { > + /* > + * Just recovered from a e.g. 
network failure, reset all > + * the local variables. > + */ > + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > + initial_bytes = 0; They don't seem that important to reset? Dave > } > > current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > -- > 2.7.4 > -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote: > * Peter Xu (peterx@redhat.com) wrote: > > Firstly, MigThrError enumeration is introduced to describe the error in > > migration_detect_error() better. This gives the migration_thread() a > > chance to know whether a recovery has happened. > > > > Then, if a recovery is detected, migration_thread() will reset its local > > variables to prepare for that. > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > --- > > migration/migration.c | 40 +++++++++++++++++++++++++++++----------- > > 1 file changed, 29 insertions(+), 11 deletions(-) > > > > diff --git a/migration/migration.c b/migration/migration.c > > index ecebe30..439bc22 100644 > > --- a/migration/migration.c > > +++ b/migration/migration.c > > @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s) > > return atomic_read(&s->start_postcopy) || s->start_postcopy_fast; > > } > > > > +typedef enum MigThrError { > > + /* No error detected */ > > + MIG_THR_ERR_NONE = 0, > > + /* Detected error, but resumed successfully */ > > + MIG_THR_ERR_RECOVERED = 1, > > + /* Detected fatal error, need to exit */ > > + MIG_THR_ERR_FATAL = 2, > > +} MigThrError; > > + > > Could you move this patch earlier to when postcopy_pause is created > so it's created with this enum? Sure. [...] > > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque) > > /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ > > enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; > > bool enable_colo = migrate_colo_enabled(); > > + MigThrError thr_error; > > > > rcu_register_thread(); > > > > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque) > > * Try to detect any kind of failures, and see whether we > > * should stop the migration now. 
> > */ > > - if (migration_detect_error(s)) { > > + thr_error = migration_detect_error(s); > > + if (thr_error == MIG_THR_ERR_FATAL) { > > + /* Stop migration */ > > break; > > + } else if (thr_error == MIG_THR_ERR_RECOVERED) { > > + /* > > + * Just recovered from a e.g. network failure, reset all > > + * the local variables. > > + */ > > + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > > + initial_bytes = 0; > > They don't seem that important to reset? The problem is that we have this in migration_thread(): if (current_time >= initial_time + BUFFER_DELAY) { uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) - initial_bytes; uint64_t time_spent = current_time - initial_time; double bandwidth = (double)transferred_bytes / time_spent; threshold_size = bandwidth * s->parameters.downtime_limit; ... } Here qemu_ftell() would possibly be very small since we have just resumed... and then transferred_bytes will be extremely huge, since "qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative and, being stored in a uint64_t, wraps around to a very large value... Then, with luck, we'll get an extremely huge "bandwidth" as well.
* Peter Xu (peterx@redhat.com) wrote: > On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote: > > * Peter Xu (peterx@redhat.com) wrote: > > > Firstly, MigThrError enumeration is introduced to describe the error in > > > migration_detect_error() better. This gives the migration_thread() a > > > chance to know whether a recovery has happened. > > > > > > Then, if a recovery is detected, migration_thread() will reset its local > > > variables to prepare for that. > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > > --- > > > migration/migration.c | 40 +++++++++++++++++++++++++++++----------- > > > 1 file changed, 29 insertions(+), 11 deletions(-) > > > > > > diff --git a/migration/migration.c b/migration/migration.c > > > index ecebe30..439bc22 100644 > > > --- a/migration/migration.c > > > +++ b/migration/migration.c > > > @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s) > > > return atomic_read(&s->start_postcopy) || s->start_postcopy_fast; > > > } > > > > > > +typedef enum MigThrError { > > > + /* No error detected */ > > > + MIG_THR_ERR_NONE = 0, > > > + /* Detected error, but resumed successfully */ > > > + MIG_THR_ERR_RECOVERED = 1, > > > + /* Detected fatal error, need to exit */ > > > + MIG_THR_ERR_FATAL = 2, > > > +} MigThrError; > > > + > > > > Could you move this patch earlier to when postcopy_pause is created > > so it's created with this enum? > > Sure. > > [...] > > > > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque) > > > /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ > > > enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; > > > bool enable_colo = migrate_colo_enabled(); > > > + MigThrError thr_error; > > > > > > rcu_register_thread(); > > > > > > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque) > > > * Try to detect any kind of failures, and see whether we > > > * should stop the migration now. 
> > > */ > > > - if (migration_detect_error(s)) { > > > + thr_error = migration_detect_error(s); > > > + if (thr_error == MIG_THR_ERR_FATAL) { > > > + /* Stop migration */ > > > break; > > > + } else if (thr_error == MIG_THR_ERR_RECOVERED) { > > > + /* > > > + * Just recovered from a e.g. network failure, reset all > > > + * the local variables. > > > + */ > > > + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > > > + initial_bytes = 0; > > > > They don't seem that important to reset? > > The problem is that we have this in migration_thread(): > > if (current_time >= initial_time + BUFFER_DELAY) { > uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) - > initial_bytes; > uint64_t time_spent = current_time - initial_time; > double bandwidth = (double)transferred_bytes / time_spent; > threshold_size = bandwidth * s->parameters.downtime_limit; > ... > } > > Here qemu_ftell() would possibly be very small since we have just > resumed... and then transferred_bytes will be extremely huge since > "qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative... > Then, with luck, we'll got extremely huge "bandwidth" as well. Ah yes that's a good reason to reset it then; add a comment like 'important to avoid breaking transferred_bytes and bandwidth calculation' Dave > -- > Peter Xu -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Fri, Aug 04, 2017 at 10:52:27AM +0100, Dr. David Alan Gilbert wrote: > * Peter Xu (peterx@redhat.com) wrote: > > On Thu, Aug 03, 2017 at 02:54:35PM +0100, Dr. David Alan Gilbert wrote: [...] > > > > @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque) > > > > /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ > > > > enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; > > > > bool enable_colo = migrate_colo_enabled(); > > > > + MigThrError thr_error; > > > > > > > > rcu_register_thread(); > > > > > > > > @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque) > > > > * Try to detect any kind of failures, and see whether we > > > > * should stop the migration now. > > > > */ > > > > - if (migration_detect_error(s)) { > > > > + thr_error = migration_detect_error(s); > > > > + if (thr_error == MIG_THR_ERR_FATAL) { > > > > + /* Stop migration */ > > > > break; > > > > + } else if (thr_error == MIG_THR_ERR_RECOVERED) { > > > > + /* > > > > + * Just recovered from a e.g. network failure, reset all > > > > + * the local variables. > > > > + */ > > > > + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > > > > + initial_bytes = 0; > > > > > > They don't seem that important to reset? > > > > The problem is that we have this in migration_thread(): > > > > if (current_time >= initial_time + BUFFER_DELAY) { > > uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) - > > initial_bytes; > > uint64_t time_spent = current_time - initial_time; > > double bandwidth = (double)transferred_bytes / time_spent; > > threshold_size = bandwidth * s->parameters.downtime_limit; > > ... > > } > > > > Here qemu_ftell() would possibly be very small since we have just > > resumed... and then transferred_bytes will be extremely huge since > > "qemu_ftell(s->to_dst_file) - initial_bytes" is actually negative... > > Then, with luck, we'll got extremely huge "bandwidth" as well. 
> > Ah yes that's a good reason to reset it then; add a comment like > 'important to avoid breaking transferred_bytes and bandwidth > calculation' Will do.
diff --git a/migration/migration.c b/migration/migration.c index ecebe30..439bc22 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2159,6 +2159,15 @@ static bool postcopy_should_start(MigrationState *s) return atomic_read(&s->start_postcopy) || s->start_postcopy_fast; } +typedef enum MigThrError { + /* No error detected */ + MIG_THR_ERR_NONE = 0, + /* Detected error, but resumed successfully */ + MIG_THR_ERR_RECOVERED = 1, + /* Detected fatal error, need to exit */ + MIG_THR_ERR_FATAL = 2, +} MigThrError; + static int postcopy_resume_handshake(MigrationState *s) { qemu_mutex_lock(&s->resume_lock); @@ -2209,10 +2218,10 @@ static int postcopy_do_resume(MigrationState *s) /* * We don't return until we are in a safe state to continue current - * postcopy migration. Returns true to continue the migration, or - * false to terminate current migration. + * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or + * MIG_THR_ERR_FATAL if unrecovery failure happened. */ -static bool postcopy_pause(MigrationState *s) +static MigThrError postcopy_pause(MigrationState *s) { assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); @@ -2247,7 +2256,7 @@ do_pause: if (postcopy_do_resume(s) == 0) { /* Let's continue! */ trace_postcopy_pause_continued(); - return true; + return MIG_THR_ERR_RECOVERED; } else { /* * Something wrong happened during the recovery, let's @@ -2258,12 +2267,11 @@ do_pause: } } else { /* This is not right... Time to quit. */ - return false; + return MIG_THR_ERR_FATAL; } } -/* Return true if we want to stop the migration, otherwise false. 
*/ -static bool migration_detect_error(MigrationState *s) +static MigThrError migration_detect_error(MigrationState *s) { int ret; @@ -2272,7 +2280,7 @@ static bool migration_detect_error(MigrationState *s) if (!ret) { /* Everything is fine */ - return false; + return MIG_THR_ERR_NONE; } if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) { @@ -2281,7 +2289,7 @@ static bool migration_detect_error(MigrationState *s) * while. After that, it can be continued by a * recovery phase. */ - return !postcopy_pause(s); + return postcopy_pause(s); } else { /* * For precopy (or postcopy with error outside IO), we fail @@ -2291,7 +2299,7 @@ static bool migration_detect_error(MigrationState *s) trace_migration_thread_file_err(); /* Time to stop the migration, now. */ - return true; + return MIG_THR_ERR_FATAL; } } @@ -2319,6 +2327,7 @@ static void *migration_thread(void *opaque) /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; bool enable_colo = migrate_colo_enabled(); + MigThrError thr_error; rcu_register_thread(); @@ -2395,8 +2404,17 @@ static void *migration_thread(void *opaque) * Try to detect any kind of failures, and see whether we * should stop the migration now. */ - if (migration_detect_error(s)) { + thr_error = migration_detect_error(s); + if (thr_error == MIG_THR_ERR_FATAL) { + /* Stop migration */ break; + } else if (thr_error == MIG_THR_ERR_RECOVERED) { + /* + * Just recovered from a e.g. network failure, reset all + * the local variables. + */ + initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + initial_bytes = 0; } current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
Firstly, the MigThrError enumeration is introduced to better describe the error returned by migration_detect_error(). This gives migration_thread() a chance to know whether a recovery has happened. Then, if a recovery is detected, migration_thread() resets its local variables (initial_time and initial_bytes) to prepare for that. Signed-off-by: Peter Xu <peterx@redhat.com> --- migration/migration.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-)