diff mbox series

[v1,1/1] migration: Fix yank on postcopy multifd crashing guest after migration

Message ID 20221109055629.789795-1-leobras@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v1,1/1] migration: Fix yank on postcopy multifd crashing guest after migration | expand

Commit Message

Leonardo Bras Nov. 9, 2022, 5:56 a.m. UTC
When multifd and postcopy-ram capabilities are enabled, if a
migrate-start-postcopy is attempted, the migration will finish sending the
memory pages and then crash with the following error:

qemu-system-x86_64: ../util/yank.c:107: yank_unregister_instance: Assertion
`QLIST_EMPTY(&entry->yankfns)' failed.

This happens because even though all multifd channels could
yank_register_function(), none of them could unregister it before
unregistering the MIGRATION_YANK_INSTANCE, causing the assert to fail.

Fix that by calling multifd_load_cleanup() on postcopy_ram_listen_thread()
before MIGRATION_YANK_INSTANCE is unregistered.

Fixes: b5eea99ec2 ("migration: Add yank feature")
Reported-by: Li Xiaohui <xiaohli@redhat.com>
Signed-off-by: Leonardo Bras <leobras@redhat.com>
---
 migration/migration.h |  1 +
 migration/migration.c | 18 +++++++++++++-----
 migration/savevm.c    |  2 ++
 3 files changed, 16 insertions(+), 5 deletions(-)

Comments

Dr. David Alan Gilbert Nov. 9, 2022, 1:31 p.m. UTC | #1
* Leonardo Bras (leobras@redhat.com) wrote:
> When multifd and postcopy-ram capabilities are enabled, if a
> migrate-start-postcopy is attempted, the migration will finish sending the
> memory pages and then crash with the following error:

How does that happen? Isn't multifd+postcopy still disabled, I see in 
migrate_caps_check

    if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
    ....
        if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
            error_setg(errp, "Postcopy is not yet compatible with multifd");
            return false;
        }
    }


Dave

> qemu-system-x86_64: ../util/yank.c:107: yank_unregister_instance: Assertion
> `QLIST_EMPTY(&entry->yankfns)' failed.
> 
> This happens because even though all multifd channels could
> yank_register_function(), none of them could unregister it before
> unregistering the MIGRATION_YANK_INSTANCE, causing the assert to fail.
> 
> Fix that by calling multifd_load_cleanup() on postcopy_ram_listen_thread()
> before MIGRATION_YANK_INSTANCE is unregistered.
> 
> Fixes: b5eea99ec2 ("migration: Add yank feature")
> Reported-by: Li Xiaohui <xiaohli@redhat.com>
> Signed-off-by: Leonardo Bras <leobras@redhat.com>
> ---
>  migration/migration.h |  1 +
>  migration/migration.c | 18 +++++++++++++-----
>  migration/savevm.c    |  2 ++
>  3 files changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/migration/migration.h b/migration/migration.h
> index cdad8aceaa..240f64efb0 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -473,6 +473,7 @@ void migration_make_urgent_request(void);
>  void migration_consume_urgent_request(void);
>  bool migration_rate_limit(void);
>  void migration_cancel(const Error *error);
> +bool migration_load_cleanup(void);
>  
>  void populate_vfio_info(MigrationInfo *info);
>  void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
> diff --git a/migration/migration.c b/migration/migration.c
> index 739bb683f3..4f363b2a95 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -486,6 +486,17 @@ void migrate_add_address(SocketAddress *address)
>                        QAPI_CLONE(SocketAddress, address));
>  }
>  
> +bool migration_load_cleanup(void)
> +{
> +    Error *local_err = NULL;
> +
> +    if (multifd_load_cleanup(&local_err)) {
> +        error_report_err(local_err);
> +        return true;
> +    }
> +    return false;
> +}
> +
>  static void qemu_start_incoming_migration(const char *uri, Error **errp)
>  {
>      const char *p = NULL;
> @@ -540,8 +551,7 @@ static void process_incoming_migration_bh(void *opaque)
>       */
>      qemu_announce_self(&mis->announce_timer, migrate_announce_params());
>  
> -    if (multifd_load_cleanup(&local_err) != 0) {
> -        error_report_err(local_err);
> +    if (migration_load_cleanup()) {
>          autostart = false;
>      }
>      /* If global state section was not received or we are in running
> @@ -646,9 +656,7 @@ fail:
>      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                        MIGRATION_STATUS_FAILED);
>      qemu_fclose(mis->from_src_file);
> -    if (multifd_load_cleanup(&local_err) != 0) {
> -        error_report_err(local_err);
> -    }
> +    migration_load_cleanup();
>      exit(EXIT_FAILURE);
>  }
>  
> diff --git a/migration/savevm.c b/migration/savevm.c
> index a0cdb714f7..250caff7f4 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1889,6 +1889,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
>          exit(EXIT_FAILURE);
>      }
>  
> +    migration_load_cleanup();
> +
>      migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
>                                     MIGRATION_STATUS_COMPLETED);
>      /*
> -- 
> 2.38.1
>
Leonardo Bras Nov. 9, 2022, 4:59 p.m. UTC | #2
On Wed, Nov 9, 2022 at 10:31 AM Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
>
> * Leonardo Bras (leobras@redhat.com) wrote:
> > When multifd and postcopy-ram capabilities are enabled, if a
> > migrate-start-postcopy is attempted, the migration will finish sending the
> > memory pages and then crash with the following error:
>
> How does that happen? Isn't multifd+postcopy still disabled, I see in
> migrate_caps_check
>
>     if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
>     ....
>         if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
>             error_setg(errp, "Postcopy is not yet compatible with multifd");
>             return false;
>         }
>     }
>

I can't see this happening in upstream code (v7.2.0-rc0). Could you
please tell me the lines where this happens?

I mean, I see cap_list[MIGRATION_CAPABILITY_MULTIFD] and
cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM] in migrate_caps_check()
but I can't see them nested like this, so I am probably missing
something.

This procedure to reproduce was shared by Xiaohui Li (I added a few tweaks):

1.Boot a guest with any qemu command on source host;
2.Boot a guest with same qemu command but append '-incoming defer' on
destination host;
3.Enable multifd and postcopy capabilities on src and dst hosts:
{"execute":"migrate-set-capabilities","arguments":{"capabilities":[{"capability":"multifd","state":true}]}}
{"execute":"migrate-set-capabilities","arguments":{"capabilities":[{"capability":"postcopy-ram","state":true}]}}
4.During migration is active, switch to postcopy mode:
{"execute":"migrate-start-postcopy"}

Best regards,
Leo


>
> Dave
>
> > qemu-system-x86_64: ../util/yank.c:107: yank_unregister_instance: Assertion
> > `QLIST_EMPTY(&entry->yankfns)' failed.
> >
> > This happens because even though all multifd channels could
> > yank_register_function(), none of them could unregister it before
> > unregistering the MIGRATION_YANK_INSTANCE, causing the assert to fail.
> >
> > Fix that by calling multifd_load_cleanup() on postcopy_ram_listen_thread()
> > before MIGRATION_YANK_INSTANCE is unregistered.
> >
> > Fixes: b5eea99ec2 ("migration: Add yank feature")
> > Reported-by: Li Xiaohui <xiaohli@redhat.com>
> > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > ---
> >  migration/migration.h |  1 +
> >  migration/migration.c | 18 +++++++++++++-----
> >  migration/savevm.c    |  2 ++
> >  3 files changed, 16 insertions(+), 5 deletions(-)
> >
> > diff --git a/migration/migration.h b/migration/migration.h
> > index cdad8aceaa..240f64efb0 100644
> > --- a/migration/migration.h
> > +++ b/migration/migration.h
> > @@ -473,6 +473,7 @@ void migration_make_urgent_request(void);
> >  void migration_consume_urgent_request(void);
> >  bool migration_rate_limit(void);
> >  void migration_cancel(const Error *error);
> > +bool migration_load_cleanup(void);
> >
> >  void populate_vfio_info(MigrationInfo *info);
> >  void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 739bb683f3..4f363b2a95 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -486,6 +486,17 @@ void migrate_add_address(SocketAddress *address)
> >                        QAPI_CLONE(SocketAddress, address));
> >  }
> >
> > +bool migration_load_cleanup(void)
> > +{
> > +    Error *local_err = NULL;
> > +
> > +    if (multifd_load_cleanup(&local_err)) {
> > +        error_report_err(local_err);
> > +        return true;
> > +    }
> > +    return false;
> > +}
> > +
> >  static void qemu_start_incoming_migration(const char *uri, Error **errp)
> >  {
> >      const char *p = NULL;
> > @@ -540,8 +551,7 @@ static void process_incoming_migration_bh(void *opaque)
> >       */
> >      qemu_announce_self(&mis->announce_timer, migrate_announce_params());
> >
> > -    if (multifd_load_cleanup(&local_err) != 0) {
> > -        error_report_err(local_err);
> > +    if (migration_load_cleanup()) {
> >          autostart = false;
> >      }
> >      /* If global state section was not received or we are in running
> > @@ -646,9 +656,7 @@ fail:
> >      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> >                        MIGRATION_STATUS_FAILED);
> >      qemu_fclose(mis->from_src_file);
> > -    if (multifd_load_cleanup(&local_err) != 0) {
> > -        error_report_err(local_err);
> > -    }
> > +    migration_load_cleanup();
> >      exit(EXIT_FAILURE);
> >  }
> >
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index a0cdb714f7..250caff7f4 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -1889,6 +1889,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
> >          exit(EXIT_FAILURE);
> >      }
> >
> > +    migration_load_cleanup();
> > +
> >      migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> >                                     MIGRATION_STATUS_COMPLETED);
> >      /*
> > --
> > 2.38.1
> >
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Juan Quintela Nov. 10, 2022, 1:47 p.m. UTC | #3
Leonardo Bras <leobras@redhat.com> wrote:
> When multifd and postcopy-ram capabilities are enabled, if a
> migrate-start-postcopy is attempted, the migration will finish sending the
> memory pages and then crash with the following error:
>
> qemu-system-x86_64: ../util/yank.c:107: yank_unregister_instance: Assertion
> `QLIST_EMPTY(&entry->yankfns)' failed.
>
> This happens because even though all multifd channels could
> yank_register_function(), none of them could unregister it before
> unregistering the MIGRATION_YANK_INSTANCE, causing the assert to fail.
>
> Fix that by calling multifd_load_cleanup() on postcopy_ram_listen_thread()
> before MIGRATION_YANK_INSTANCE is unregistered.

Hi

One question,
What guarantees that migration_load_cleanup() is not called twice?

I can't see anything that provides that here?  Or does postcopy have
never done the cleanup of multifd channels before?

Later, Juan.


> Fixes: b5eea99ec2 ("migration: Add yank feature")
> Reported-by: Li Xiaohui <xiaohli@redhat.com>
> Signed-off-by: Leonardo Bras <leobras@redhat.com>
> ---
>  migration/migration.h |  1 +
>  migration/migration.c | 18 +++++++++++++-----
>  migration/savevm.c    |  2 ++
>  3 files changed, 16 insertions(+), 5 deletions(-)
>
> diff --git a/migration/migration.h b/migration/migration.h
> index cdad8aceaa..240f64efb0 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -473,6 +473,7 @@ void migration_make_urgent_request(void);
>  void migration_consume_urgent_request(void);
>  bool migration_rate_limit(void);
>  void migration_cancel(const Error *error);
> +bool migration_load_cleanup(void);
>  
>  void populate_vfio_info(MigrationInfo *info);
>  void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
> diff --git a/migration/migration.c b/migration/migration.c
> index 739bb683f3..4f363b2a95 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -486,6 +486,17 @@ void migrate_add_address(SocketAddress *address)
>                        QAPI_CLONE(SocketAddress, address));
>  }
>  
> +bool migration_load_cleanup(void)
> +{
> +    Error *local_err = NULL;
> +
> +    if (multifd_load_cleanup(&local_err)) {
> +        error_report_err(local_err);
> +        return true;
> +    }
> +    return false;
> +}
> +
>  static void qemu_start_incoming_migration(const char *uri, Error **errp)
>  {
>      const char *p = NULL;
> @@ -540,8 +551,7 @@ static void process_incoming_migration_bh(void *opaque)
>       */
>      qemu_announce_self(&mis->announce_timer, migrate_announce_params());
>  
> -    if (multifd_load_cleanup(&local_err) != 0) {
> -        error_report_err(local_err);
> +    if (migration_load_cleanup()) {
>          autostart = false;
>      }
>      /* If global state section was not received or we are in running
> @@ -646,9 +656,7 @@ fail:
>      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                        MIGRATION_STATUS_FAILED);
>      qemu_fclose(mis->from_src_file);
> -    if (multifd_load_cleanup(&local_err) != 0) {
> -        error_report_err(local_err);
> -    }
> +    migration_load_cleanup();
>      exit(EXIT_FAILURE);
>  }
>  
> diff --git a/migration/savevm.c b/migration/savevm.c
> index a0cdb714f7..250caff7f4 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1889,6 +1889,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
>          exit(EXIT_FAILURE);
>      }
>  
> +    migration_load_cleanup();
> +

This addition is the one that I don't understand why it was not
needed/done before.

>      migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
>                                     MIGRATION_STATUS_COMPLETED);
>      /*

Later, Juan.
Leonardo Bras Nov. 15, 2022, 2:32 a.m. UTC | #4
On Thu, Nov 10, 2022 at 10:48 AM Juan Quintela <quintela@redhat.com> wrote:
>
> Leonardo Bras <leobras@redhat.com> wrote:
> > When multifd and postcopy-ram capabilities are enabled, if a
> > migrate-start-postcopy is attempted, the migration will finish sending the
> > memory pages and then crash with the following error:
> >
> > qemu-system-x86_64: ../util/yank.c:107: yank_unregister_instance: Assertion
> > `QLIST_EMPTY(&entry->yankfns)' failed.
> >
> > This happens because even though all multifd channels could
> > yank_register_function(), none of them could unregister it before
> > unregistering the MIGRATION_YANK_INSTANCE, causing the assert to fail.
> >
> > Fix that by calling multifd_load_cleanup() on postcopy_ram_listen_thread()
> > before MIGRATION_YANK_INSTANCE is unregistered.
>
> Hi
>
> One question,
> What guarantees that migration_load_cleanup() is not called twice?
>
> I can't see anything that provides that here?  Or does postcopy have
> never done the cleanup of multifd channels before?

IIUC, postcopy is not doing multifd cleanup for a while, at least
since 6.0.0-rc2.
That is as far as I went back testing, and by fixing other (build)
bugs, I could get the yank to abort the target qemu after the
migration finished on multifd + postcopy scenario.


>
> Later, Juan.
>
>
> > Fixes: b5eea99ec2 ("migration: Add yank feature")
> > Reported-by: Li Xiaohui <xiaohli@redhat.com>
> > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > ---
> >  migration/migration.h |  1 +
> >  migration/migration.c | 18 +++++++++++++-----
> >  migration/savevm.c    |  2 ++
> >  3 files changed, 16 insertions(+), 5 deletions(-)
> >
> > diff --git a/migration/migration.h b/migration/migration.h
> > index cdad8aceaa..240f64efb0 100644
> > --- a/migration/migration.h
> > +++ b/migration/migration.h
> > @@ -473,6 +473,7 @@ void migration_make_urgent_request(void);
> >  void migration_consume_urgent_request(void);
> >  bool migration_rate_limit(void);
> >  void migration_cancel(const Error *error);
> > +bool migration_load_cleanup(void);
> >
> >  void populate_vfio_info(MigrationInfo *info);
> >  void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 739bb683f3..4f363b2a95 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -486,6 +486,17 @@ void migrate_add_address(SocketAddress *address)
> >                        QAPI_CLONE(SocketAddress, address));
> >  }
> >
> > +bool migration_load_cleanup(void)
> > +{
> > +    Error *local_err = NULL;
> > +
> > +    if (multifd_load_cleanup(&local_err)) {
> > +        error_report_err(local_err);
> > +        return true;
> > +    }
> > +    return false;
> > +}
> > +
> >  static void qemu_start_incoming_migration(const char *uri, Error **errp)
> >  {
> >      const char *p = NULL;
> > @@ -540,8 +551,7 @@ static void process_incoming_migration_bh(void *opaque)
> >       */
> >      qemu_announce_self(&mis->announce_timer, migrate_announce_params());
> >
> > -    if (multifd_load_cleanup(&local_err) != 0) {
> > -        error_report_err(local_err);
> > +    if (migration_load_cleanup()) {
> >          autostart = false;
> >      }
> >      /* If global state section was not received or we are in running
> > @@ -646,9 +656,7 @@ fail:
> >      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> >                        MIGRATION_STATUS_FAILED);
> >      qemu_fclose(mis->from_src_file);
> > -    if (multifd_load_cleanup(&local_err) != 0) {
> > -        error_report_err(local_err);
> > -    }
> > +    migration_load_cleanup();
> >      exit(EXIT_FAILURE);
> >  }
> >
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index a0cdb714f7..250caff7f4 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -1889,6 +1889,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
> >          exit(EXIT_FAILURE);
> >      }
> >
> > +    migration_load_cleanup();
> > +
>
> This addition is the one that I don't understand why it was not
> needed/done before.

Please see the above comment, but tl;dr, it was not done before.


Thank you for reviewing,
Leo

>
> >      migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> >                                     MIGRATION_STATUS_COMPLETED);
> >      /*
>
> Later, Juan.
>
Peter Xu Nov. 24, 2022, 4:04 p.m. UTC | #5
On Wed, Nov 09, 2022 at 02:56:29AM -0300, Leonardo Bras wrote:
> diff --git a/migration/savevm.c b/migration/savevm.c
> index a0cdb714f7..250caff7f4 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1889,6 +1889,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
>          exit(EXIT_FAILURE);
>      }
>  
> +    migration_load_cleanup();

It's a bit weird to call multifd-load-clean in a listen phase..

How about moving it right above
trace_process_incoming_migration_co_postcopy_end_main()?  Then the new
helper can also be static.

> +
>      migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
>                                     MIGRATION_STATUS_COMPLETED);
>      /*
> -- 
> 2.38.1
>
Leonardo Bras Nov. 29, 2022, 8:28 p.m. UTC | #6
Hello Peter,

On Thu, Nov 24, 2022 at 1:04 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, Nov 09, 2022 at 02:56:29AM -0300, Leonardo Bras wrote:
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index a0cdb714f7..250caff7f4 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -1889,6 +1889,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
> >          exit(EXIT_FAILURE);
> >      }
> >
> > +    migration_load_cleanup();
>
> It's a bit weird to call multifd-load-clean in a listen phase..

I agree.

>
> How about moving it right above
> trace_process_incoming_migration_co_postcopy_end_main()?  Then the new
> helper can also be static.

Seems a nice Idea to have this function to be static.

We have to guarantee this is run after the migration finished, but
before migration_incoming_state_destroy().

You suggested calling it right above of
trace_process_incoming_migration_co_postcopy_end_main(), which git
grep pointed me to an if clause in process_incoming_migration_co().
If I got the location correctly, it would not help: this coroutine is
run just after the VM went to the target host, and not when the
migration finished.

If we are using multifd channels, this will break the migration with
segmentation fault (SIGSEGV), since the channels have not finished
sending yet.

Best regards,
Leo




>
> > +
> >      migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> >                                     MIGRATION_STATUS_COMPLETED);
> >      /*
> > --
> > 2.38.1
> >
>
> --
> Peter Xu
>
Peter Xu Nov. 29, 2022, 8:50 p.m. UTC | #7
On Tue, Nov 29, 2022 at 05:28:26PM -0300, Leonardo Bras Soares Passos wrote:
> Hello Peter,

Leo,

> 
> On Thu, Nov 24, 2022 at 1:04 PM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Wed, Nov 09, 2022 at 02:56:29AM -0300, Leonardo Bras wrote:
> > > diff --git a/migration/savevm.c b/migration/savevm.c
> > > index a0cdb714f7..250caff7f4 100644
> > > --- a/migration/savevm.c
> > > +++ b/migration/savevm.c
> > > @@ -1889,6 +1889,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
> > >          exit(EXIT_FAILURE);
> > >      }
> > >
> > > +    migration_load_cleanup();
> >
> > It's a bit weird to call multifd-load-clean in a listen phase..
> 
> I agree.
> 
> >
> > How about moving it right above
> > trace_process_incoming_migration_co_postcopy_end_main()?  Then the new
> > helper can also be static.
> 
> Seems a nice Idea to have this function to be static.
> 
> We have to guarantee this is run after the migration finished, but
> before migration_incoming_state_destroy().

IIUC it doesn't need to be when migration finished.  It should be fine as
long as we finished precopy phase, and that's what the migration coroutine
does, iiuc.  The thing is postcopy doesn't use multifd at all, so logically
it can be released before postcopy starts.

Actually, IMHO it'll be safer to do it like that, just to make sure we
won't accidentally receive multifd pages _after_ postcopy starts, because
that'll be another more severe and hard to debug issue since the guest can
see partial copied pages from multifd recv channels.

> 
> You suggested calling it right above of
> trace_process_incoming_migration_co_postcopy_end_main(), which git
> grep pointed me to an if clause in process_incoming_migration_co().
> If I got the location correctly, it would not help: this coroutine is
> ran just after the VM went to the target host, and not when the
> migration finished.
> 
> If we are using multifd channels, this will break the migration with
> segmentation fault (SIGSEGV), since the channels have not finished
> sending yet.

If this happens, then I had a feeling that there's something else that
needs syncs.  As I discussed above, we should make sure multifd pages all
landed before we start vcpu threads.

Said that, now I think I'm not against your original proposal to fix this
immediate crash.  However I am still wondering whether we really should
disable multifd with postcopy, as there seem to be still a few missing
pieces even to enable multifd during precopy-only.

Thanks,
Leonardo Bras Feb. 9, 2023, 4:14 a.m. UTC | #8
On Tue, 2022-11-29 at 15:50 -0500, Peter Xu wrote:
> On Tue, Nov 29, 2022 at 05:28:26PM -0300, Leonardo Bras Soares Passos wrote:
> > Hello Peter,
> 
> Leo,
> 
> > 
> > On Thu, Nov 24, 2022 at 1:04 PM Peter Xu <peterx@redhat.com> wrote:
> > > 
> > > On Wed, Nov 09, 2022 at 02:56:29AM -0300, Leonardo Bras wrote:
> > > > diff --git a/migration/savevm.c b/migration/savevm.c
> > > > index a0cdb714f7..250caff7f4 100644
> > > > --- a/migration/savevm.c
> > > > +++ b/migration/savevm.c
> > > > @@ -1889,6 +1889,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
> > > >          exit(EXIT_FAILURE);
> > > >      }
> > > > 
> > > > +    migration_load_cleanup();
> > > 
> > > It's a bit weird to call multifd-load-clean in a listen phase..
> > 
> > I agree.
> > 
> > > 
> > > How about moving it right above
> > > trace_process_incoming_migration_co_postcopy_end_main()?  Then the new
> > > helper can also be static.
> > 
> > Seems a nice Idea to have this function to be static.
> > 
> > We have to guarantee this is run after the migration finished, but
> > before migration_incoming_state_destroy().
> 
> IIUC it doesn't need to be when migration finished.  It should be fine as
> long as we finished precopy phase, and that's what the migration coroutine
> does, iiuc.  The thing is postcopy doesn't use multifd at all, so logically
> it can be released before postcopy starts.
> 
> Actually, IMHO it'll be safer to do it like that, just to make sure we
> won't accidentally receive multifd pages _after_ postcopy starts, because
> that'll be another more severe and hard to debug issue since the guest can
> see partial copied pages from multifd recv channels.
> 
> > 
> > You suggested calling it right above of
> > trace_process_incoming_migration_co_postcopy_end_main(), which git
> > grep pointed me to an if clause in process_incoming_migration_co().
> > If I got the location correctly, it would not help: this coroutine is
> > ran just after the VM went to the target host, and not when the
> > migration finished.
> > 
> > If we are using multifd channels, this will break the migration with
> > segmentation fault (SIGSEGV), since the channels have not finished
> > sending yet.
> 
> If this happens, then I had a feeling that there's something else that
> needs syncs.  As I discussed above, we should make sure multifd pages all
> landed before we start vcpu threads.
> 
> Said that, now I think I'm not against your original proposal to fix this
> immediate crash.  However I am still wondering whether we really should
> disable multifd with postcopy, as there seem to be still a few missing
> pieces even to enable multifd during precopy-only.
> 
> Thanks,
> 


I got side-tracked on this issue.

Is there any patch disabling multifd + postcopy, or would it be fine to go back
working on a V2 for this one?

Best regards,
Leo
Peter Xu Feb. 9, 2023, 2:22 p.m. UTC | #9
On Thu, Feb 09, 2023 at 01:14:12AM -0300, Leonardo Brás wrote:
> I got side-tracked on this issue.
> 
> Is there any patch disabling multifd + postcopy, or would it be fine to go back
> working on a V2 for this one?

IMHO it'll always make sense to post a new version for the immediate crash.

Personally I still think we should disable the two features being present
together until the full solution, but that can be discussed separately.

Thanks,
diff mbox series

Patch

diff --git a/migration/migration.h b/migration/migration.h
index cdad8aceaa..240f64efb0 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -473,6 +473,7 @@  void migration_make_urgent_request(void);
 void migration_consume_urgent_request(void);
 bool migration_rate_limit(void);
 void migration_cancel(const Error *error);
+bool migration_load_cleanup(void);
 
 void populate_vfio_info(MigrationInfo *info);
 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..4f363b2a95 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -486,6 +486,17 @@  void migrate_add_address(SocketAddress *address)
                       QAPI_CLONE(SocketAddress, address));
 }
 
+bool migration_load_cleanup(void)
+{
+    Error *local_err = NULL;
+
+    if (multifd_load_cleanup(&local_err)) {
+        error_report_err(local_err);
+        return true;
+    }
+    return false;
+}
+
 static void qemu_start_incoming_migration(const char *uri, Error **errp)
 {
     const char *p = NULL;
@@ -540,8 +551,7 @@  static void process_incoming_migration_bh(void *opaque)
      */
     qemu_announce_self(&mis->announce_timer, migrate_announce_params());
 
-    if (multifd_load_cleanup(&local_err) != 0) {
-        error_report_err(local_err);
+    if (migration_load_cleanup()) {
         autostart = false;
     }
     /* If global state section was not received or we are in running
@@ -646,9 +656,7 @@  fail:
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_FAILED);
     qemu_fclose(mis->from_src_file);
-    if (multifd_load_cleanup(&local_err) != 0) {
-        error_report_err(local_err);
-    }
+    migration_load_cleanup();
     exit(EXIT_FAILURE);
 }
 
diff --git a/migration/savevm.c b/migration/savevm.c
index a0cdb714f7..250caff7f4 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1889,6 +1889,8 @@  static void *postcopy_ram_listen_thread(void *opaque)
         exit(EXIT_FAILURE);
     }
 
+    migration_load_cleanup();
+
     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                    MIGRATION_STATUS_COMPLETED);
     /*