diff mbox series

[v2,5/5] migration: Move the yank unregister of channel_close out

Message ID 20210721193409.910462-6-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series migrations: Fix potential rare race of migration-test after yank | expand

Commit Message

Peter Xu July 21, 2021, 7:34 p.m. UTC
It's efficient, but hackish to call yank unregister calls in channel_close(),
especially it'll be hard to debug when qemu crashed with some yank function
leaked.

Remove that hack, but instead explicitly unregister yank functions at the
places where needed, they are:

  (on src)
  - migrate_fd_cleanup
  - postcopy_pause

  (on dst)
  - migration_incoming_state_destroy
  - postcopy_pause_incoming

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 migration/migration.c         |  5 +++++
 migration/qemu-file-channel.c |  3 ---
 migration/savevm.c            |  7 +++++++
 migration/yank_functions.c    | 14 ++++++++++++++
 migration/yank_functions.h    |  1 +
 5 files changed, 27 insertions(+), 3 deletions(-)

Comments

Dr. David Alan Gilbert July 22, 2021, 3:27 p.m. UTC | #1
* Peter Xu (peterx@redhat.com) wrote:
> It's efficient, but hackish to call yank unregister calls in channel_close(),
> especially it'll be hard to debug when qemu crashed with some yank function
> leaked.
> 
> Remove that hack, but instead explicitly unregister yank functions at the
> places where needed, they are:
> 
>   (on src)
>   - migrate_fd_cleanup
>   - postcopy_pause
> 
>   (on dst)
>   - migration_incoming_state_destroy
>   - postcopy_pause_incoming
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/migration.c         |  5 +++++
>  migration/qemu-file-channel.c |  3 ---
>  migration/savevm.c            |  7 +++++++
>  migration/yank_functions.c    | 14 ++++++++++++++
>  migration/yank_functions.h    |  1 +
>  5 files changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index fa70400f98..bfeb65b8f7 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -59,6 +59,7 @@
>  #include "multifd.h"
>  #include "qemu/yank.h"
>  #include "sysemu/cpus.h"
> +#include "yank_functions.h"
>  
>  #define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */
>  
> @@ -273,6 +274,7 @@ void migration_incoming_state_destroy(void)
>      }
>  
>      if (mis->from_src_file) {
> +        migration_ioc_unregister_yank_from_file(mis->from_src_file);
>          qemu_fclose(mis->from_src_file);
>          mis->from_src_file = NULL;
>      }
> @@ -1811,6 +1813,7 @@ static void migrate_fd_cleanup(MigrationState *s)
>           * Close the file handle without the lock to make sure the
>           * critical section won't block for long.
>           */
> +        migration_ioc_unregister_yank_from_file(tmp);
>          qemu_fclose(tmp);
>      }
>  
> @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s)
>  
>          /* Current channel is possibly broken. Release it. */
>          assert(s->to_dst_file);
> +        /* Unregister yank for current channel */
> +        migration_ioc_unregister_yank_from_file(s->to_dst_file);

Should this go inside the lock?

Dave

>          qemu_mutex_lock(&s->qemu_file_lock);
>          file = s->to_dst_file;
>          s->to_dst_file = NULL;
> diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c
> index 2f8b1fcd46..bb5a5752df 100644
> --- a/migration/qemu-file-channel.c
> +++ b/migration/qemu-file-channel.c
> @@ -107,9 +107,6 @@ static int channel_close(void *opaque, Error **errp)
>      int ret;
>      QIOChannel *ioc = QIO_CHANNEL(opaque);
>      ret = qio_channel_close(ioc, errp);
> -    if (OBJECT(ioc)->ref == 1) {
> -        migration_ioc_unregister_yank(ioc);
> -    }
>      object_unref(OBJECT(ioc));
>      return ret;
>  }
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 96b5e5d639..7b7b64bd13 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -65,6 +65,7 @@
>  #include "qemu/bitmap.h"
>  #include "net/announce.h"
>  #include "qemu/yank.h"
> +#include "yank_functions.h"
>  
>  const unsigned int postcopy_ram_discard_version;
>  
> @@ -2568,6 +2569,12 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
>      /* Clear the triggered bit to allow one recovery */
>      mis->postcopy_recover_triggered = false;
>  
> +    /*
> +     * Unregister yank with either from/to src would work, since ioc behind it
> +     * is the same
> +     */
> +    migration_ioc_unregister_yank_from_file(mis->from_src_file);
> +
>      assert(mis->from_src_file);
>      qemu_file_shutdown(mis->from_src_file);
>      qemu_fclose(mis->from_src_file);
> diff --git a/migration/yank_functions.c b/migration/yank_functions.c
> index 23697173ae..8c08aef14a 100644
> --- a/migration/yank_functions.c
> +++ b/migration/yank_functions.c
> @@ -14,6 +14,7 @@
>  #include "qemu/yank.h"
>  #include "io/channel-socket.h"
>  #include "io/channel-tls.h"
> +#include "qemu-file.h"
>  
>  void migration_yank_iochannel(void *opaque)
>  {
> @@ -46,3 +47,16 @@ void migration_ioc_unregister_yank(QIOChannel *ioc)
>                                   QIO_CHANNEL(ioc));
>      }
>  }
> +
> +void migration_ioc_unregister_yank_from_file(QEMUFile *file)
> +{
> +    QIOChannel *ioc = qemu_file_get_ioc(file);
> +
> +    if (ioc) {
> +        /*
> +         * For migration qemufiles, we'll always reach here.  Though we'll skip
> +         * calls from e.g. savevm/loadvm as they don't use yank.
> +         */
> +        migration_ioc_unregister_yank(ioc);
> +    }
> +}
> diff --git a/migration/yank_functions.h b/migration/yank_functions.h
> index 74c7f18c91..a7577955ed 100644
> --- a/migration/yank_functions.h
> +++ b/migration/yank_functions.h
> @@ -17,3 +17,4 @@
>  void migration_yank_iochannel(void *opaque);
>  void migration_ioc_register_yank(QIOChannel *ioc);
>  void migration_ioc_unregister_yank(QIOChannel *ioc);
> +void migration_ioc_unregister_yank_from_file(QEMUFile *file);
> -- 
> 2.31.1
>
Peter Xu July 22, 2021, 4:19 p.m. UTC | #2
On Thu, Jul 22, 2021 at 04:27:35PM +0100, Dr. David Alan Gilbert wrote:
> > @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s)
> >  
> >          /* Current channel is possibly broken. Release it. */
> >          assert(s->to_dst_file);
> > +        /* Unregister yank for current channel */
> > +        migration_ioc_unregister_yank_from_file(s->to_dst_file);
> 
> Should this go inside the lock?

Shouldn't need to; as we've got the assert() right above so otherwise we'll
abrt otherwise :)

The mutex lock/unlock right below this one is not protecting us from someone
changing it but really for being able to wait until someone finished using it
then we won't crash someone else.

I think the rational is to_dst_file is managed by migration thread while
from_dst_file is managed by rp_thread.

Maybe I add a comment above?

Thanks,
Dr. David Alan Gilbert July 22, 2021, 5:09 p.m. UTC | #3
* Peter Xu (peterx@redhat.com) wrote:
> On Thu, Jul 22, 2021 at 04:27:35PM +0100, Dr. David Alan Gilbert wrote:
> > > @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s)
> > >  
> > >          /* Current channel is possibly broken. Release it. */
> > >          assert(s->to_dst_file);
> > > +        /* Unregister yank for current channel */
> > > +        migration_ioc_unregister_yank_from_file(s->to_dst_file);
> > 
> > Should this go inside the lock?
> 
> Shouldn't need to; as we've got the assert() right above so otherwise we'll
> abrt otherwise :)

Hmm OK; although with out always having to think about it then you worry
about something getting in after the assert.

> The mutex lock/unlock right below this one is not protecting us from someone
> changing it but really for being able to wait until someone finished using it
> then we won't crash someone else.
> 
> I think the rational is to_dst_file is managed by migration thread while
> from_dst_file is managed by rp_thread.
> 
> Maybe I add a comment above?

OK, I almost pushed this further, but then I started to get worried that
we'd trace off a race with ordering on two locks with yank_lock; so yeh
lets just add a comment.

Dave

> Thanks,
> 
> -- 
> Peter Xu
>
Peter Xu July 22, 2021, 5:35 p.m. UTC | #4
On Thu, Jul 22, 2021 at 06:09:03PM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Thu, Jul 22, 2021 at 04:27:35PM +0100, Dr. David Alan Gilbert wrote:
> > > > @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s)
> > > >  
> > > >          /* Current channel is possibly broken. Release it. */
> > > >          assert(s->to_dst_file);
> > > > +        /* Unregister yank for current channel */
> > > > +        migration_ioc_unregister_yank_from_file(s->to_dst_file);
> > > 
> > > Should this go inside the lock?
> > 
> > Shouldn't need to; as we've got the assert() right above so otherwise we'll
> > abrt otherwise :)
> 
> Hmm OK; although with out always having to think about it then you worry
> about something getting in after the assert.

Right; the point is still that to_dst_file shouldn't be changed by other
thread, or it'll bring some more challenge.

If it can be mnodified when reaching this line, it means it can also reach
earlier at assert(), then we coredump.  So we should guaratee it won't happen,
or we'd better also remove that assertion..

I think the challenge here is, we do have a mutex to protect the files and from
that pov it's the same as other mutex.  However it's different in that this
mutex is also used in the oob handlers so we want to "keep it non-blocking and
as small as possible for the critical sections".

I didn't put it into the mutex protection because the yank code will further go
to take the yank_lock so it'll make things slightly more complicated (then the
file mutex is correlated to yank_lock too!).  I don't think that's a problem
because yank_lock is also "non-blocking" (ah! it can still block, got scheduled
out, but there's no explicit things that will proactively sleep..).  So since I
think it's safe without the lock then I kept it outside of the lock then we
don't need to discuss the safely of having it inside the critical section.

(However then I noticed we still need justification on why it can be moved out..)

> 
> > The mutex lock/unlock right below this one is not protecting us from someone
> > changing it but really for being able to wait until someone finished using it
> > then we won't crash someone else.
> > 
> > I think the rational is to_dst_file is managed by migration thread while
> > from_dst_file is managed by rp_thread.
> > 
> > Maybe I add a comment above?
> 
> OK, I almost pushed this further, but then I started to get worried that
> we'd trace off a race with ordering on two locks with yank_lock; so yeh
> lets just add a comment.

Agreed.  I think ordering is not a huge problem as yank_lock is very limitedly
used in yank and protect yank instance/functions only, so there shouldn't be a
path we take file mutex within it.  Will repost shortly, thanks.
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index fa70400f98..bfeb65b8f7 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -59,6 +59,7 @@ 
 #include "multifd.h"
 #include "qemu/yank.h"
 #include "sysemu/cpus.h"
+#include "yank_functions.h"
 
 #define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */
 
@@ -273,6 +274,7 @@  void migration_incoming_state_destroy(void)
     }
 
     if (mis->from_src_file) {
+        migration_ioc_unregister_yank_from_file(mis->from_src_file);
         qemu_fclose(mis->from_src_file);
         mis->from_src_file = NULL;
     }
@@ -1811,6 +1813,7 @@  static void migrate_fd_cleanup(MigrationState *s)
          * Close the file handle without the lock to make sure the
          * critical section won't block for long.
          */
+        migration_ioc_unregister_yank_from_file(tmp);
         qemu_fclose(tmp);
     }
 
@@ -3352,6 +3355,8 @@  static MigThrError postcopy_pause(MigrationState *s)
 
         /* Current channel is possibly broken. Release it. */
         assert(s->to_dst_file);
+        /* Unregister yank for current channel */
+        migration_ioc_unregister_yank_from_file(s->to_dst_file);
         qemu_mutex_lock(&s->qemu_file_lock);
         file = s->to_dst_file;
         s->to_dst_file = NULL;
diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c
index 2f8b1fcd46..bb5a5752df 100644
--- a/migration/qemu-file-channel.c
+++ b/migration/qemu-file-channel.c
@@ -107,9 +107,6 @@  static int channel_close(void *opaque, Error **errp)
     int ret;
     QIOChannel *ioc = QIO_CHANNEL(opaque);
     ret = qio_channel_close(ioc, errp);
-    if (OBJECT(ioc)->ref == 1) {
-        migration_ioc_unregister_yank(ioc);
-    }
     object_unref(OBJECT(ioc));
     return ret;
 }
diff --git a/migration/savevm.c b/migration/savevm.c
index 96b5e5d639..7b7b64bd13 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -65,6 +65,7 @@ 
 #include "qemu/bitmap.h"
 #include "net/announce.h"
 #include "qemu/yank.h"
+#include "yank_functions.h"
 
 const unsigned int postcopy_ram_discard_version;
 
@@ -2568,6 +2569,12 @@  static bool postcopy_pause_incoming(MigrationIncomingState *mis)
     /* Clear the triggered bit to allow one recovery */
     mis->postcopy_recover_triggered = false;
 
+    /*
+     * Unregister yank with either from/to src would work, since ioc behind it
+     * is the same
+     */
+    migration_ioc_unregister_yank_from_file(mis->from_src_file);
+
     assert(mis->from_src_file);
     qemu_file_shutdown(mis->from_src_file);
     qemu_fclose(mis->from_src_file);
diff --git a/migration/yank_functions.c b/migration/yank_functions.c
index 23697173ae..8c08aef14a 100644
--- a/migration/yank_functions.c
+++ b/migration/yank_functions.c
@@ -14,6 +14,7 @@ 
 #include "qemu/yank.h"
 #include "io/channel-socket.h"
 #include "io/channel-tls.h"
+#include "qemu-file.h"
 
 void migration_yank_iochannel(void *opaque)
 {
@@ -46,3 +47,16 @@  void migration_ioc_unregister_yank(QIOChannel *ioc)
                                  QIO_CHANNEL(ioc));
     }
 }
+
+void migration_ioc_unregister_yank_from_file(QEMUFile *file)
+{
+    QIOChannel *ioc = qemu_file_get_ioc(file);
+
+    if (ioc) {
+        /*
+         * For migration qemufiles, we'll always reach here.  Though we'll skip
+         * calls from e.g. savevm/loadvm as they don't use yank.
+         */
+        migration_ioc_unregister_yank(ioc);
+    }
+}
diff --git a/migration/yank_functions.h b/migration/yank_functions.h
index 74c7f18c91..a7577955ed 100644
--- a/migration/yank_functions.h
+++ b/migration/yank_functions.h
@@ -17,3 +17,4 @@ 
 void migration_yank_iochannel(void *opaque);
 void migration_ioc_register_yank(QIOChannel *ioc);
 void migration_ioc_unregister_yank(QIOChannel *ioc);
+void migration_ioc_unregister_yank_from_file(QEMUFile *file);