Message ID | 20210721193409.910462-6-peterx@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | migrations: Fix potential rare race of migration-test after yank | expand |
* Peter Xu (peterx@redhat.com) wrote: > It's efficient, but hackish to call yank unregister calls in channel_close(), > especially it'll be hard to debug when qemu crashed with some yank function > leaked. > > Remove that hack, but instead explicitly unregister yank functions at the > places where needed, they are: > > (on src) > - migrate_fd_cleanup > - postcopy_pause > > (on dst) > - migration_incoming_state_destroy > - postcopy_pause_incoming > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > migration/migration.c | 5 +++++ > migration/qemu-file-channel.c | 3 --- > migration/savevm.c | 7 +++++++ > migration/yank_functions.c | 14 ++++++++++++++ > migration/yank_functions.h | 1 + > 5 files changed, 27 insertions(+), 3 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index fa70400f98..bfeb65b8f7 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -59,6 +59,7 @@ > #include "multifd.h" > #include "qemu/yank.h" > #include "sysemu/cpus.h" > +#include "yank_functions.h" > > #define MAX_THROTTLE (128 << 20) /* Migration transfer speed throttling */ > > @@ -273,6 +274,7 @@ void migration_incoming_state_destroy(void) > } > > if (mis->from_src_file) { > + migration_ioc_unregister_yank_from_file(mis->from_src_file); > qemu_fclose(mis->from_src_file); > mis->from_src_file = NULL; > } > @@ -1811,6 +1813,7 @@ static void migrate_fd_cleanup(MigrationState *s) > * Close the file handle without the lock to make sure the > * critical section won't block for long. > */ > + migration_ioc_unregister_yank_from_file(tmp); > qemu_fclose(tmp); > } > > @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s) > > /* Current channel is possibly broken. Release it. */ > assert(s->to_dst_file); > + /* Unregister yank for current channel */ > + migration_ioc_unregister_yank_from_file(s->to_dst_file); Should this go inside the lock? Dave > qemu_mutex_lock(&s->qemu_file_lock); > file = s->to_dst_file; > s->to_dst_file = NULL; > diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c > index 2f8b1fcd46..bb5a5752df 100644 > --- a/migration/qemu-file-channel.c > +++ b/migration/qemu-file-channel.c > @@ -107,9 +107,6 @@ static int channel_close(void *opaque, Error **errp) > int ret; > QIOChannel *ioc = QIO_CHANNEL(opaque); > ret = qio_channel_close(ioc, errp); > - if (OBJECT(ioc)->ref == 1) { > - migration_ioc_unregister_yank(ioc); > - } > object_unref(OBJECT(ioc)); > return ret; > } > diff --git a/migration/savevm.c b/migration/savevm.c > index 96b5e5d639..7b7b64bd13 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -65,6 +65,7 @@ > #include "qemu/bitmap.h" > #include "net/announce.h" > #include "qemu/yank.h" > +#include "yank_functions.h" > > const unsigned int postcopy_ram_discard_version; > > @@ -2568,6 +2569,12 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) > /* Clear the triggered bit to allow one recovery */ > mis->postcopy_recover_triggered = false; > > + /* > + * Unregister yank with either from/to src would work, since ioc behind it > + * is the same > + */ > + migration_ioc_unregister_yank_from_file(mis->from_src_file); > + > assert(mis->from_src_file); > qemu_file_shutdown(mis->from_src_file); > qemu_fclose(mis->from_src_file); > diff --git a/migration/yank_functions.c b/migration/yank_functions.c > index 23697173ae..8c08aef14a 100644 > --- a/migration/yank_functions.c > +++ b/migration/yank_functions.c > @@ -14,6 +14,7 @@ > #include "qemu/yank.h" > #include "io/channel-socket.h" > #include "io/channel-tls.h" > +#include "qemu-file.h" > > void migration_yank_iochannel(void *opaque) > { > @@ -46,3 +47,16 @@ void migration_ioc_unregister_yank(QIOChannel *ioc) > QIO_CHANNEL(ioc)); > } > } > + > +void migration_ioc_unregister_yank_from_file(QEMUFile *file) > +{ > + QIOChannel *ioc = qemu_file_get_ioc(file); > + > + if (ioc) { > + /* > + * For migration qemufiles, we'll always reach here. Though we'll skip > + * calls from e.g. savevm/loadvm as they don't use yank. > + */ > + migration_ioc_unregister_yank(ioc); > + } > +} > diff --git a/migration/yank_functions.h b/migration/yank_functions.h > index 74c7f18c91..a7577955ed 100644 > --- a/migration/yank_functions.h > +++ b/migration/yank_functions.h > @@ -17,3 +17,4 @@ > void migration_yank_iochannel(void *opaque); > void migration_ioc_register_yank(QIOChannel *ioc); > void migration_ioc_unregister_yank(QIOChannel *ioc); > +void migration_ioc_unregister_yank_from_file(QEMUFile *file); > -- > 2.31.1 >
On Thu, Jul 22, 2021 at 04:27:35PM +0100, Dr. David Alan Gilbert wrote: > > @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s) > > > > /* Current channel is possibly broken. Release it. */ > > assert(s->to_dst_file); > > + /* Unregister yank for current channel */ > > + migration_ioc_unregister_yank_from_file(s->to_dst_file); > > Should this go inside the lock? Shouldn't need to; as we've got the assert() right above so otherwise we'll abrt otherwise :) The mutex lock/unlock right below this one is not protecting us from someone changing it but really for being able to wait until someone finished using it then we won't crash someone else. I think the rational is to_dst_file is managed by migration thread while from_dst_file is managed by rp_thread. Maybe I add a comment above? Thanks,
* Peter Xu (peterx@redhat.com) wrote: > On Thu, Jul 22, 2021 at 04:27:35PM +0100, Dr. David Alan Gilbert wrote: > > > @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s) > > > > > > /* Current channel is possibly broken. Release it. */ > > > assert(s->to_dst_file); > > > + /* Unregister yank for current channel */ > > > + migration_ioc_unregister_yank_from_file(s->to_dst_file); > > > > Should this go inside the lock? > > Shouldn't need to; as we've got the assert() right above so otherwise we'll > abrt otherwise :) Hmm OK; although with out always having to think about it then you worry about something getting in after the assert. > The mutex lock/unlock right below this one is not protecting us from someone > changing it but really for being able to wait until someone finished using it > then we won't crash someone else. > > I think the rational is to_dst_file is managed by migration thread while > from_dst_file is managed by rp_thread. > > Maybe I add a comment above? OK, I almost pushed this further, but then I started to get worried that we'd trace off a race with ordering on two locks with yank_lock; so yeh lets just add a comment. Dave > Thanks, > > -- > Peter Xu >
On Thu, Jul 22, 2021 at 06:09:03PM +0100, Dr. David Alan Gilbert wrote: > * Peter Xu (peterx@redhat.com) wrote: > > On Thu, Jul 22, 2021 at 04:27:35PM +0100, Dr. David Alan Gilbert wrote: > > > > @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s) > > > > > > > > /* Current channel is possibly broken. Release it. */ > > > > assert(s->to_dst_file); > > > > + /* Unregister yank for current channel */ > > > > + migration_ioc_unregister_yank_from_file(s->to_dst_file); > > > > > > Should this go inside the lock? > > > > Shouldn't need to; as we've got the assert() right above so otherwise we'll > > abrt otherwise :) > > Hmm OK; although with out always having to think about it then you worry > about something getting in after the assert. Right; the point is still that to_dst_file shouldn't be changed by other thread, or it'll bring some more challenge. If it can be mnodified when reaching this line, it means it can also reach earlier at assert(), then we coredump. So we should guaratee it won't happen, or we'd better also remove that assertion.. I think the challenge here is, we do have a mutex to protect the files and from that pov it's the same as other mutex. However it's different in that this mutex is also used in the oob handlers so we want to "keep it non-blocking and as small as possible for the critical sections". I didn't put it into the mutex protection because the yank code will further go to take the yank_lock so it'll make things slightly more complicated (then the file mutex is correlated to yank_lock too!). I don't think that's a problem because yank_lock is also "non-blocking" (ah! it can still block, got scheduled out, but there's no explicit things that will proactively sleep..). So since I think it's safe without the lock then I kept it outside of the lock then we don't need to discuss the safely of having it inside the critical section. (However then I noticed we still need justification on why it can be moved out..) > > > The mutex lock/unlock right below this one is not protecting us from someone > > changing it but really for being able to wait until someone finished using it > > then we won't crash someone else. > > > > I think the rational is to_dst_file is managed by migration thread while > > from_dst_file is managed by rp_thread. > > > > Maybe I add a comment above? > > OK, I almost pushed this further, but then I started to get worried that > we'd trace off a race with ordering on two locks with yank_lock; so yeh > lets just add a comment. Agreed. I think ordering is not a huge problem as yank_lock is very limitedly used in yank and protect yank instance/functions only, so there shouldn't be a path we take file mutex within it. Will repost shortly, thanks.
diff --git a/migration/migration.c b/migration/migration.c index fa70400f98..bfeb65b8f7 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -59,6 +59,7 @@ #include "multifd.h" #include "qemu/yank.h" #include "sysemu/cpus.h" +#include "yank_functions.h" #define MAX_THROTTLE (128 << 20) /* Migration transfer speed throttling */ @@ -273,6 +274,7 @@ void migration_incoming_state_destroy(void) } if (mis->from_src_file) { + migration_ioc_unregister_yank_from_file(mis->from_src_file); qemu_fclose(mis->from_src_file); mis->from_src_file = NULL; } @@ -1811,6 +1813,7 @@ static void migrate_fd_cleanup(MigrationState *s) * Close the file handle without the lock to make sure the * critical section won't block for long. */ + migration_ioc_unregister_yank_from_file(tmp); qemu_fclose(tmp); } @@ -3352,6 +3355,8 @@ static MigThrError postcopy_pause(MigrationState *s) /* Current channel is possibly broken. Release it. */ assert(s->to_dst_file); + /* Unregister yank for current channel */ + migration_ioc_unregister_yank_from_file(s->to_dst_file); qemu_mutex_lock(&s->qemu_file_lock); file = s->to_dst_file; s->to_dst_file = NULL; diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c index 2f8b1fcd46..bb5a5752df 100644 --- a/migration/qemu-file-channel.c +++ b/migration/qemu-file-channel.c @@ -107,9 +107,6 @@ static int channel_close(void *opaque, Error **errp) int ret; QIOChannel *ioc = QIO_CHANNEL(opaque); ret = qio_channel_close(ioc, errp); - if (OBJECT(ioc)->ref == 1) { - migration_ioc_unregister_yank(ioc); - } object_unref(OBJECT(ioc)); return ret; } diff --git a/migration/savevm.c b/migration/savevm.c index 96b5e5d639..7b7b64bd13 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -65,6 +65,7 @@ #include "qemu/bitmap.h" #include "net/announce.h" #include "qemu/yank.h" +#include "yank_functions.h" const unsigned int postcopy_ram_discard_version; @@ -2568,6 +2569,12 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) /* Clear the triggered bit to allow one recovery */ mis->postcopy_recover_triggered = false; + /* + * Unregister yank with either from/to src would work, since ioc behind it + * is the same + */ + migration_ioc_unregister_yank_from_file(mis->from_src_file); + assert(mis->from_src_file); qemu_file_shutdown(mis->from_src_file); qemu_fclose(mis->from_src_file); diff --git a/migration/yank_functions.c b/migration/yank_functions.c index 23697173ae..8c08aef14a 100644 --- a/migration/yank_functions.c +++ b/migration/yank_functions.c @@ -14,6 +14,7 @@ #include "qemu/yank.h" #include "io/channel-socket.h" #include "io/channel-tls.h" +#include "qemu-file.h" void migration_yank_iochannel(void *opaque) { @@ -46,3 +47,16 @@ void migration_ioc_unregister_yank(QIOChannel *ioc) QIO_CHANNEL(ioc)); } } + +void migration_ioc_unregister_yank_from_file(QEMUFile *file) +{ + QIOChannel *ioc = qemu_file_get_ioc(file); + + if (ioc) { + /* + * For migration qemufiles, we'll always reach here. Though we'll skip + * calls from e.g. savevm/loadvm as they don't use yank. + */ + migration_ioc_unregister_yank(ioc); + } +} diff --git a/migration/yank_functions.h b/migration/yank_functions.h index 74c7f18c91..a7577955ed 100644 --- a/migration/yank_functions.h +++ b/migration/yank_functions.h @@ -17,3 +17,4 @@ void migration_yank_iochannel(void *opaque); void migration_ioc_register_yank(QIOChannel *ioc); void migration_ioc_unregister_yank(QIOChannel *ioc); +void migration_ioc_unregister_yank_from_file(QEMUFile *file);
It's efficient, but hackish to call yank unregister calls in channel_close(), especially it'll be hard to debug when qemu crashed with some yank function leaked. Remove that hack, but instead explicitly unregister yank functions at the places where needed, they are: (on src) - migrate_fd_cleanup - postcopy_pause (on dst) - migration_incoming_state_destroy - postcopy_pause_incoming Signed-off-by: Peter Xu <peterx@redhat.com> --- migration/migration.c | 5 +++++ migration/qemu-file-channel.c | 3 --- migration/savevm.c | 7 +++++++ migration/yank_functions.c | 14 ++++++++++++++ migration/yank_functions.h | 1 + 5 files changed, 27 insertions(+), 3 deletions(-)