Message ID | 20231017202633.296756-4-peterx@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | migration: Better error handling in rp thread, allow failures in recover | expand |
Peter Xu <peterx@redhat.com> writes: > From: Fabiano Rosas <farosas@suse.de> > > To do so, create two paired sockets, but make them not providing real data. > Feed those fake sockets to src/dst QEMUs for recovery to let them go into > RECOVER stage without going out. Test that we can always kick it out and > recover again with the right ports. > > This patch is based on Fabiano's version here: > > https://lore.kernel.org/r/877cowmdu0.fsf@suse.de > > Signed-off-by: Fabiano Rosas <farosas@suse.de> > [peterx: write commit message, remove case 1, fix bugs, and more] > Signed-off-by: Peter Xu <peterx@redhat.com> This patch breaks the windows build. We need this: -->8-- From 96fee99f2a3c8e11951100d94159eba02dd98540 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas <farosas@suse.de> Date: Tue, 31 Oct 2023 17:41:56 -0300 Subject: [PATCH] fixup! tests/migration-test: Add a test for postcopy hangs during RECOVER Signed-off-by: Fabiano Rosas <farosas@suse.de> --- tests/qtest/migration-test.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 2359d349cf..604ffe7746 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -1417,6 +1417,7 @@ static void wait_for_postcopy_status(QTestState *one, const char *status) "completed", NULL }); } +#ifndef _WIN32 static void postcopy_recover_fail(QTestState *from, QTestState *to) { int ret, pair1[2], pair2[2]; @@ -1481,6 +1482,7 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to) close(pair2[0]); close(pair2[1]); } +#endif static void test_postcopy_recovery_common(MigrateCommon *args) { @@ -1520,6 +1522,7 @@ static void test_postcopy_recovery_common(MigrateCommon *args) wait_for_postcopy_status(to, "postcopy-paused"); wait_for_postcopy_status(from, "postcopy-paused"); +#ifndef _WIN32 if (args->postcopy_recovery_test_fail) { /* * Test when a wrong socket specified for recover, and then the @@ -1528,6 +1531,7 @@ static void test_postcopy_recovery_common(MigrateCommon *args) postcopy_recover_fail(from, to); /* continue with a good recovery */ } +#endif /* * Create a new socket to emulate a new channel that is different @@ -1565,6 +1569,7 @@ static void test_postcopy_recovery_compress(void) test_postcopy_recovery_common(&args); } +#ifndef _WIN32 static void test_postcopy_recovery_double_fail(void) { MigrateCommon args = { @@ -1573,6 +1578,7 @@ static void test_postcopy_recovery_double_fail(void) test_postcopy_recovery_common(&args); } +#endif #ifdef CONFIG_GNUTLS static void test_postcopy_recovery_tls_psk(void) @@ -3185,9 +3191,10 @@ int main(int argc, char **argv) qtest_add_func("/migration/postcopy/recovery/compress/plain", test_postcopy_recovery_compress); } +#ifndef _WIN32 qtest_add_func("/migration/postcopy/recovery/double-failures", test_postcopy_recovery_double_fail); - +#endif } qtest_add_func("/migration/bad_dest", test_baddest);
On Tue, Oct 31, 2023 at 07:01:02PM -0300, Fabiano Rosas wrote:
> This patch breaks the windows build. We need this:
Oops..
I hope this can still catch Juan's next pull, or I'll squash the fix if to
repost.
Thanks,
Fabiano Rosas <farosas@suse.de> wrote: > Peter Xu <peterx@redhat.com> writes: > >> From: Fabiano Rosas <farosas@suse.de> >> >> To do so, create two paired sockets, but make them not providing real data. >> Feed those fake sockets to src/dst QEMUs for recovery to let them go into >> RECOVER stage without going out. Test that we can always kick it out and >> recover again with the right ports. >> >> This patch is based on Fabiano's version here: >> >> https://lore.kernel.org/r/877cowmdu0.fsf@suse.de >> >> Signed-off-by: Fabiano Rosas <farosas@suse.de> >> [peterx: write commit message, remove case 1, fix bugs, and more] >> Signed-off-by: Peter Xu <peterx@redhat.com> > > This patch breaks the windows build. We need this: I was doing this O:-) migration-next hit the exact problem that you mention. Thanks, Juan. > -->8-- > From 96fee99f2a3c8e11951100d94159eba02dd98540 Mon Sep 17 00:00:00 2001 > From: Fabiano Rosas <farosas@suse.de> > Date: Tue, 31 Oct 2023 17:41:56 -0300 > Subject: [PATCH] fixup! tests/migration-test: Add a test for postcopy hangs > during RECOVER > > Signed-off-by: Fabiano Rosas <farosas@suse.de> > --- > tests/qtest/migration-test.c | 9 ++++++++- > 1 file changed, 8 insertions(+), 1 deletion(-) > > diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c > index 2359d349cf..604ffe7746 100644 > --- a/tests/qtest/migration-test.c > +++ b/tests/qtest/migration-test.c > @@ -1417,6 +1417,7 @@ static void wait_for_postcopy_status(QTestState *one, const char *status) > "completed", NULL }); > } > > +#ifndef _WIN32 > static void postcopy_recover_fail(QTestState *from, QTestState *to) > { > int ret, pair1[2], pair2[2]; > @@ -1481,6 +1482,7 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to) > close(pair2[0]); > close(pair2[1]); > } > +#endif > > static void test_postcopy_recovery_common(MigrateCommon *args) > { > @@ -1520,6 +1522,7 @@ static void test_postcopy_recovery_common(MigrateCommon *args) > wait_for_postcopy_status(to, "postcopy-paused"); > wait_for_postcopy_status(from, "postcopy-paused"); > > +#ifndef _WIN32 > if (args->postcopy_recovery_test_fail) { > /* > * Test when a wrong socket specified for recover, and then the > @@ -1528,6 +1531,7 @@ static void test_postcopy_recovery_common(MigrateCommon *args) > postcopy_recover_fail(from, to); > /* continue with a good recovery */ > } > +#endif > > /* > * Create a new socket to emulate a new channel that is different > @@ -1565,6 +1569,7 @@ static void test_postcopy_recovery_compress(void) > test_postcopy_recovery_common(&args); > } > > +#ifndef _WIN32 > static void test_postcopy_recovery_double_fail(void) > { > MigrateCommon args = { > @@ -1573,6 +1578,7 @@ static void test_postcopy_recovery_double_fail(void) > > test_postcopy_recovery_common(&args); > } > +#endif > > #ifdef CONFIG_GNUTLS > static void test_postcopy_recovery_tls_psk(void) > @@ -3185,9 +3191,10 @@ int main(int argc, char **argv) > qtest_add_func("/migration/postcopy/recovery/compress/plain", > test_postcopy_recovery_compress); > } > +#ifndef _WIN32 > qtest_add_func("/migration/postcopy/recovery/double-failures", > test_postcopy_recovery_double_fail); > - > +#endif > } > > qtest_add_func("/migration/bad_dest", test_baddest);
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index e1c110537b..e81d6de000 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -726,6 +726,7 @@ typedef struct { /* Postcopy specific fields */ void *postcopy_data; bool postcopy_preempt; + bool postcopy_recovery_test_fail; } MigrateCommon; static int test_migrate_start(QTestState **from, QTestState **to, @@ -1379,6 +1380,78 @@ static void test_postcopy_preempt_tls_psk(void) } #endif +static void wait_for_postcopy_status(QTestState *one, const char *status) +{ + wait_for_migration_status(one, status, + (const char * []) { "failed", "active", + "completed", NULL }); +} + +static void postcopy_recover_fail(QTestState *from, QTestState *to) +{ + int ret, pair1[2], pair2[2]; + char c; + + /* Create two unrelated socketpairs */ + ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair1); + g_assert_cmpint(ret, ==, 0); + + ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair2); + g_assert_cmpint(ret, ==, 0); + + /* + * Give the guests unpaired ends of the sockets, so they'll all blocked + * at reading. This mimics a wrong channel established. + */ + qtest_qmp_fds_assert_success(from, &pair1[0], 1, + "{ 'execute': 'getfd'," + " 'arguments': { 'fdname': 'fd-mig' }}"); + qtest_qmp_fds_assert_success(to, &pair2[0], 1, + "{ 'execute': 'getfd'," + " 'arguments': { 'fdname': 'fd-mig' }}"); + + /* + * Write the 1st byte as QEMU_VM_COMMAND (0x8) for the dest socket, to + * emulate the 1st byte of a real recovery, but stops from there to + * keep dest QEMU in RECOVER. This is needed so that we can kick off + * the recover process on dest QEMU (by triggering the G_IO_IN event). + * + * NOTE: this trick is not needed on src QEMUs, because src doesn't + * rely on an pre-existing G_IO_IN event, so it will always trigger the + * upcoming recovery anyway even if it can read nothing. + */ +#define QEMU_VM_COMMAND 0x08 + c = QEMU_VM_COMMAND; + ret = send(pair2[1], &c, 1, 0); + g_assert_cmpint(ret, ==, 1); + + migrate_recover(to, "fd:fd-mig"); + migrate_qmp(from, "fd:fd-mig", "{'resume': true}"); + + /* + * Make sure both QEMU instances will go into RECOVER stage, then test + * kicking them out using migrate-pause. + */ + wait_for_postcopy_status(from, "postcopy-recover"); + wait_for_postcopy_status(to, "postcopy-recover"); + + /* + * This would be issued by the admin upon noticing the hang, we should + * make sure we're able to kick this out. + */ + migrate_pause(from); + wait_for_postcopy_status(from, "postcopy-paused"); + + /* Do the same test on dest */ + migrate_pause(to); + wait_for_postcopy_status(to, "postcopy-paused"); + + close(pair1[0]); + close(pair1[1]); + close(pair2[0]); + close(pair2[1]); +} + static void test_postcopy_recovery_common(MigrateCommon *args) { QTestState *from, *to; @@ -1414,9 +1487,17 @@ static void test_postcopy_recovery_common(MigrateCommon *args) * migrate-recover command can only succeed if destination machine * is in the paused state */ - wait_for_migration_status(to, "postcopy-paused", - (const char * []) { "failed", "active", - "completed", NULL }); + wait_for_postcopy_status(to, "postcopy-paused"); + wait_for_postcopy_status(from, "postcopy-paused"); + + if (args->postcopy_recovery_test_fail) { + /* + * Test when a wrong socket specified for recover, and then the + * ability to kick it out, and continue with a correct socket. + */ + postcopy_recover_fail(from, to); + /* continue with a good recovery */ + } /* * Create a new socket to emulate a new channel that is different @@ -1430,9 +1511,6 @@ static void test_postcopy_recovery_common(MigrateCommon *args) * Try to rebuild the migration channel using the resume flag and * the newly created channel */ - wait_for_migration_status(from, "postcopy-paused", - (const char * []) { "failed", "active", - "completed", NULL }); migrate_qmp(from, uri, "{'resume': true}"); /* Restore the postcopy bandwidth to unlimited */ @@ -1457,6 +1535,15 @@ static void test_postcopy_recovery_compress(void) test_postcopy_recovery_common(&args); } +static void test_postcopy_recovery_double_fail(void) +{ + MigrateCommon args = { + .postcopy_recovery_test_fail = true, + }; + + test_postcopy_recovery_common(&args); +} + #ifdef CONFIG_GNUTLS static void test_postcopy_recovery_tls_psk(void) { @@ -3030,6 +3117,9 @@ int main(int argc, char **argv) qtest_add_func("/migration/postcopy/recovery/compress/plain", test_postcopy_recovery_compress); } + qtest_add_func("/migration/postcopy/recovery/double-failures", + test_postcopy_recovery_double_fail); + } qtest_add_func("/migration/bad_dest", test_baddest);