diff mbox series

[v4,3/5] tests/migration-test: Add a test for postcopy hangs during RECOVER

Message ID 20231017202633.296756-4-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series migration: Better error handling in rp thread, allow failures in recover | expand

Commit Message

Peter Xu Oct. 17, 2023, 8:26 p.m. UTC
From: Fabiano Rosas <farosas@suse.de>

To do so, create two paired sockets, but make them not providing real data.
Feed those fake sockets to src/dst QEMUs for recovery to let them go into
RECOVER stage without going out.  Test that we can always kick it out and
recover again with the right ports.

This patch is based on Fabiano's version here:

https://lore.kernel.org/r/877cowmdu0.fsf@suse.de

Signed-off-by: Fabiano Rosas <farosas@suse.de>
[peterx: write commit message, remove case 1, fix bugs, and more]
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 tests/qtest/migration-test.c | 102 ++++++++++++++++++++++++++++++++---
 1 file changed, 96 insertions(+), 6 deletions(-)

Comments

Fabiano Rosas Oct. 31, 2023, 10:01 p.m. UTC | #1
Peter Xu <peterx@redhat.com> writes:

> From: Fabiano Rosas <farosas@suse.de>
>
> To do so, create two paired sockets, but make them not providing real data.
> Feed those fake sockets to src/dst QEMUs for recovery to let them go into
> RECOVER stage without going out.  Test that we can always kick it out and
> recover again with the right ports.
>
> This patch is based on Fabiano's version here:
>
> https://lore.kernel.org/r/877cowmdu0.fsf@suse.de
>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> [peterx: write commit message, remove case 1, fix bugs, and more]
> Signed-off-by: Peter Xu <peterx@redhat.com>

This patch breaks the windows build. We need this:

-->8--
From 96fee99f2a3c8e11951100d94159eba02dd98540 Mon Sep 17 00:00:00 2001
From: Fabiano Rosas <farosas@suse.de>
Date: Tue, 31 Oct 2023 17:41:56 -0300
Subject: [PATCH] fixup! tests/migration-test: Add a test for postcopy hangs
 during RECOVER

Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
 tests/qtest/migration-test.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 2359d349cf..604ffe7746 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1417,6 +1417,7 @@ static void wait_for_postcopy_status(QTestState *one, const char *status)
                                                   "completed", NULL });
 }
 
+#ifndef _WIN32
 static void postcopy_recover_fail(QTestState *from, QTestState *to)
 {
     int ret, pair1[2], pair2[2];
@@ -1481,6 +1482,7 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to)
     close(pair2[0]);
     close(pair2[1]);
 }
+#endif
 
 static void test_postcopy_recovery_common(MigrateCommon *args)
 {
@@ -1520,6 +1522,7 @@ static void test_postcopy_recovery_common(MigrateCommon *args)
     wait_for_postcopy_status(to, "postcopy-paused");
     wait_for_postcopy_status(from, "postcopy-paused");
 
+#ifndef _WIN32
     if (args->postcopy_recovery_test_fail) {
         /*
          * Test when a wrong socket specified for recover, and then the
@@ -1528,6 +1531,7 @@ static void test_postcopy_recovery_common(MigrateCommon *args)
         postcopy_recover_fail(from, to);
         /* continue with a good recovery */
     }
+#endif
 
     /*
      * Create a new socket to emulate a new channel that is different
@@ -1565,6 +1569,7 @@ static void test_postcopy_recovery_compress(void)
     test_postcopy_recovery_common(&args);
 }
 
+#ifndef _WIN32
 static void test_postcopy_recovery_double_fail(void)
 {
     MigrateCommon args = {
@@ -1573,6 +1578,7 @@ static void test_postcopy_recovery_double_fail(void)
 
     test_postcopy_recovery_common(&args);
 }
+#endif
 
 #ifdef CONFIG_GNUTLS
 static void test_postcopy_recovery_tls_psk(void)
@@ -3185,9 +3191,10 @@ int main(int argc, char **argv)
             qtest_add_func("/migration/postcopy/recovery/compress/plain",
                            test_postcopy_recovery_compress);
         }
+#ifndef _WIN32
         qtest_add_func("/migration/postcopy/recovery/double-failures",
                        test_postcopy_recovery_double_fail);
-
+#endif
     }
 
     qtest_add_func("/migration/bad_dest", test_baddest);
Peter Xu Oct. 31, 2023, 10:19 p.m. UTC | #2
On Tue, Oct 31, 2023 at 07:01:02PM -0300, Fabiano Rosas wrote:
> This patch breaks the windows build. We need this:

Oops..

I hope this can still catch Juan's next pull, or I'll squash the fix if to
repost.

Thanks,
Juan Quintela Oct. 31, 2023, 10:34 p.m. UTC | #3
Fabiano Rosas <farosas@suse.de> wrote:
> Peter Xu <peterx@redhat.com> writes:
>
>> From: Fabiano Rosas <farosas@suse.de>
>>
>> To do so, create two paired sockets, but make them not providing real data.
>> Feed those fake sockets to src/dst QEMUs for recovery to let them go into
>> RECOVER stage without going out.  Test that we can always kick it out and
>> recover again with the right ports.
>>
>> This patch is based on Fabiano's version here:
>>
>> https://lore.kernel.org/r/877cowmdu0.fsf@suse.de
>>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> [peterx: write commit message, remove case 1, fix bugs, and more]
>> Signed-off-by: Peter Xu <peterx@redhat.com>
>
> This patch breaks the windows build. We need this:

I was doing this O:-)

migration-next

hit the exact problem that you mention.

Thanks, Juan.

> -->8--
> From 96fee99f2a3c8e11951100d94159eba02dd98540 Mon Sep 17 00:00:00 2001
> From: Fabiano Rosas <farosas@suse.de>
> Date: Tue, 31 Oct 2023 17:41:56 -0300
> Subject: [PATCH] fixup! tests/migration-test: Add a test for postcopy hangs
>  during RECOVER
>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
>  tests/qtest/migration-test.c | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index 2359d349cf..604ffe7746 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -1417,6 +1417,7 @@ static void wait_for_postcopy_status(QTestState *one, const char *status)
>                                                    "completed", NULL });
>  }
>  
> +#ifndef _WIN32
>  static void postcopy_recover_fail(QTestState *from, QTestState *to)
>  {
>      int ret, pair1[2], pair2[2];
> @@ -1481,6 +1482,7 @@ static void postcopy_recover_fail(QTestState *from, QTestState *to)
>      close(pair2[0]);
>      close(pair2[1]);
>  }
> +#endif
>  
>  static void test_postcopy_recovery_common(MigrateCommon *args)
>  {
> @@ -1520,6 +1522,7 @@ static void test_postcopy_recovery_common(MigrateCommon *args)
>      wait_for_postcopy_status(to, "postcopy-paused");
>      wait_for_postcopy_status(from, "postcopy-paused");
>  
> +#ifndef _WIN32
>      if (args->postcopy_recovery_test_fail) {
>          /*
>           * Test when a wrong socket specified for recover, and then the
> @@ -1528,6 +1531,7 @@ static void test_postcopy_recovery_common(MigrateCommon *args)
>          postcopy_recover_fail(from, to);
>          /* continue with a good recovery */
>      }
> +#endif
>  
>      /*
>       * Create a new socket to emulate a new channel that is different
> @@ -1565,6 +1569,7 @@ static void test_postcopy_recovery_compress(void)
>      test_postcopy_recovery_common(&args);
>  }
>  
> +#ifndef _WIN32
>  static void test_postcopy_recovery_double_fail(void)
>  {
>      MigrateCommon args = {
> @@ -1573,6 +1578,7 @@ static void test_postcopy_recovery_double_fail(void)
>  
>      test_postcopy_recovery_common(&args);
>  }
> +#endif
>  
>  #ifdef CONFIG_GNUTLS
>  static void test_postcopy_recovery_tls_psk(void)
> @@ -3185,9 +3191,10 @@ int main(int argc, char **argv)
>              qtest_add_func("/migration/postcopy/recovery/compress/plain",
>                             test_postcopy_recovery_compress);
>          }
> +#ifndef _WIN32
>          qtest_add_func("/migration/postcopy/recovery/double-failures",
>                         test_postcopy_recovery_double_fail);
> -
> +#endif
>      }
>  
>      qtest_add_func("/migration/bad_dest", test_baddest);
diff mbox series

Patch

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index e1c110537b..e81d6de000 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -726,6 +726,7 @@  typedef struct {
     /* Postcopy specific fields */
     void *postcopy_data;
     bool postcopy_preempt;
+    bool postcopy_recovery_test_fail;
 } MigrateCommon;
 
 static int test_migrate_start(QTestState **from, QTestState **to,
@@ -1379,6 +1380,78 @@  static void test_postcopy_preempt_tls_psk(void)
 }
 #endif
 
+static void wait_for_postcopy_status(QTestState *one, const char *status)
+{
+    wait_for_migration_status(one, status,
+                              (const char * []) { "failed", "active",
+                                                  "completed", NULL });
+}
+
+static void postcopy_recover_fail(QTestState *from, QTestState *to)
+{
+    int ret, pair1[2], pair2[2];
+    char c;
+
+    /* Create two unrelated socketpairs */
+    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair1);
+    g_assert_cmpint(ret, ==, 0);
+
+    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair2);
+    g_assert_cmpint(ret, ==, 0);
+
+    /*
+     * Give the guests unpaired ends of the sockets, so they'll all blocked
+     * at reading.  This mimics a wrong channel established.
+     */
+    qtest_qmp_fds_assert_success(from, &pair1[0], 1,
+                                 "{ 'execute': 'getfd',"
+                                 "  'arguments': { 'fdname': 'fd-mig' }}");
+    qtest_qmp_fds_assert_success(to, &pair2[0], 1,
+                                 "{ 'execute': 'getfd',"
+                                 "  'arguments': { 'fdname': 'fd-mig' }}");
+
+    /*
+     * Write the 1st byte as QEMU_VM_COMMAND (0x8) for the dest socket, to
+     * emulate the 1st byte of a real recovery, but stops from there to
+     * keep dest QEMU in RECOVER.  This is needed so that we can kick off
+     * the recover process on dest QEMU (by triggering the G_IO_IN event).
+     *
+     * NOTE: this trick is not needed on src QEMUs, because src doesn't
+     * rely on an pre-existing G_IO_IN event, so it will always trigger the
+     * upcoming recovery anyway even if it can read nothing.
+     */
+#define QEMU_VM_COMMAND              0x08
+    c = QEMU_VM_COMMAND;
+    ret = send(pair2[1], &c, 1, 0);
+    g_assert_cmpint(ret, ==, 1);
+
+    migrate_recover(to, "fd:fd-mig");
+    migrate_qmp(from, "fd:fd-mig", "{'resume': true}");
+
+    /*
+     * Make sure both QEMU instances will go into RECOVER stage, then test
+     * kicking them out using migrate-pause.
+     */
+    wait_for_postcopy_status(from, "postcopy-recover");
+    wait_for_postcopy_status(to, "postcopy-recover");
+
+    /*
+     * This would be issued by the admin upon noticing the hang, we should
+     * make sure we're able to kick this out.
+     */
+    migrate_pause(from);
+    wait_for_postcopy_status(from, "postcopy-paused");
+
+    /* Do the same test on dest */
+    migrate_pause(to);
+    wait_for_postcopy_status(to, "postcopy-paused");
+
+    close(pair1[0]);
+    close(pair1[1]);
+    close(pair2[0]);
+    close(pair2[1]);
+}
+
 static void test_postcopy_recovery_common(MigrateCommon *args)
 {
     QTestState *from, *to;
@@ -1414,9 +1487,17 @@  static void test_postcopy_recovery_common(MigrateCommon *args)
      * migrate-recover command can only succeed if destination machine
      * is in the paused state
      */
-    wait_for_migration_status(to, "postcopy-paused",
-                              (const char * []) { "failed", "active",
-                                                  "completed", NULL });
+    wait_for_postcopy_status(to, "postcopy-paused");
+    wait_for_postcopy_status(from, "postcopy-paused");
+
+    if (args->postcopy_recovery_test_fail) {
+        /*
+         * Test when a wrong socket specified for recover, and then the
+         * ability to kick it out, and continue with a correct socket.
+         */
+        postcopy_recover_fail(from, to);
+        /* continue with a good recovery */
+    }
 
     /*
      * Create a new socket to emulate a new channel that is different
@@ -1430,9 +1511,6 @@  static void test_postcopy_recovery_common(MigrateCommon *args)
      * Try to rebuild the migration channel using the resume flag and
      * the newly created channel
      */
-    wait_for_migration_status(from, "postcopy-paused",
-                              (const char * []) { "failed", "active",
-                                                  "completed", NULL });
     migrate_qmp(from, uri, "{'resume': true}");
 
     /* Restore the postcopy bandwidth to unlimited */
@@ -1457,6 +1535,15 @@  static void test_postcopy_recovery_compress(void)
     test_postcopy_recovery_common(&args);
 }
 
+static void test_postcopy_recovery_double_fail(void)
+{
+    MigrateCommon args = {
+        .postcopy_recovery_test_fail = true,
+    };
+
+    test_postcopy_recovery_common(&args);
+}
+
 #ifdef CONFIG_GNUTLS
 static void test_postcopy_recovery_tls_psk(void)
 {
@@ -3030,6 +3117,9 @@  int main(int argc, char **argv)
             qtest_add_func("/migration/postcopy/recovery/compress/plain",
                            test_postcopy_recovery_compress);
         }
+        qtest_add_func("/migration/postcopy/recovery/double-failures",
+                       test_postcopy_recovery_double_fail);
+
     }
 
     qtest_add_func("/migration/bad_dest", test_baddest);