diff mbox series

[RFC] tests/qtest/migration: Add cpu hotplug test

Message ID 20250113210833.1712-1-farosas@suse.de (mailing list archive)
State New
Headers show
Series [RFC] tests/qtest/migration: Add cpu hotplug test | expand

Commit Message

Fabiano Rosas Jan. 13, 2025, 9:08 p.m. UTC
Bug #2594 is about a failure during migration after a cpu hotplug. Add
a test that covers that scenario. Start the source with -smp 2 and
destination with -smp 3, plug one extra cpu to match and migrate.

The issue seems to be a mismatch in the number of virtqueues between
the source and destination due to the hotplug not changing the
num_queues:

  get_pci_config_device: Bad config data: i=0x9a read: 4 device: 5
  cmask: ff wmask: 0 w1cmask:0

Usage:
$ QTEST_QEMU_IMG=./qemu-img QTEST_QEMU_BINARY=./qemu-system-x86_64 \
  ./tests/qtest/migration-test -p /x86_64/migration/hotplug/cpu

References: https://gitlab.com/qemu-project/qemu/-/issues/2594
References: https://issues.redhat.com/browse/RHEL-68302
Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
As you can see there's no fix attached to this. I haven't reached that
part yet, suggestions welcome =). Posting the test case if anyone
wants to play with this.

(if someone at RH is already working on this, that's fine. I'm just
trying to get some upstream bugs to move)
---
 tests/qtest/migration/misc-tests.c | 76 ++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

Comments

Stefan Hajnoczi Jan. 14, 2025, 1:44 p.m. UTC | #1
On Mon, 13 Jan 2025 at 16:09, Fabiano Rosas <farosas@suse.de> wrote:
>
> Bug #2594 is about a failure during migration after a cpu hotplug. Add
> a test that covers that scenario. Start the source with -smp 2 and
> destination with -smp 3, plug one extra cpu to match and migrate.
>
> The issue seems to be a mismatch in the number of virtqueues between
> the source and destination due to the hotplug not changing the
> num_queues:
>
>   get_pci_config_device: Bad config data: i=0x9a read: 4 device: 5
>   cmask: ff wmask: 0 w1cmask:0
>
> Usage:
> $ QTEST_QEMU_IMG=./qemu-img QTEST_QEMU_BINARY=./qemu-system-x86_64 \
>   ./tests/qtest/migration-test -p /x86_64/migration/hotplug/cpu
>
> References: https://gitlab.com/qemu-project/qemu/-/issues/2594
> References: https://issues.redhat.com/browse/RHEL-68302
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
> As you can see there's no fix attached to this. I haven't reached that
> part yet, suggestions welcome =). Posting the test case if anyone
> wants to play with this.
>
> (if someone at RH is already working on this, that's fine. I'm just
> trying to get some upstream bugs to move)

The management tool should set num_queues on the destination to ensure
migration compatibility.

Stefan

> ---
>  tests/qtest/migration/misc-tests.c | 76 ++++++++++++++++++++++++++++++
>  1 file changed, 76 insertions(+)
>
> diff --git a/tests/qtest/migration/misc-tests.c b/tests/qtest/migration/misc-tests.c
> index 6173430748..bde567b124 100644
> --- a/tests/qtest/migration/misc-tests.c
> +++ b/tests/qtest/migration/misc-tests.c
> @@ -251,6 +251,79 @@ static void test_validate_uri_channels_none_set(void)
>      do_test_validate_uri_channel(&args);
>  }
>
> +static void test_cpu_hotplug_virtqueues(void)
> +{
> +    MigrateStart args = {
> +        .hide_stderr = false,
> +    };
> +    QTestState *from, *to;
> +    g_autofree char *device_virtio_scsi = NULL;
> +    g_autofree char *device_virtio_blk = NULL;
> +    g_autofree char *filename_scsi = g_strdup_printf("%s/qtest_disk0.qcow2", tmpfs);
> +    g_autofree char *filename_blk = g_strdup_printf("%s/qtest_disk1.qcow2", tmpfs);
> +
> +    /*
> +     * Matching the queues between src/dst allows the test to pass.
> +     * Defaults are:
> +     *   virtio-scsi-pci:   #vcpus + 2 queues that are always present.
> +     *   virtio-blk-pci:    same number of queues as vcpus.
> +     * See virtio_pci_optimal_num_queues() for the exact computation.
> +     */
> +    bool match_queues = false;
> +    const char *qs_scsi = "num_queues=5";
> +    const char *qs_blk = "num-queues=3"; /* _ vs. - is not a mistake */
> +
> +    g_assert(mkimg(filename_scsi, "qcow2", 1));
> +    g_assert(mkimg(filename_blk, "qcow2", 1));
> +
> +    device_virtio_scsi = g_strdup_printf(
> +        "-drive id=drive0,if=none,format=qcow2,file=%s "
> +        "-device virtio-scsi-pci,id=scsi0,%s "
> +        "-device scsi-hd,drive=drive0,bus=scsi0.0 ",
> +        filename_scsi, match_queues ? qs_scsi : "");
> +
> +    device_virtio_blk = g_strdup_printf(
> +        "-drive id=drive1,if=none,format=qcow2,file=%s "
> +        "-device virtio-blk-pci,drive=drive1,id=blk0,%s ",
> +        filename_blk, match_queues ? qs_blk : "");
> +
> +    /* The missing cpu will be hotplugged before migration */
> +    args.opts_source = g_strconcat("-smp 2,threads=1,sockets=1,maxcpus=4 ",
> +                                   device_virtio_scsi,
> +                                   device_virtio_blk, NULL);
> +
> +    args.opts_target = g_strconcat("-smp 3,threads=1,sockets=1,maxcpus=4 ",
> +                                   device_virtio_scsi,
> +                                   device_virtio_blk, NULL);
> +
> +    if (migrate_start(&from, &to, "defer", &args)) {
> +        return;
> +    }
> +
> +    g_free((char *)args.opts_source);
> +    g_free((char *)args.opts_target);
> +
> +
> +    migrate_ensure_converge(from);
> +    wait_for_serial("src_serial");
> +
> +    qtest_qmp_assert_success(from, "{ 'execute': 'device_add',"
> +                             "  'arguments': { "
> +                             "    'node-id': 0, 'socket-id': 0,"
> +                             "    'core-id': 2, 'thread-id': 0,"
> +                             "    'id': {},"
> +                             "    'driver': 'qemu64-x86_64-cpu' } }");
> +
> +    migrate_incoming_qmp(to, "tcp:127.0.0.1:0", "{}");
> +
> +    migrate_qmp(from, to, NULL, NULL, "{}");
> +    wait_for_migration_complete(from);
> +
> +    migrate_end(from, to, true);
> +    unlink(filename_scsi);
> +    unlink(filename_blk);
> +}
> +
>  void migration_test_add_misc(MigrationTestEnv *env)
>  {
>      tmpfs = env->tmpfs;
> @@ -279,4 +352,7 @@ void migration_test_add_misc(MigrationTestEnv *env)
>                         test_validate_uri_channels_both_set);
>      migration_test_add("/migration/validate_uri/channels/none_set",
>                         test_validate_uri_channels_none_set);
> +
> +    migration_test_add("/migration/hotplug/cpu/virtqueues",
> +                       test_cpu_hotplug_virtqueues);
>  }
> --
> 2.35.3
>
>
Fabiano Rosas Jan. 14, 2025, 2:15 p.m. UTC | #2
Stefan Hajnoczi <stefanha@gmail.com> writes:

> On Mon, 13 Jan 2025 at 16:09, Fabiano Rosas <farosas@suse.de> wrote:
>>
>> Bug #2594 is about a failure during migration after a cpu hotplug. Add
>> a test that covers that scenario. Start the source with -smp 2 and
>> destination with -smp 3, plug one extra cpu to match and migrate.
>>
>> The issue seems to be a mismatch in the number of virtqueues between
>> the source and destination due to the hotplug not changing the
>> num_queues:
>>
>>   get_pci_config_device: Bad config data: i=0x9a read: 4 device: 5
>>   cmask: ff wmask: 0 w1cmask:0
>>
>> Usage:
>> $ QTEST_QEMU_IMG=./qemu-img QTEST_QEMU_BINARY=./qemu-system-x86_64 \
>>   ./tests/qtest/migration-test -p /x86_64/migration/hotplug/cpu
>>
>> References: https://gitlab.com/qemu-project/qemu/-/issues/2594
>> References: https://issues.redhat.com/browse/RHEL-68302
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> ---
>> As you can see there's no fix attached to this. I haven't reached that
>> part yet, suggestions welcome =). Posting the test case if anyone
>> wants to play with this.
>>
>> (if someone at RH is already working on this, that's fine. I'm just
>> trying to get some upstream bugs to move)
>
> The management tool should set num_queues on the destination to ensure
> migration compatibility.
>

I'm not sure that's feasible. The default num-queues seem like an
implementation detail that the management application would not have a
way to query. Unless it starts the source with a fixed number that
already accounts for all hotplug/unplug operations during the VM
lifetime, which would be wasteful in terms of resources allocated
upfront.

That would also make the destination run with a suboptimal (< #vcpus)
number of queues, although that's already the case in the source after
the hotplug. Do we have any definition on what should happen durgin
hotplug? If one plugs 100 vcpus, should num-queues remain as 2?
diff mbox series

Patch

diff --git a/tests/qtest/migration/misc-tests.c b/tests/qtest/migration/misc-tests.c
index 6173430748..bde567b124 100644
--- a/tests/qtest/migration/misc-tests.c
+++ b/tests/qtest/migration/misc-tests.c
@@ -251,6 +251,79 @@  static void test_validate_uri_channels_none_set(void)
     do_test_validate_uri_channel(&args);
 }
 
+static void test_cpu_hotplug_virtqueues(void)
+{
+    MigrateStart args = {
+        .hide_stderr = false,
+    };
+    QTestState *from, *to;
+    g_autofree char *device_virtio_scsi = NULL;
+    g_autofree char *device_virtio_blk = NULL;
+    g_autofree char *filename_scsi = g_strdup_printf("%s/qtest_disk0.qcow2", tmpfs);
+    g_autofree char *filename_blk = g_strdup_printf("%s/qtest_disk1.qcow2", tmpfs);
+
+    /*
+     * Matching the queues between src/dst allows the test to pass.
+     * Defaults are:
+     *   virtio-scsi-pci:   #vcpus + 2 queues that are always present.
+     *   virtio-blk-pci:    same number of queues as vcpus.
+     * See virtio_pci_optimal_num_queues() for the exact computation.
+     */
+    bool match_queues = false;
+    const char *qs_scsi = "num_queues=5";
+    const char *qs_blk = "num-queues=3"; /* _ vs. - is not a mistake */
+
+    g_assert(mkimg(filename_scsi, "qcow2", 1));
+    g_assert(mkimg(filename_blk, "qcow2", 1));
+
+    device_virtio_scsi = g_strdup_printf(
+        "-drive id=drive0,if=none,format=qcow2,file=%s "
+        "-device virtio-scsi-pci,id=scsi0,%s "
+        "-device scsi-hd,drive=drive0,bus=scsi0.0 ",
+        filename_scsi, match_queues ? qs_scsi : "");
+
+    device_virtio_blk = g_strdup_printf(
+        "-drive id=drive1,if=none,format=qcow2,file=%s "
+        "-device virtio-blk-pci,drive=drive1,id=blk0,%s ",
+        filename_blk, match_queues ? qs_blk : "");
+
+    /* The missing cpu will be hotplugged before migration */
+    args.opts_source = g_strconcat("-smp 2,threads=1,sockets=1,maxcpus=4 ",
+                                   device_virtio_scsi,
+                                   device_virtio_blk, NULL);
+
+    args.opts_target = g_strconcat("-smp 3,threads=1,sockets=1,maxcpus=4 ",
+                                   device_virtio_scsi,
+                                   device_virtio_blk, NULL);
+
+    if (migrate_start(&from, &to, "defer", &args)) {
+        return;
+    }
+
+    g_free((char *)args.opts_source);
+    g_free((char *)args.opts_target);
+
+
+    migrate_ensure_converge(from);
+    wait_for_serial("src_serial");
+
+    qtest_qmp_assert_success(from, "{ 'execute': 'device_add',"
+                             "  'arguments': { "
+                             "    'node-id': 0, 'socket-id': 0,"
+                             "    'core-id': 2, 'thread-id': 0,"
+                             "    'id': {},"
+                             "    'driver': 'qemu64-x86_64-cpu' } }");
+
+    migrate_incoming_qmp(to, "tcp:127.0.0.1:0", "{}");
+
+    migrate_qmp(from, to, NULL, NULL, "{}");
+    wait_for_migration_complete(from);
+
+    migrate_end(from, to, true);
+    unlink(filename_scsi);
+    unlink(filename_blk);
+}
+
 void migration_test_add_misc(MigrationTestEnv *env)
 {
     tmpfs = env->tmpfs;
@@ -279,4 +352,7 @@  void migration_test_add_misc(MigrationTestEnv *env)
                        test_validate_uri_channels_both_set);
     migration_test_add("/migration/validate_uri/channels/none_set",
                        test_validate_uri_channels_none_set);
+
+    migration_test_add("/migration/hotplug/cpu/virtqueues",
+                       test_cpu_hotplug_virtqueues);
 }