diff mbox series

[RFC,v2] x86/cpu: initialize the CPU concurrently

Message ID 90be4860-cbe0-25d4-ccca-75b96ecb4a3c@huawei.com (mailing list archive)
State New, archived
Headers show
Series [RFC,v2] x86/cpu: initialize the CPU concurrently | expand

Commit Message

Zhenyu Ye Dec. 21, 2020, 11:36 a.m. UTC
Providing an optional mechanism to wait for all VCPU threads to be
created outside of qemu_init_vcpu(), so that we can initialize the CPUs
concurrently on the x86 architecture.

This reduces the time needed to create virtual machines. For example, when
haxm is used as the accelerator, cpus_accel->create_vcpu_thread()
takes at least 200 ms for each CPU, extremely prolonging the boot
time.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: eillon <yezhenyu2@huawei.com>
---
 hw/i386/x86.c         |  3 +++
 include/hw/core/cpu.h | 13 +++++++++++++
 softmmu/cpus.c        | 21 +++++++++++++++++++--
 3 files changed, 35 insertions(+), 2 deletions(-)

Comments

Eduardo Habkost Dec. 21, 2020, 9:36 p.m. UTC | #1
On Mon, Dec 21, 2020 at 07:36:18PM +0800, Zhenyu Ye wrote:
> Providing a optional mechanism to wait for all VCPU threads be
> created out of qemu_init_vcpu(), then we can initialize the cpu
> concurrently on the x86 architecture.
> 
> This reduces the time of creating virtual machines. For example, when
> the haxm is used as the accelerator, cpus_accel->create_vcpu_thread()
> will cause at least 200ms for each cpu, extremely prolong the boot
> time.
> 
> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> Signed-off-by: eillon <yezhenyu2@huawei.com>

The patch is easier to follow now, but I have a question that may
be difficult to answer:

What exactly is the meaning of cpu->created=true, and what
exactly would break if we never wait for cpu->created==true at all?

I'm asking that because we might be introducing subtle races
here, if some of the remaining CPU initialization code in
x86_cpu_realizefn() [1] expects the VCPU thread to be already
initialized.

The cpu_reset() call below is one such example (but probably not
the only one).  cpu_reset() ends up calling
kvm_arch_reset_vcpu(), which seems to assume kvm_init_vcpu() was
already called.  With your patch, kvm_init_vcpu() might end up
being called after kvm_arch_reset_vcpu().

Maybe a simpler alternative is to keep the existing thread
creation logic, but changing hax_cpu_thread_fn() to do less work
before calling cpu_thread_signal_created()?

In my testing (without this patch), creation of 8 KVM VCPU
threads in a 4 core machine takes less than 3 ms.  Why is
qemu_init_vcpu() taking so long on haxm?  Which parts of haxm
initialization can be moved after cpu_thread_signal_created(), to
make this better?

---
[1]  For reference, the last few lines of x86_cpu_realizefn() are:

|     qemu_init_vcpu(cs);
| 
|     /*
|      * Most Intel and certain AMD CPUs support hyperthreading. Even though QEMU
|      * fixes this issue by adjusting CPUID_0000_0001_EBX and CPUID_8000_0008_ECX
|      * based on inputs (sockets,cores,threads), it is still better to give
|      * users a warning.
|      *
|      * NOTE: the following code has to follow qemu_init_vcpu(). Otherwise
|      * cs->nr_threads hasn't be populated yet and the checking is incorrect.
|      */
|     if (IS_AMD_CPU(env) &&
|         !(env->features[FEAT_8000_0001_ECX] & CPUID_EXT3_TOPOEXT) &&
|         cs->nr_threads > 1 && !ht_warned) {
|             warn_report("This family of AMD CPU doesn't support "
|                         "hyperthreading(%d)",
|                         cs->nr_threads);
|             error_printf("Please configure -smp options properly"
|                          " or try enabling topoext feature.\n");
|             ht_warned = true;
|     }
| 
|     x86_cpu_apic_realize(cpu, &local_err);
|     if (local_err != NULL) {
|         goto out;
|     }
|     cpu_reset(cs);
| 
|     xcc->parent_realize(dev, &local_err);
| 
| out:
|     if (local_err != NULL) {
|         error_propagate(errp, local_err);
|         return;
|     }
| }



> ---
>  hw/i386/x86.c         |  3 +++
>  include/hw/core/cpu.h | 13 +++++++++++++
>  softmmu/cpus.c        | 21 +++++++++++++++++++--
>  3 files changed, 35 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/i386/x86.c b/hw/i386/x86.c
> index 6329f90ef9..09afff724a 100644
> --- a/hw/i386/x86.c
> +++ b/hw/i386/x86.c
> @@ -108,6 +108,8 @@ void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp)
>      if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) {
>          goto out;
>      }
> +
> +    CPU(cpu)->async_init = true;
>      qdev_realize(DEVICE(cpu), NULL, errp);
> 
>  out:
> @@ -137,6 +139,7 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
>      for (i = 0; i < ms->smp.cpus; i++) {
>          x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
>      }
> +    qemu_wait_all_vcpu_threads_init();
>  }
> 
>  void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count)
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index 8e7552910d..55c2c17d93 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -467,6 +467,12 @@ struct CPUState {
> 
>      /* track IOMMUs whose translations we've cached in the TCG TLB */
>      GArray *iommu_notifiers;
> +
> +    /*
> +     * If true, qemu_init_vcpu() will not wait for the VCPU thread to be created
> +     * before returning.
> +     */
> +    bool async_init;
>  };
> 
>  typedef QTAILQ_HEAD(CPUTailQ, CPUState) CPUTailQ;
> @@ -977,6 +983,13 @@ void start_exclusive(void);
>   */
>  void end_exclusive(void);
> 
> +/**
> + * qemu_wait_all_vcpu_threads_init:
> + *
> + * Wait for all VCPU threads to be created.
> + */
> +void qemu_wait_all_vcpu_threads_init(void);
> +
>  /**
>   * qemu_init_vcpu:
>   * @cpu: The vCPU to initialize.
> diff --git a/softmmu/cpus.c b/softmmu/cpus.c
> index 1dc20b9dc3..d76853d356 100644
> --- a/softmmu/cpus.c
> +++ b/softmmu/cpus.c
> @@ -601,6 +601,23 @@ void cpus_register_accel(const CpusAccel *ca)
>      cpus_accel = ca;
>  }
> 
> +static void qemu_wait_vcpu_thread_init(CPUState *cpu)
> +{
> +    while (!cpu->created) {
> +        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
> +    }
> +}
> +
> +void qemu_wait_all_vcpu_threads_init(void)
> +{
> +    CPUState *cpu;
> +
> +    CPU_FOREACH(cpu) {
> +        printf("***** cpuid: %d\n", cpu->cpu_index);

Debugging leftover.

> +        qemu_wait_vcpu_thread_init(cpu);
> +    }
> +}
> +
>  void qemu_init_vcpu(CPUState *cpu)
>  {
>      MachineState *ms = MACHINE(qdev_get_machine());
> @@ -622,8 +639,8 @@ void qemu_init_vcpu(CPUState *cpu)
>      g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL);
>      cpus_accel->create_vcpu_thread(cpu);
> 
> -    while (!cpu->created) {
> -        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
> +    if (!cpu->async_init) {
> +        qemu_wait_vcpu_thread_init(cpu);
>      }
>  }
> 
> -- 
> 2.22.0.windows.1
>
Zhenyu Ye Dec. 24, 2020, 1:41 p.m. UTC | #2
Hi Eduardo,

Sorry for the delay.

On 2020/12/22 5:36, Eduardo Habkost wrote:
> On Mon, Dec 21, 2020 at 07:36:18PM +0800, Zhenyu Ye wrote:
>> Providing a optional mechanism to wait for all VCPU threads be
>> created out of qemu_init_vcpu(), then we can initialize the cpu
>> concurrently on the x86 architecture.
>>
>> This reduces the time of creating virtual machines. For example, when
>> the haxm is used as the accelerator, cpus_accel->create_vcpu_thread()
>> will cause at least 200ms for each cpu, extremely prolong the boot
>> time.
>>
>> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
>> Signed-off-by: eillon <yezhenyu2@huawei.com>
> 
> The patch is easier to follow now, but I have a question that may
> be difficult to answer:
> 
> What exactly is the meaning of cpu->created=true, and what
> exactly would break if we never wait for cpu->created==true at all?
> 
> I'm asking that because we might be introducing subtle races
> here, if some of the remaining CPU initialization code in
> x86_cpu_realizefn() [1] expects the VCPU thread to be already
> initialized.
> 
> The cpu_reset() call below is one such example (but probably not
> the only one).  cpu_reset() ends up calling
> kvm_arch_reset_vcpu(), which seems to assume kvm_init_vcpu() was
> already called.  With your patch, kvm_init_vcpu() might end up
> being called after kvm_arch_reset_vcpu().
> 

There's a chance that this happens.
Could we move these (after qemu_init_vcpu()) out of x86_cpu_realizefn()
to the x86_cpus_init(), after qemu_wait_all_vcpu_threads_init()?
Such as:

void x86_cpus_init()
{
	foreach (cpu) {
		x86_cpu_new();
	}

	qemu_wait_all_vcpu_threads_init();

	foreach (cpu) {
		x86_cpu_new_post();
	}
}

> Maybe a simpler alternative is to keep the existing thread
> creation logic, but changing hax_cpu_thread_fn() to do less work
> before calling cpu_thread_signal_created()?
> 
> In my testing (without this patch), creation of 8 KVM VCPU
> threads in a 4 core machine takes less than 3 ms.  Why is
> qemu_init_vcpu() taking so long on haxm?  Which parts of haxm
> initialization can be moved after cpu_thread_signal_created(), to
> make this better?
> 

The most time-consuming operation in haxm is ioctl(HAX_VM_IOCTL_VCPU_CREATE).
Sadly, this cannot be split.

Even if we fix the problem in haxm, other accelerators may also have
this problem.  So I think if we can make the x86_cpu_new() concurrently,
we should try to do it.

Thanks,
Zhenyu
Eduardo Habkost Dec. 24, 2020, 6:06 p.m. UTC | #3
On Thu, Dec 24, 2020 at 09:41:10PM +0800, Zhenyu Ye wrote:
> Hi Eduardo,
> 
> Sorry for the delay.
> 
> On 2020/12/22 5:36, Eduardo Habkost wrote:
> > On Mon, Dec 21, 2020 at 07:36:18PM +0800, Zhenyu Ye wrote:
> >> Providing a optional mechanism to wait for all VCPU threads be
> >> created out of qemu_init_vcpu(), then we can initialize the cpu
> >> concurrently on the x86 architecture.
> >>
> >> This reduces the time of creating virtual machines. For example, when
> >> the haxm is used as the accelerator, cpus_accel->create_vcpu_thread()
> >> will cause at least 200ms for each cpu, extremely prolong the boot
> >> time.
> >>

I have just realized one thing: all VCPU thread function
(including hax) keeps holding qemu_global_mutex most of the time.
Are you sure your patch is really making VCPU initialization run
in parallel?  Do you have numbers showing this patch really
improves boot time?


> >> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> >> Signed-off-by: eillon <yezhenyu2@huawei.com>
> > 
> > The patch is easier to follow now, but I have a question that may
> > be difficult to answer:
> > 
> > What exactly is the meaning of cpu->created=true, and what
> > exactly would break if we never wait for cpu->created==true at all?
> > 
> > I'm asking that because we might be introducing subtle races
> > here, if some of the remaining CPU initialization code in
> > x86_cpu_realizefn() [1] expects the VCPU thread to be already
> > initialized.
> > 
> > The cpu_reset() call below is one such example (but probably not
> > the only one).  cpu_reset() ends up calling
> > kvm_arch_reset_vcpu(), which seems to assume kvm_init_vcpu() was
> > already called.  With your patch, kvm_init_vcpu() might end up
> > being called after kvm_arch_reset_vcpu().
> > 
> 
> There's a chance that this happens.
> Could we move these (after qemu_init_vcpu()) out of x86_cpu_realizefn()
> to the x86_cpus_init(), after qemu_wait_all_vcpu_threads_init()?
> Such as:
> 
> void x86_cpus_init()
> {
> 	foreach (cpu) {
> 		x86_cpu_new();
> 	}
> 
> 	qemu_wait_all_vcpu_threads_init();
> 
> 	foreach (cpu) {
> 		x86_cpu_new_post();
> 	}
> }

Maybe that would work, if the caveats are clearly documented.
I'm worried about bugs being introduced if people assume the VCPU
will always be fully initialized and ready to run after
qemu_init_vcpu() is called and qdev_realize() returns.

> 
> > Maybe a simpler alternative is to keep the existing thread
> > creation logic, but changing hax_cpu_thread_fn() to do less work
> > before calling cpu_thread_signal_created()?
> > 
> > In my testing (without this patch), creation of 8 KVM VCPU
> > threads in a 4 core machine takes less than 3 ms.  Why is
> > qemu_init_vcpu() taking so long on haxm?  Which parts of haxm
> > initialization can be moved after cpu_thread_signal_created(), to
> > make this better?
> > 
> 
> The most time-consuming operation in haxm is ioctl(HAX_VM_IOCTL_VCPU_CREATE).
> Saddly this can not be split.
> 
> Even if we fix the problem in haxm, other accelerators may also have
> this problem.  So I think if we can make the x86_cpu_new() concurrently,
> we should try to do it.

Changing the code to run all VCPU initialization actions for all
accelerators concurrently would require carefully reviewing the
VCPU thread code for all accelerators, looking for races.  Sounds
like a challenging task.  We could avoid that if we do something
that will parallelize only what we really need (and know to be
safe).
Zhenyu Ye Dec. 31, 2020, 9:34 a.m. UTC | #4
Hi Eduardo,

On 2020/12/25 2:06, Eduardo Habkost wrote:
>>
>> The most time-consuming operation in haxm is ioctl(HAX_VM_IOCTL_VCPU_CREATE).
>> Saddly this can not be split.
>>
>> Even if we fix the problem in haxm, other accelerators may also have
>> this problem.  So I think if we can make the x86_cpu_new() concurrently,
>> we should try to do it.
> 
> Changing the code to run all VCPU initialization actions for all
> accelerators concurrently would require carefully reviewing the
> VCPU thread code for all accelerators, looking for races.  Sounds
> like a challenging task.  We could avoid that if we do something
> that will parallelize only what we really need (and know to be
> safe).
> 

Yes, we must make sure that all accelerators could work parallelly,
even including the corresponding VCPU_CREATE_IOCTL, which is not
under qemu's control.

Fortunately, we have found out why ioctl(HAX_VM_IOCTL_VCPU_CREATE)
in haxm took such a long time.  It allocated a vtlb when doing vcpu_create(),
which has since been discarded and is useless.  After removing the
corresponding operation, the vCPU initialization time is reduced to within 10 ms.

Thanks for your attention and discussion.

Thanks,
Zhenyu
diff mbox series

Patch

diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 6329f90ef9..09afff724a 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -108,6 +108,8 @@  void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp)
     if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) {
         goto out;
     }
+
+    CPU(cpu)->async_init = true;
     qdev_realize(DEVICE(cpu), NULL, errp);

 out:
@@ -137,6 +139,7 @@  void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
     for (i = 0; i < ms->smp.cpus; i++) {
         x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
     }
+    qemu_wait_all_vcpu_threads_init();
 }

 void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count)
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 8e7552910d..55c2c17d93 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -467,6 +467,12 @@  struct CPUState {

     /* track IOMMUs whose translations we've cached in the TCG TLB */
     GArray *iommu_notifiers;
+
+    /*
+     * If true, qemu_init_vcpu() will not wait for the VCPU thread to be created
+     * before returning.
+     */
+    bool async_init;
 };

 typedef QTAILQ_HEAD(CPUTailQ, CPUState) CPUTailQ;
@@ -977,6 +983,13 @@  void start_exclusive(void);
  */
 void end_exclusive(void);

+/**
+ * qemu_wait_all_vcpu_threads_init:
+ *
+ * Wait for all VCPU threads to be created.
+ */
+void qemu_wait_all_vcpu_threads_init(void);
+
 /**
  * qemu_init_vcpu:
  * @cpu: The vCPU to initialize.
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 1dc20b9dc3..d76853d356 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -601,6 +601,23 @@  void cpus_register_accel(const CpusAccel *ca)
     cpus_accel = ca;
 }

+static void qemu_wait_vcpu_thread_init(CPUState *cpu)
+{
+    while (!cpu->created) {
+        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
+    }
+}
+
+void qemu_wait_all_vcpu_threads_init(void)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        printf("***** cpuid: %d\n", cpu->cpu_index);
+        qemu_wait_vcpu_thread_init(cpu);
+    }
+}
+
 void qemu_init_vcpu(CPUState *cpu)
 {
     MachineState *ms = MACHINE(qdev_get_machine());
@@ -622,8 +639,8 @@  void qemu_init_vcpu(CPUState *cpu)
     g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL);
     cpus_accel->create_vcpu_thread(cpu);

-    while (!cpu->created) {
-        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
+    if (!cpu->async_init) {
+        qemu_wait_vcpu_thread_init(cpu);
     }
 }