Message ID | 20241126190203.3094635-1-pierrick.bouvier@linaro.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | plugins: optimize cpu_index code generation | expand |
On 11/26/24 11:02, Pierrick Bouvier wrote: > When running with a single vcpu, we can return a constant instead of a > load when accessing cpu_index. > A side effect is that all tcg operations using it are optimized, most > notably scoreboard access. > When running a simple loop in user-mode, the speedup is around 20%. > > Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org> > --- > accel/tcg/plugin-gen.c | 7 +++++++ > plugins/core.c | 13 +++++++++++++ > 2 files changed, 20 insertions(+) > > diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c > index 0f47bfbb489..2eabeecbdcf 100644 > --- a/accel/tcg/plugin-gen.c > +++ b/accel/tcg/plugin-gen.c > @@ -102,6 +102,13 @@ static void gen_disable_mem_helper(void) > > static TCGv_i32 gen_cpu_index(void) > { > + /* > + * Optimize when we run with a single vcpu. All values using cpu_index, > + * including scoreboard index, will be optimized out. > + */ > + if (qemu_plugin_num_vcpus() == 1) { > + return tcg_constant_i32(0); > + } > TCGv_i32 cpu_index = tcg_temp_ebb_new_i32(); > tcg_gen_ld_i32(cpu_index, tcg_env, > -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); > diff --git a/plugins/core.c b/plugins/core.c > index bb105e8e688..8e32ca5ee08 100644 > --- a/plugins/core.c > +++ b/plugins/core.c > @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused) > > assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX); > qemu_rec_mutex_lock(&plugin.lock); > + > + /* > + * We want to flush tb when a second cpu appear. > + * When generating plugin code, we optimize cpu_index for num_vcpus == 1. > + */ > + if (plugin.num_vcpus == 1) { > + qemu_rec_mutex_unlock(&plugin.lock); > + start_exclusive(); > + qemu_rec_mutex_lock(&plugin.lock); > + tb_flush(cpu); > + end_exclusive(); > + } > + > plugin.num_vcpus = MAX(plugin.num_vcpus, cpu->cpu_index + 1); > plugin_cpu_update__locked(&cpu->cpu_index, NULL, NULL); > success = g_hash_table_insert(plugin.cpu_ht, &cpu->cpu_index, This patch is for 10.0.
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c index 0f47bfbb489..2eabeecbdcf 100644 --- a/accel/tcg/plugin-gen.c +++ b/accel/tcg/plugin-gen.c @@ -102,6 +102,13 @@ static void gen_disable_mem_helper(void) static TCGv_i32 gen_cpu_index(void) { + /* + * Optimize when we run with a single vcpu. All values using cpu_index, + * including scoreboard index, will be optimized out. + */ + if (qemu_plugin_num_vcpus() == 1) { + return tcg_constant_i32(0); + } TCGv_i32 cpu_index = tcg_temp_ebb_new_i32(); tcg_gen_ld_i32(cpu_index, tcg_env, -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index)); diff --git a/plugins/core.c b/plugins/core.c index bb105e8e688..8e32ca5ee08 100644 --- a/plugins/core.c +++ b/plugins/core.c @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused) assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX); qemu_rec_mutex_lock(&plugin.lock); + + /* + * We want to flush tb when a second cpu appear. + * When generating plugin code, we optimize cpu_index for num_vcpus == 1. + */ + if (plugin.num_vcpus == 1) { + qemu_rec_mutex_unlock(&plugin.lock); + start_exclusive(); + qemu_rec_mutex_lock(&plugin.lock); + tb_flush(cpu); + end_exclusive(); + } + plugin.num_vcpus = MAX(plugin.num_vcpus, cpu->cpu_index + 1); plugin_cpu_update__locked(&cpu->cpu_index, NULL, NULL); success = g_hash_table_insert(plugin.cpu_ht, &cpu->cpu_index,
When running with a single vcpu, we can return a constant instead of a load when accessing cpu_index. A side effect is that all tcg operations using it are optimized, most notably scoreboard access. When running a simple loop in user-mode, the speedup is around 20%. Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org> --- accel/tcg/plugin-gen.c | 7 +++++++ plugins/core.c | 13 +++++++++++++ 2 files changed, 20 insertions(+)