diff mbox series

[v14,04/10] accel/tcg: add jit stats and time to TBStatistics

Message ID 20230530083526.2174430-5-fei2.wu@intel.com (mailing list archive)
State New, archived
Headers show
Series TCG code quality tracking | expand

Commit Message

Wu, Fei May 30, 2023, 8:35 a.m. UTC
This collects all the statistics for TBStatistics, not only for the
whole emulation but for each TB.

Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Fei Wu <fei2.wu@intel.com>
---
 accel/tcg/monitor.c           |  20 ++++-
 accel/tcg/tb-stats.c          | 146 ++++++++++++++++++++++++++++++++++
 accel/tcg/tcg-accel-ops.c     |   7 ++
 accel/tcg/translate-all.c     |  70 +++++++++++++++-
 accel/tcg/translator.c        |   7 +-
 include/exec/tb-stats-flags.h |   2 +
 include/exec/tb-stats.h       |  46 +++++++++++
 include/qemu/timer.h          |   6 ++
 include/tcg/tcg.h             |  28 ++++++-
 softmmu/runstate.c            |   9 +++
 tcg/tcg.c                     |  88 ++++++++++++++++++--
 tests/qtest/qmp-cmd-test.c    |   3 +
 12 files changed, 417 insertions(+), 15 deletions(-)

Comments

Markus Armbruster May 30, 2023, 9:37 a.m. UTC | #1
Fei Wu <fei2.wu@intel.com> writes:

> This collects all the statistics for TBStatistics, not only for the
> whole emulation but for each TB.
>
> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Signed-off-by: Fei Wu <fei2.wu@intel.com>
> ---
>  accel/tcg/monitor.c           |  20 ++++-
>  accel/tcg/tb-stats.c          | 146 ++++++++++++++++++++++++++++++++++
>  accel/tcg/tcg-accel-ops.c     |   7 ++
>  accel/tcg/translate-all.c     |  70 +++++++++++++++-
>  accel/tcg/translator.c        |   7 +-
>  include/exec/tb-stats-flags.h |   2 +
>  include/exec/tb-stats.h       |  46 +++++++++++
>  include/qemu/timer.h          |   6 ++
>  include/tcg/tcg.h             |  28 ++++++-
>  softmmu/runstate.c            |   9 +++
>  tcg/tcg.c                     |  88 ++++++++++++++++++--
>  tests/qtest/qmp-cmd-test.c    |   3 +
>  12 files changed, 417 insertions(+), 15 deletions(-)
>
> diff --git a/accel/tcg/monitor.c b/accel/tcg/monitor.c
> index e903dd1d2e..2bc87f2642 100644
> --- a/accel/tcg/monitor.c
> +++ b/accel/tcg/monitor.c
> @@ -15,6 +15,7 @@
>  #include "sysemu/cpus.h"
>  #include "sysemu/cpu-timers.h"
>  #include "sysemu/tcg.h"
> +#include "exec/tb-stats.h"
>  #include "internal.h"
>  
>  
> @@ -69,6 +70,11 @@ HumanReadableText *qmp_x_query_opcount(Error **errp)
>  {
>      g_autoptr(GString) buf = g_string_new("");
>  
> +    if (!tb_stats_collection_enabled()) {
> +        error_setg(errp, "TB information not being recorded.");

From error_setg()'s contract in include/qapi/error.h:

     * The resulting message should be a single phrase, with no newline or
     * trailing punctuation.

Please drop the period.  Same elsewhere, not flagging it again there.

> +        return NULL;
> +    }
> +
>      if (!tcg_enabled()) {
>          error_setg(errp,
>                     "Opcode count information is only available with accel=tcg");
> @@ -80,11 +86,23 @@ HumanReadableText *qmp_x_query_opcount(Error **errp)
>      return human_readable_text_from_str(buf);
>  }
>  
> +#ifdef CONFIG_TCG
> +HumanReadableText *qmp_x_query_profile(Error **errp)
> +{
> +    g_autoptr(GString) buf = g_string_new("");
> +
> +    dump_jit_exec_time_info(dev_time, buf);
> +    dev_time = 0;
> +
> +    return human_readable_text_from_str(buf);
> +}
> +#else
>  HumanReadableText *qmp_x_query_profile(Error **errp)
>  {
> -    error_setg(errp, "Internal profiler not compiled");
> +    error_setg(errp, "TCG should be enabled!");
>      return NULL;
>  }
> +#endif

machine.json has

   ##
   # @x-query-profile:
   #
   # Query TCG profiling information
   #
   # Features:
   #
   # @unstable: This command is meant for debugging.
   #
   # Returns: profile information
   #
   # Since: 6.2
   ##
   { 'command': 'x-query-profile',
     'returns': 'HumanReadableText',
     'if': 'CONFIG_TCG',
     'features': [ 'unstable' ] }

Not changed in this series.

Note the command is conditional on CONFIG_TCG, i.e. code generated for
it is #if defined(CONFIG_TCG).

The only other use is in hmp-commands-info.hx, and it is also guarded by
CONFIG_TCG.

Therefore, your #else is unreachable.  You can delete it along with...

>  
>  static void hmp_tcg_register(void)
>  {

[...]

> diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c
> index 73a670e8fa..749aafe4da 100644
> --- a/tests/qtest/qmp-cmd-test.c
> +++ b/tests/qtest/qmp-cmd-test.c
> @@ -46,6 +46,9 @@ static int query_error_class(const char *cmd)
>          { "query-balloon", ERROR_CLASS_DEVICE_NOT_ACTIVE },
>          { "query-hotpluggable-cpus", ERROR_CLASS_GENERIC_ERROR },
>          { "query-vm-generation-id", ERROR_CLASS_GENERIC_ERROR },
> +#ifndef CONFIG_TCG
> +        { "x-query-profile", ERROR_CLASS_GENERIC_ERROR },
> +#endif

... this entry.

>          /* Only valid with a USB bus added */
>          { "x-query-usb", ERROR_CLASS_GENERIC_ERROR },
>          /* Only valid with accel=tcg */
Wu, Fei May 31, 2023, 12:54 a.m. UTC | #2
On 5/30/2023 5:37 PM, Markus Armbruster wrote:
> Fei Wu <fei2.wu@intel.com> writes:
> 
>> This collects all the statistics for TBStatistics, not only for the
>> whole emulation but for each TB.
>>
>> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Signed-off-by: Fei Wu <fei2.wu@intel.com>
>> ---
>>  accel/tcg/monitor.c           |  20 ++++-
>>  accel/tcg/tb-stats.c          | 146 ++++++++++++++++++++++++++++++++++
>>  accel/tcg/tcg-accel-ops.c     |   7 ++
>>  accel/tcg/translate-all.c     |  70 +++++++++++++++-
>>  accel/tcg/translator.c        |   7 +-
>>  include/exec/tb-stats-flags.h |   2 +
>>  include/exec/tb-stats.h       |  46 +++++++++++
>>  include/qemu/timer.h          |   6 ++
>>  include/tcg/tcg.h             |  28 ++++++-
>>  softmmu/runstate.c            |   9 +++
>>  tcg/tcg.c                     |  88 ++++++++++++++++++--
>>  tests/qtest/qmp-cmd-test.c    |   3 +
>>  12 files changed, 417 insertions(+), 15 deletions(-)
>>
>> diff --git a/accel/tcg/monitor.c b/accel/tcg/monitor.c
>> index e903dd1d2e..2bc87f2642 100644
>> --- a/accel/tcg/monitor.c
>> +++ b/accel/tcg/monitor.c
>> @@ -15,6 +15,7 @@
>>  #include "sysemu/cpus.h"
>>  #include "sysemu/cpu-timers.h"
>>  #include "sysemu/tcg.h"
>> +#include "exec/tb-stats.h"
>>  #include "internal.h"
>>  
>>  
>> @@ -69,6 +70,11 @@ HumanReadableText *qmp_x_query_opcount(Error **errp)
>>  {
>>      g_autoptr(GString) buf = g_string_new("");
>>  
>> +    if (!tb_stats_collection_enabled()) {
>> +        error_setg(errp, "TB information not being recorded.");
> 
> From error_setg()'s contract in include/qapi/error.h:
> 
>      * The resulting message should be a single phrase, with no newline or
>      * trailing punctuation.
> 
> Please drop the period.  Same elsewhere, not flagging it again there.
> 
Got it, will do.

>> +        return NULL;
>> +    }
>> +
>>      if (!tcg_enabled()) {
>>          error_setg(errp,
>>                     "Opcode count information is only available with accel=tcg");
>> @@ -80,11 +86,23 @@ HumanReadableText *qmp_x_query_opcount(Error **errp)
>>      return human_readable_text_from_str(buf);
>>  }
>>  
>> +#ifdef CONFIG_TCG
>> +HumanReadableText *qmp_x_query_profile(Error **errp)
>> +{
>> +    g_autoptr(GString) buf = g_string_new("");
>> +
>> +    dump_jit_exec_time_info(dev_time, buf);
>> +    dev_time = 0;
>> +
>> +    return human_readable_text_from_str(buf);
>> +}
>> +#else
>>  HumanReadableText *qmp_x_query_profile(Error **errp)
>>  {
>> -    error_setg(errp, "Internal profiler not compiled");
>> +    error_setg(errp, "TCG should be enabled!");
>>      return NULL;
>>  }
>> +#endif
> 
> machine.json has
> 
>    ##
>    # @x-query-profile:
>    #
>    # Query TCG profiling information
>    #
>    # Features:
>    #
>    # @unstable: This command is meant for debugging.
>    #
>    # Returns: profile information
>    #
>    # Since: 6.2
>    ##
>    { 'command': 'x-query-profile',
>      'returns': 'HumanReadableText',
>      'if': 'CONFIG_TCG',
>      'features': [ 'unstable' ] }
> 
> Not changed in this series.
> 
> Note the command is conditional on CONFIG_TCG, i.e. code generated for
> it is #if defined(CONFIG_TCG).
> 
> The only other use is in hmp-commands-info.hx, and it is also guarded by
> CONFIG_TCG.
> 
> Therefore, your #else is unreachable.  You can delete it along with...
> 
OK, so qmp_x_query_profile won't get called #ifndef CONFIG_TCG, I will
remove it.

Thanks,
Fei.

>>  
>>  static void hmp_tcg_register(void)
>>  {
> 
> [...]
> 
>> diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c
>> index 73a670e8fa..749aafe4da 100644
>> --- a/tests/qtest/qmp-cmd-test.c
>> +++ b/tests/qtest/qmp-cmd-test.c
>> @@ -46,6 +46,9 @@ static int query_error_class(const char *cmd)
>>          { "query-balloon", ERROR_CLASS_DEVICE_NOT_ACTIVE },
>>          { "query-hotpluggable-cpus", ERROR_CLASS_GENERIC_ERROR },
>>          { "query-vm-generation-id", ERROR_CLASS_GENERIC_ERROR },
>> +#ifndef CONFIG_TCG
>> +        { "x-query-profile", ERROR_CLASS_GENERIC_ERROR },
>> +#endif
> 
> ... this entry.
> 
>>          /* Only valid with a USB bus added */
>>          { "x-query-usb", ERROR_CLASS_GENERIC_ERROR },
>>          /* Only valid with accel=tcg */
>
Richard Henderson June 1, 2023, 1:08 a.m. UTC | #3
On 5/30/23 01:35, Fei Wu wrote:
> +static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
> +{
> +    struct jit_profile_info *jpi = userp;
> +    TBStatistics *tbs = p;
> +
> +    jpi->translations += tbs->translations.total;
> +    jpi->ops += tbs->code.num_tcg_ops;
> +    if (stat_per_translation(tbs, code.num_tcg_ops) > jpi->ops_max) {
> +        jpi->ops_max = stat_per_translation(tbs, code.num_tcg_ops);
> +    }
> +    jpi->del_ops += tbs->code.deleted_ops;
> +    jpi->temps += tbs->code.temps;
> +    if (stat_per_translation(tbs, code.temps) > jpi->temps_max) {
> +        jpi->temps_max = stat_per_translation(tbs, code.temps);
> +    }
> +    jpi->host += tbs->code.out_len;
> +    jpi->guest += tbs->code.in_len;
> +    jpi->search_data += tbs->code.search_out_len;
> +
> +    jpi->interm_time += stat_per_translation(tbs, gen_times.ir);
> +    jpi->opt_time += stat_per_translation(tbs, gen_times.ir_opt);
> +    jpi->la_time += stat_per_translation(tbs, gen_times.la);
> +    jpi->code_time += stat_per_translation(tbs, gen_times.code);
> +
> +    /*
> +     * The restore time covers how long we have spent restoring state
> +     * from a given TB (e.g. recovering from a fault). It is therefor
> +     * not related to the number of translations we have done.
> +     */
> +    jpi->restore_time += tbs->tb_restore_time;
> +    jpi->restore_count += tbs->tb_restore_count;
> +}

Why do sums of averages (stats_per_translation) instead of accumulating the complete total 
and dividing by jpi->translations?

> +void dump_jit_exec_time_info(uint64_t dev_time, GString *buf)
> +{
> +    static uint64_t last_cpu_exec_time;
> +    uint64_t cpu_exec_time;
> +    uint64_t delta;
> +
> +    cpu_exec_time = tcg_cpu_exec_time();
> +    delta = cpu_exec_time - last_cpu_exec_time;
> +
> +    g_string_append_printf(buf, "async time  %" PRId64 " (%0.3f)\n",
> +                           dev_time, dev_time / (double)NANOSECONDS_PER_SECOND);
> +    g_string_append_printf(buf, "qemu time   %" PRId64 " (%0.3f)\n",
> +                           delta, delta / (double)NANOSECONDS_PER_SECOND);
> +    last_cpu_exec_time = cpu_exec_time;
> +}
> +
> +/* dump JIT statisticis using TCGProfile and TBStats */

"statistics"

> diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c
> index 3973591508..749ad182f2 100644
> --- a/accel/tcg/tcg-accel-ops.c
> +++ b/accel/tcg/tcg-accel-ops.c
> @@ -70,10 +70,17 @@ void tcg_cpus_destroy(CPUState *cpu)
>   int tcg_cpus_exec(CPUState *cpu)
>   {
>       int ret;
> +    uint64_t ti;
> +
>       assert(tcg_enabled());
> +    ti = profile_getclock();
> +
>       cpu_exec_start(cpu);
>       ret = cpu_exec(cpu);
>       cpu_exec_end(cpu);
> +
> +    qatomic_add(&tcg_ctx->prof.cpu_exec_time, profile_getclock() - ti);

You can't qatomic_add a 64-bit value on a 32-bit host.
Nor is tcg_ctx a good place to put this.

If you want to accumulate per-cpu data, put it on the cpu where there's no chance of 
racing with anyone.

Finally, I suspect that this isn't even where you want to accumulate time for execution as 
separate from translation.  You'd to that in cpu_exec_enter/cpu_exec_exit.

> @@ -296,6 +315,8 @@ static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
>       new_stats->cs_base = cs_base;
>       new_stats->flags = flags;
>       new_stats->stats_enabled = get_default_tbstats_flag();
> +    new_stats->tbs = g_ptr_array_sized_new(4);

Why default to 4?  Is that just a guess, or are we really recomputing tbs that frequently? 
  Is there a good reason not to use g_ptr_array_new()?

> diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
> index 80ffbfb455..a60a92091b 100644
> --- a/accel/tcg/translator.c
> +++ b/accel/tcg/translator.c
> @@ -19,7 +19,7 @@
>   #include "exec/plugin-gen.h"
>   #include "exec/replay-core.h"
>   
> -static void gen_tb_exec_count(TranslationBlock *tb)
> +static inline void gen_tb_exec_count(TranslationBlock *tb)

Why?

>   {
>       if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>           TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
> @@ -147,6 +147,11 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
>       tb->size = db->pc_next - db->pc_first;
>       tb->icount = db->num_insns;
>   
> +    /* Save number of guest instructions for TB_JIT_STATS */
> +    if (tb_stats_enabled(tb, TB_JIT_STATS)) {
> +        tb->tb_stats->code.num_guest_inst += db->num_insns;
> +    }

Why do this here, as opposed to the block in tb_gen_code?

> +#define stat_per_translation(stat, name) \
> +    (stat->translations.total ? stat->name / stat->translations.total : 0)

Not a fan of this macro, and as per above, the reason for doing the division here is 
questionable.

> diff --git a/include/qemu/timer.h b/include/qemu/timer.h
> index 9a91cb1248..ad0da18a5f 100644
> --- a/include/qemu/timer.h
> +++ b/include/qemu/timer.h
> @@ -989,4 +989,10 @@ static inline int64_t cpu_get_host_ticks(void)
>   }
>   #endif
>   
> +static inline int64_t profile_getclock(void)
> +{
> +    return get_clock();
> +}

Why?  You use get_clock directly most places.

> +/* Timestamps during translation  */
> +typedef struct TCGTime {
> +    uint64_t start;
> +    uint64_t ir_done;
> +    uint64_t opt_done;
> +    uint64_t la_done;
> +    uint64_t code_done;
> +} TCGTime;

int64_t would match the result of get_clock().

> +
> +/*
> + * The TCGProfile structure holds data for the lifetime of the translator.
> + */
> +typedef struct TCGProfile {
> +    /* exec time of this vcpu */
> +    int64_t cpu_exec_time;

TCGContext is not per-cpu, and therefore TCGProfile does not correspond either.

> @@ -608,6 +630,7 @@ struct TCGContext {
>   
>       /* Exit to translator on overflow. */
>       sigjmp_buf jmp_trans;
> +    TranslationBlock *current_tb;

TCGContext.gen_tb already exists.

> -int64_t tcg_cpu_exec_time(void);
> +uint64_t tcg_cpu_exec_time(void);

Why?  (Also, probably wants removing, per above.)

> --- a/softmmu/runstate.c
> +++ b/softmmu/runstate.c
> @@ -728,9 +728,18 @@ static bool main_loop_should_exit(int *status)
>   int qemu_main_loop(void)
>   {
>       int status = EXIT_SUCCESS;
> +#ifdef CONFIG_TCG
> +    uint64_t ti;
> +#endif
>   
>       while (!main_loop_should_exit(&status)) {
> +#ifdef CONFIG_TCG
> +        ti = profile_getclock();
> +#endif
>           main_loop_wait(false);
> +#ifdef CONFIG_TCG
> +        dev_time += profile_getclock() - ti;
> +#endif
>       }

What is this intending to collect?  Because I don't think it measures anything.  Certainly 
nothing related to TCG, CPUs or even devices.

> +    /* ? won't this end up op_opt - op = del_op_count ? */
> +    if (tb_stats_enabled(s->gen_tb, TB_JIT_STATS)) {
> +        s->gen_tb->tb_stats->code.deleted_ops++;
> +    }

Not quite.  We can emit new ops as well.  But how useful this is on its own is debatable.

> +/* avoid copy/paste errors */
> +#define PROF_ADD(to, from, field)                       \
> +    do {                                                \
> +        (to)->field += qatomic_read(&((from)->field));  \
> +    } while (0)

It is only used twice.
In addition, you can't use qatomic_read of a 64-bit variable on a 32-bit host.
You really really need to build e.g. i386.

> @@ -5879,6 +5923,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
>           }
>       }
>   
> +
>   #ifdef CONFIG_DEBUG_TCG

Stray.


r~
Wu, Fei June 1, 2023, 6:48 a.m. UTC | #4
On 6/1/2023 9:08 AM, Richard Henderson wrote:
> On 5/30/23 01:35, Fei Wu wrote:
>> +static void collect_jit_profile_info(void *p, uint32_t hash, void
>> *userp)
>> +{
>> +    struct jit_profile_info *jpi = userp;
>> +    TBStatistics *tbs = p;
>> +
>> +    jpi->translations += tbs->translations.total;
>> +    jpi->ops += tbs->code.num_tcg_ops;
>> +    if (stat_per_translation(tbs, code.num_tcg_ops) > jpi->ops_max) {
>> +        jpi->ops_max = stat_per_translation(tbs, code.num_tcg_ops);
>> +    }
>> +    jpi->del_ops += tbs->code.deleted_ops;
>> +    jpi->temps += tbs->code.temps;
>> +    if (stat_per_translation(tbs, code.temps) > jpi->temps_max) {
>> +        jpi->temps_max = stat_per_translation(tbs, code.temps);
>> +    }
>> +    jpi->host += tbs->code.out_len;
>> +    jpi->guest += tbs->code.in_len;
>> +    jpi->search_data += tbs->code.search_out_len;
>> +
>> +    jpi->interm_time += stat_per_translation(tbs, gen_times.ir);
>> +    jpi->opt_time += stat_per_translation(tbs, gen_times.ir_opt);
>> +    jpi->la_time += stat_per_translation(tbs, gen_times.la);
>> +    jpi->code_time += stat_per_translation(tbs, gen_times.code);
>> +
>> +    /*
>> +     * The restore time covers how long we have spent restoring state
>> +     * from a given TB (e.g. recovering from a fault). It is therefor
>> +     * not related to the number of translations we have done.
>> +     */
>> +    jpi->restore_time += tbs->tb_restore_time;
>> +    jpi->restore_count += tbs->tb_restore_count;
>> +}
> 
> Why do sums of averages (stats_per_translation) instead of accumulating
> the complete total and dividing by jpi->translations?
> 
There has some inconsistency.

>> +void dump_jit_exec_time_info(uint64_t dev_time, GString *buf)
>> +{
>> +    static uint64_t last_cpu_exec_time;
>> +    uint64_t cpu_exec_time;
>> +    uint64_t delta;
>> +
>> +    cpu_exec_time = tcg_cpu_exec_time();
>> +    delta = cpu_exec_time - last_cpu_exec_time;
>> +
>> +    g_string_append_printf(buf, "async time  %" PRId64 " (%0.3f)\n",
>> +                           dev_time, dev_time /
>> (double)NANOSECONDS_PER_SECOND);
>> +    g_string_append_printf(buf, "qemu time   %" PRId64 " (%0.3f)\n",
>> +                           delta, delta /
>> (double)NANOSECONDS_PER_SECOND);
>> +    last_cpu_exec_time = cpu_exec_time;
>> +}
>> +
>> +/* dump JIT statisticis using TCGProfile and TBStats */
> 
> "statistics"
> 
ok.

>> diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c
>> index 3973591508..749ad182f2 100644
>> --- a/accel/tcg/tcg-accel-ops.c
>> +++ b/accel/tcg/tcg-accel-ops.c
>> @@ -70,10 +70,17 @@ void tcg_cpus_destroy(CPUState *cpu)
>>   int tcg_cpus_exec(CPUState *cpu)
>>   {
>>       int ret;
>> +    uint64_t ti;
>> +
>>       assert(tcg_enabled());
>> +    ti = profile_getclock();
>> +
>>       cpu_exec_start(cpu);
>>       ret = cpu_exec(cpu);
>>       cpu_exec_end(cpu);
>> +
>> +    qatomic_add(&tcg_ctx->prof.cpu_exec_time, profile_getclock() - ti);
> 
> You can't qatomic_add a 64-bit value on a 32-bit host.

Right, I only changed the counters to size_t, didn't fix time part. Is
it possible to support it with some kind of locks on 32-bit as a generic
fallback solution? After all, 32-bit host seems not popular, it might be
sacrifice performance a little bit.

For me, the times 'dev_time' and 'cpu_exec_time' is only for the
developer to get a sense of how large part of time spent on such as "ir"
and "code". Alex mentioned in another thread that it's arguable if they
are useful. If not, we can remove it.

> Nor is tcg_ctx a good place to put this.
> 
> If you want to accumulate per-cpu data, put it on the cpu where there's
> no chance of racing with anyone.
> 
> Finally, I suspect that this isn't even where you want to accumulate
> time for execution as separate from translation.  You'd to that in
> cpu_exec_enter/cpu_exec_exit.
> 
>> @@ -296,6 +315,8 @@ static TBStatistics *tb_get_stats(tb_page_addr_t
>> phys_pc, target_ulong pc,
>>       new_stats->cs_base = cs_base;
>>       new_stats->flags = flags;
>>       new_stats->stats_enabled = get_default_tbstats_flag();
>> +    new_stats->tbs = g_ptr_array_sized_new(4);
> 
> Why default to 4?  Is that just a guess, or are we really recomputing
> tbs that frequently?  Is there a good reason not to use g_ptr_array_new()?
> 
Is this the same question in 2019 :) I cannot find the original link
right now. I will try to find it again later.

>> diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
>> index 80ffbfb455..a60a92091b 100644
>> --- a/accel/tcg/translator.c
>> +++ b/accel/tcg/translator.c
>> @@ -19,7 +19,7 @@
>>   #include "exec/plugin-gen.h"
>>   #include "exec/replay-core.h"
>>   -static void gen_tb_exec_count(TranslationBlock *tb)
>> +static inline void gen_tb_exec_count(TranslationBlock *tb)
> 
> Why?
> 
will remove.

>>   {
>>       if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>>           TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
>> @@ -147,6 +147,11 @@ void translator_loop(CPUState *cpu,
>> TranslationBlock *tb, int *max_insns,
>>       tb->size = db->pc_next - db->pc_first;
>>       tb->icount = db->num_insns;
>>   +    /* Save number of guest instructions for TB_JIT_STATS */
>> +    if (tb_stats_enabled(tb, TB_JIT_STATS)) {
>> +        tb->tb_stats->code.num_guest_inst += db->num_insns;
>> +    }
> 
> Why do this here, as opposed to the block in tb_gen_code?
> 
close to scene of action?

>> +#define stat_per_translation(stat, name) \
>> +    (stat->translations.total ? stat->name / stat->translations.total
>> : 0)
> 
> Not a fan of this macro, and as per above, the reason for doing the
> division here is questionable.
> 
>> diff --git a/include/qemu/timer.h b/include/qemu/timer.h
>> index 9a91cb1248..ad0da18a5f 100644
>> --- a/include/qemu/timer.h
>> +++ b/include/qemu/timer.h
>> @@ -989,4 +989,10 @@ static inline int64_t cpu_get_host_ticks(void)
>>   }
>>   #endif
>>   +static inline int64_t profile_getclock(void)
>> +{
>> +    return get_clock();
>> +}
> 
> Why?  You use get_clock directly most places.
> 
for the ones still in TCGProfile, but I think it's okay to remove.

>> +/* Timestamps during translation  */
>> +typedef struct TCGTime {
>> +    uint64_t start;
>> +    uint64_t ir_done;
>> +    uint64_t opt_done;
>> +    uint64_t la_done;
>> +    uint64_t code_done;
>> +} TCGTime;
> 
> int64_t would match the result of get_clock().
> 
OK. btw, there are still several occurrences of u64 for time in the
existing code.

>> +
>> +/*
>> + * The TCGProfile structure holds data for the lifetime of the
>> translator.
>> + */
>> +typedef struct TCGProfile {
>> +    /* exec time of this vcpu */
>> +    int64_t cpu_exec_time;
> 
> TCGContext is not per-cpu, and therefore TCGProfile does not correspond
> either.
> 
>> @@ -608,6 +630,7 @@ struct TCGContext {
>>         /* Exit to translator on overflow. */
>>       sigjmp_buf jmp_trans;
>> +    TranslationBlock *current_tb;
> 
> TCGContext.gen_tb already exists.
> 
no user actually, will remove it.

>> -int64_t tcg_cpu_exec_time(void);
>> +uint64_t tcg_cpu_exec_time(void);
> 
> Why?  (Also, probably wants removing, per above.)
> 
The original patch mentioned 'make it an uint64_t as we won't be dealing
in negative numbers.'

>> --- a/softmmu/runstate.c
>> +++ b/softmmu/runstate.c
>> @@ -728,9 +728,18 @@ static bool main_loop_should_exit(int *status)
>>   int qemu_main_loop(void)
>>   {
>>       int status = EXIT_SUCCESS;
>> +#ifdef CONFIG_TCG
>> +    uint64_t ti;
>> +#endif
>>         while (!main_loop_should_exit(&status)) {
>> +#ifdef CONFIG_TCG
>> +        ti = profile_getclock();
>> +#endif
>>           main_loop_wait(false);
>> +#ifdef CONFIG_TCG
>> +        dev_time += profile_getclock() - ti;
>> +#endif
>>       }
> 
> What is this intending to collect?  Because I don't think it measures
> anything.  Certainly nothing related to TCG, CPUs or even devices.
> 
It's exported to hmp cmd in dump_jit_exec_time_info() together with
cpu_exec_time.

>> +    /* ? won't this end up op_opt - op = del_op_count ? */
>> +    if (tb_stats_enabled(s->gen_tb, TB_JIT_STATS)) {
>> +        s->gen_tb->tb_stats->code.deleted_ops++;
>> +    }
> 
> Not quite.  We can emit new ops as well.  But how useful this is on its
> own is debatable.
> 
okay, so I will remove the comment.

>> +/* avoid copy/paste errors */
>> +#define PROF_ADD(to, from, field)                       \
>> +    do {                                                \
>> +        (to)->field += qatomic_read(&((from)->field));  \
>> +    } while (0)
> 
> It is only used twice.
> In addition, you can't use qatomic_read of a 64-bit variable on a 32-bit
> host.
> You really really need to build e.g. i386.
> 
>> @@ -5879,6 +5923,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock
>> *tb, uint64_t pc_start)
>>           }
>>       }
>>   +
>>   #ifdef CONFIG_DEBUG_TCG
> 
> Stray.
> 
will fix.

Thanks,
Fei.

> 
> r~
Richard Henderson June 1, 2023, 2:10 p.m. UTC | #5
On 5/31/23 23:48, Wu, Fei wrote:
>>> -int64_t tcg_cpu_exec_time(void);
>>> +uint64_t tcg_cpu_exec_time(void);
>>
>> Why?  (Also, probably wants removing, per above.)
>>
> The original patch mentioned 'make it an uint64_t as we won't be dealing
> in negative numbers.'

The signed vs unsigned thing is something that should be handled throughout everything 
that handles times, not adjusted here and there by only profiling.

>>> --- a/softmmu/runstate.c
>>> +++ b/softmmu/runstate.c
>>> @@ -728,9 +728,18 @@ static bool main_loop_should_exit(int *status)
>>>    int qemu_main_loop(void)
>>>    {
>>>        int status = EXIT_SUCCESS;
>>> +#ifdef CONFIG_TCG
>>> +    uint64_t ti;
>>> +#endif
>>>          while (!main_loop_should_exit(&status)) {
>>> +#ifdef CONFIG_TCG
>>> +        ti = profile_getclock();
>>> +#endif
>>>            main_loop_wait(false);
>>> +#ifdef CONFIG_TCG
>>> +        dev_time += profile_getclock() - ti;
>>> +#endif
>>>        }
>>
>> What is this intending to collect?  Because I don't think it measures
>> anything.  Certainly nothing related to TCG, CPUs or even devices.
>>
> It's exported to hmp cmd in dump_jit_exec_time_info() together with
> cpu_exec_time.

That doesn't answer my question: What do you think it measures?

r~
Richard Henderson June 1, 2023, 3:10 p.m. UTC | #6
On 5/31/23 23:48, Wu, Fei wrote:
> On 6/1/2023 9:08 AM, Richard Henderson wrote:
>> On 5/30/23 01:35, Fei Wu wrote:
>>> +    qatomic_add(&tcg_ctx->prof.cpu_exec_time, profile_getclock() - ti);
>>
>> You can't qatomic_add a 64-bit value on a 32-bit host.
> 
> Right, I only changed the counters to size_t, didn't fix time part. Is
> it possible to support it with some kind of locks on 32-bit as a generic
> fallback solution? After all, 32-bit host seems not popular, it might be
> sacrifice performance a little bit.

We have

int64_t  qatomic_read_i64(const int64_t *ptr);
uint64_t qatomic_read_u64(const uint64_t *ptr);
void qatomic_set_i64(int64_t *ptr, int64_t val);
void qatomic_set_u64(uint64_t *ptr, uint64_t val);

to support 32-bit hosts. You must also use

typedef int64_t aligned_int64_t __attribute__((aligned(8)));
typedef uint64_t aligned_uint64_t __attribute__((aligned(8)));


r~
diff mbox series

Patch

diff --git a/accel/tcg/monitor.c b/accel/tcg/monitor.c
index e903dd1d2e..2bc87f2642 100644
--- a/accel/tcg/monitor.c
+++ b/accel/tcg/monitor.c
@@ -15,6 +15,7 @@ 
 #include "sysemu/cpus.h"
 #include "sysemu/cpu-timers.h"
 #include "sysemu/tcg.h"
+#include "exec/tb-stats.h"
 #include "internal.h"
 
 
@@ -69,6 +70,11 @@  HumanReadableText *qmp_x_query_opcount(Error **errp)
 {
     g_autoptr(GString) buf = g_string_new("");
 
+    if (!tb_stats_collection_enabled()) {
+        error_setg(errp, "TB information not being recorded.");
+        return NULL;
+    }
+
     if (!tcg_enabled()) {
         error_setg(errp,
                    "Opcode count information is only available with accel=tcg");
@@ -80,11 +86,23 @@  HumanReadableText *qmp_x_query_opcount(Error **errp)
     return human_readable_text_from_str(buf);
 }
 
+#ifdef CONFIG_TCG
+HumanReadableText *qmp_x_query_profile(Error **errp)
+{
+    g_autoptr(GString) buf = g_string_new("");
+
+    dump_jit_exec_time_info(dev_time, buf);
+    dev_time = 0;
+
+    return human_readable_text_from_str(buf);
+}
+#else
 HumanReadableText *qmp_x_query_profile(Error **errp)
 {
-    error_setg(errp, "Internal profiler not compiled");
+    error_setg(errp, "TCG should be enabled!");
     return NULL;
 }
+#endif
 
 static void hmp_tcg_register(void)
 {
diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
index 143a52ef5c..78a3104c7f 100644
--- a/accel/tcg/tb-stats.c
+++ b/accel/tcg/tb-stats.c
@@ -9,6 +9,11 @@ 
 #include "qemu/osdep.h"
 
 #include "disas/disas.h"
+#include "exec/exec-all.h"
+#include "tcg/tcg.h"
+
+#include "qemu/qemu-print.h"
+#include "qemu/timer.h"
 
 #include "exec/tb-stats.h"
 #include "tb-context.h"
@@ -24,6 +29,147 @@  enum TBStatsStatus {
 static enum TBStatsStatus tcg_collect_tb_stats;
 static uint32_t default_tbstats_flag;
 
+uint64_t dev_time;
+
+struct jit_profile_info {
+    uint64_t translations;
+    uint64_t aborted;
+    uint64_t ops;
+    unsigned ops_max;
+    uint64_t del_ops;
+    uint64_t temps;
+    unsigned temps_max;
+    uint64_t host;
+    uint64_t guest;
+    uint64_t search_data;
+
+    uint64_t interm_time;
+    uint64_t code_time;
+    uint64_t restore_count;
+    uint64_t restore_time;
+    uint64_t opt_time;
+    uint64_t la_time;
+};
+
+/* accumulate the statistics from all TBs */
+static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
+{
+    struct jit_profile_info *jpi = userp;
+    TBStatistics *tbs = p;
+
+    jpi->translations += tbs->translations.total;
+    jpi->ops += tbs->code.num_tcg_ops;
+    if (stat_per_translation(tbs, code.num_tcg_ops) > jpi->ops_max) {
+        jpi->ops_max = stat_per_translation(tbs, code.num_tcg_ops);
+    }
+    jpi->del_ops += tbs->code.deleted_ops;
+    jpi->temps += tbs->code.temps;
+    if (stat_per_translation(tbs, code.temps) > jpi->temps_max) {
+        jpi->temps_max = stat_per_translation(tbs, code.temps);
+    }
+    jpi->host += tbs->code.out_len;
+    jpi->guest += tbs->code.in_len;
+    jpi->search_data += tbs->code.search_out_len;
+
+    jpi->interm_time += stat_per_translation(tbs, gen_times.ir);
+    jpi->opt_time += stat_per_translation(tbs, gen_times.ir_opt);
+    jpi->la_time += stat_per_translation(tbs, gen_times.la);
+    jpi->code_time += stat_per_translation(tbs, gen_times.code);
+
+    /*
+     * The restore time covers how long we have spent restoring state
+     * from a given TB (e.g. recovering from a fault). It is therefor
+     * not related to the number of translations we have done.
+     */
+    jpi->restore_time += tbs->tb_restore_time;
+    jpi->restore_count += tbs->tb_restore_count;
+}
+
+void dump_jit_exec_time_info(uint64_t dev_time, GString *buf)
+{
+    static uint64_t last_cpu_exec_time;
+    uint64_t cpu_exec_time;
+    uint64_t delta;
+
+    cpu_exec_time = tcg_cpu_exec_time();
+    delta = cpu_exec_time - last_cpu_exec_time;
+
+    g_string_append_printf(buf, "async time  %" PRId64 " (%0.3f)\n",
+                           dev_time, dev_time / (double)NANOSECONDS_PER_SECOND);
+    g_string_append_printf(buf, "qemu time   %" PRId64 " (%0.3f)\n",
+                           delta, delta / (double)NANOSECONDS_PER_SECOND);
+    last_cpu_exec_time = cpu_exec_time;
+}
+
+/* dump JIT statisticis using TCGProfile and TBStats */
+void dump_jit_profile_info(TCGProfile *s, GString *buf)
+{
+    if (!tb_stats_collection_enabled()) {
+        return;
+    }
+
+    struct jit_profile_info *jpi = g_new0(struct jit_profile_info, 1);
+
+    qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, jpi);
+
+    if (jpi->translations) {
+        g_string_append_printf(buf, "translated TBs      %" PRId64 "\n",
+                jpi->translations);
+        g_string_append_printf(buf, "avg ops/TB          %0.1f max=%d\n",
+                jpi->ops / (double) jpi->translations, jpi->ops_max);
+        g_string_append_printf(buf, "deleted ops/TB      %0.2f\n",
+                jpi->del_ops / (double) jpi->translations);
+        g_string_append_printf(buf, "avg temps/TB        %0.2f max=%d\n",
+                jpi->temps / (double) jpi->translations, jpi->temps_max);
+        g_string_append_printf(buf, "avg host code/TB    %0.1f\n",
+                jpi->host / (double) jpi->translations);
+        g_string_append_printf(buf, "avg search data/TB  %0.1f\n",
+                jpi->search_data / (double) jpi->translations);
+
+        uint64_t tot = jpi->interm_time + jpi->code_time;
+
+        g_string_append_printf(buf, "JIT cycles          %" PRId64
+                " (%0.3fs at 2.4 GHz)\n",
+                tot, tot / 2.4e9);
+        g_string_append_printf(buf, "  cycles/op           %0.1f\n",
+                jpi->ops ? (double)tot / jpi->ops : 0);
+        g_string_append_printf(buf, "  cycles/in byte      %0.1f\n",
+                jpi->guest ? (double)tot / jpi->guest : 0);
+        g_string_append_printf(buf, "  cycles/out byte     %0.1f\n",
+                jpi->host ? (double)tot / jpi->host : 0);
+        g_string_append_printf(buf, "  cycles/search byte  %0.1f\n",
+                jpi->search_data ? (double)tot / jpi->search_data : 0);
+        if (tot == 0) {
+            tot = 1;
+        }
+
+        g_string_append_printf(buf, "  gen_interm time     %0.1f%%\n",
+                (double)jpi->interm_time / tot * 100.0);
+        g_string_append_printf(buf, "  gen_code time       %0.1f%%\n",
+                (double)jpi->code_time / tot * 100.0);
+
+        g_string_append_printf(buf, "    optim./code time    %0.1f%%\n",
+                (double)jpi->opt_time / (jpi->code_time ? jpi->code_time : 1)
+                    * 100.0);
+        g_string_append_printf(buf, "    liveness/code time  %0.1f%%\n",
+                (double)jpi->la_time / (jpi->code_time ? jpi->code_time : 1)
+                    * 100.0);
+
+        g_string_append_printf(buf, "cpu_restore count   %" PRId64 "\n",
+                jpi->restore_count);
+        g_string_append_printf(buf, "  avg cycles        %0.1f\n",
+                jpi->restore_count ?
+                    (double)jpi->restore_time / jpi->restore_count : 0);
+
+        if (s) {
+            g_string_append_printf(buf, "cpu exec time  %" PRId64 " (%0.3fs)\n",
+                s->cpu_exec_time,
+                s->cpu_exec_time / (double) NANOSECONDS_PER_SECOND);
+        }
+    }
+    g_free(jpi);
+}
+
 void init_tb_stats_htable(void)
 {
     if (!tb_ctx.tb_stats.map && tb_stats_collection_enabled()) {
diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c
index 3973591508..749ad182f2 100644
--- a/accel/tcg/tcg-accel-ops.c
+++ b/accel/tcg/tcg-accel-ops.c
@@ -70,10 +70,17 @@  void tcg_cpus_destroy(CPUState *cpu)
 int tcg_cpus_exec(CPUState *cpu)
 {
     int ret;
+    uint64_t ti;
+
     assert(tcg_enabled());
+    ti = profile_getclock();
+
     cpu_exec_start(cpu);
     ret = cpu_exec(cpu);
     cpu_exec_end(cpu);
+
+    qatomic_add(&tcg_ctx->prof.cpu_exec_time, profile_getclock() - ti);
+
     return ret;
 }
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index dadf49954f..8733aec11b 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -204,6 +204,12 @@  void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
                                uintptr_t host_pc)
 {
     uint64_t data[TARGET_INSN_START_WORDS];
+    uint64_t ti = 0;
+
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        ti = get_clock();
+    }
+
     int insns_left = cpu_unwind_data_from_tb(tb, host_pc, data);
 
     if (insns_left < 0) {
@@ -220,6 +226,15 @@  void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     }
 
     cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data);
+
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        TBStatistics *ts = tb->tb_stats;
+        uint64_t elapsed = get_clock() - ti;
+        qemu_mutex_lock(&ts->jit_stats_lock);
+        ts->tb_restore_time += elapsed;
+        ts->tb_restore_count++;
+        qemu_mutex_unlock(&ts->jit_stats_lock);
+    }
 }
 
 bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc)
@@ -267,7 +282,7 @@  void page_init(void)
  */
 static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
                            target_ulong pc, void *host_pc,
-                           int *max_insns, int64_t *ti)
+                           int *max_insns, TCGTime *tcg_time)
 {
     int ret = sigsetjmp(tcg_ctx->jmp_trans, 0);
     if (unlikely(ret != 0)) {
@@ -282,7 +297,11 @@  static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
     tcg_ctx->cpu = NULL;
     *max_insns = tb->icount;
 
-    return tcg_gen_code(tcg_ctx, tb, pc);
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        tcg_time->ir_done = get_clock();
+    }
+
+    return tcg_gen_code(tcg_ctx, tb, pc, tcg_time);
 }
 
 static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
@@ -296,6 +315,8 @@  static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
     new_stats->cs_base = cs_base;
     new_stats->flags = flags;
     new_stats->stats_enabled = get_default_tbstats_flag();
+    new_stats->tbs = g_ptr_array_sized_new(4);
+    qemu_mutex_init(&new_stats->jit_stats_lock);
 
     /*
      * All initialisation must be complete before we insert into qht
@@ -309,6 +330,7 @@  static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
          * If there is already a TBStatistic for this TB from a previous flush
          * then just make the new TB point to the older TBStatistic
          */
+        g_ptr_array_free(new_stats->tbs, true);
         g_free(new_stats);
         return existing_stats;
     } else {
@@ -327,8 +349,8 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     tb_page_addr_t phys_pc;
     tcg_insn_unit *gen_code_buf;
     int gen_code_size, search_size, max_insns;
-    int64_t ti;
     void *host_pc;
+    TCGTime tcg_time;
 
     assert_memory_lock();
     qemu_thread_jit_write();
@@ -389,11 +411,15 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     if (tb_stats_collection_enabled() &&
         qemu_log_in_addr_range(tb->pc)) {
         tb->tb_stats = tb_get_stats(phys_pc, pc, cs_base, flags);
+        if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+            tcg_time.start = get_clock();
+        }
     } else {
         tb->tb_stats = NULL;
     }
 
-    gen_code_size = setjmp_gen_code(env, tb, pc, host_pc, &max_insns, &ti);
+    gen_code_size = setjmp_gen_code(env, tb, pc, host_pc, &max_insns,
+                                    &tcg_time);
     if (unlikely(gen_code_size < 0)) {
         switch (gen_code_size) {
         case -1:
@@ -445,6 +471,10 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
      */
     perf_report_code(pc, tb, tcg_splitwx_to_rx(gen_code_buf));
 
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        tcg_time.code_done = get_clock();
+    }
+
     if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
         qemu_log_in_addr_range(pc)) {
         FILE *logfile = qemu_log_trylock();
@@ -547,6 +577,38 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
         return tb;
     }
 
+    /*
+     * Collect JIT stats when enabled. We batch them all up here to
+     * avoid spamming the cache with atomic accesses
+     */
+    if (tb_stats_enabled(tb, (TB_JIT_STATS | TB_JIT_TIME))) {
+        TBStatistics *ts = tb->tb_stats;
+        qemu_mutex_lock(&ts->jit_stats_lock);
+
+        if (tb_stats_enabled(tb, TB_JIT_STATS)) {
+            ts->code.num_tcg_ops_opt += tcg_ctx->nb_ops;
+            ts->code.in_len += tb->size;
+            ts->code.out_len += tb->tc.size;
+            ts->code.search_out_len += search_size;
+
+            ts->translations.total++;
+            if (tb_page_addr1(tb) != -1) {
+                ts->translations.spanning++;
+            }
+
+            g_ptr_array_add(ts->tbs, tb);
+        }
+
+        if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+            ts->gen_times.ir += tcg_time.ir_done - tcg_time.start;
+            ts->gen_times.ir_opt += tcg_time.opt_done - tcg_time.ir_done;
+            ts->gen_times.la += tcg_time.la_done - tcg_time.opt_done;
+            ts->gen_times.code += tcg_time.code_done - tcg_time.la_done;
+        }
+
+        qemu_mutex_unlock(&ts->jit_stats_lock);
+    }
+
     /*
      * Insert TB into the corresponding region tree before publishing it
      * through QHT. Otherwise rewinding happened in the TB might fail to
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 80ffbfb455..a60a92091b 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -19,7 +19,7 @@ 
 #include "exec/plugin-gen.h"
 #include "exec/replay-core.h"
 
-static void gen_tb_exec_count(TranslationBlock *tb)
+static inline void gen_tb_exec_count(TranslationBlock *tb)
 {
     if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
         TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
@@ -147,6 +147,11 @@  void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
     tb->size = db->pc_next - db->pc_first;
     tb->icount = db->num_insns;
 
+    /* Save number of guest instructions for TB_JIT_STATS */
+    if (tb_stats_enabled(tb, TB_JIT_STATS)) {
+        tb->tb_stats->code.num_guest_inst += db->num_insns;
+    }
+
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(db->pc_first)) {
         FILE *logfile = qemu_log_trylock();
diff --git a/include/exec/tb-stats-flags.h b/include/exec/tb-stats-flags.h
index fa71eb6f0c..f29eff7576 100644
--- a/include/exec/tb-stats-flags.h
+++ b/include/exec/tb-stats-flags.h
@@ -13,6 +13,8 @@ 
 
 #define TB_NOTHING    (1 << 0)
 #define TB_EXEC_STATS (1 << 1)
+#define TB_JIT_STATS  (1 << 2)
+#define TB_JIT_TIME   (1 << 3)
 
 /* TBStatistic collection controls */
 void enable_collect_tb_stats(void);
diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
index eb1fa92a4e..d93d42e085 100644
--- a/include/exec/tb-stats.h
+++ b/include/exec/tb-stats.h
@@ -34,6 +34,9 @@ 
 #define tb_stats_enabled(tb, JIT_STATS) \
     (tb && tb->tb_stats && (tb->tb_stats->stats_enabled & JIT_STATS))
 
+#define stat_per_translation(stat, name) \
+    (stat->translations.total ? stat->name / stat->translations.total : 0)
+
 typedef struct TBStatistics TBStatistics;
 
 /*
@@ -60,10 +63,53 @@  struct TBStatistics {
         unsigned long atomic;
     } executions;
 
+    /* JIT Stats - protected by lock */
+    QemuMutex jit_stats_lock;
+
+    /* Sum of all operations for all translations */
+    struct {
+        unsigned num_guest_inst;
+        unsigned num_tcg_ops;
+        unsigned num_tcg_ops_opt;
+        unsigned spills;
+
+        /* CONFIG_PROFILE */
+        unsigned temps;
+        unsigned deleted_ops;
+        unsigned in_len;
+        unsigned out_len;
+        unsigned search_out_len;
+    } code;
+
+    struct {
+        unsigned long total;
+        unsigned long uncached;
+        unsigned long spanning;
+    } translations;
+
+    /*
+     * All persistent (cached) TranslationBlocks using
+     * this TBStats structure. Has to be reset on a tb_flush.
+     */
+    GPtrArray *tbs;
+
+    /* Recover state from TB */
+    uint64_t tb_restore_time;
+    uint64_t tb_restore_count;
+
+    struct {
+        uint64_t ir;
+        uint64_t ir_opt;
+        uint64_t la;
+        uint64_t code;
+    } gen_times;
 };
 
 bool tb_stats_cmp(const void *ap, const void *bp);
 
 void init_tb_stats_htable(void);
 
+void dump_jit_profile_info(TCGProfile *s, GString *buf);
+void dump_jit_exec_time_info(uint64_t dev_time, GString *buf);
+
 #endif
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 9a91cb1248..ad0da18a5f 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -989,4 +989,10 @@  static inline int64_t cpu_get_host_ticks(void)
 }
 #endif
 
+static inline int64_t profile_getclock(void)
+{
+    return get_clock();
+}
+
+extern uint64_t dev_time;
 #endif
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 083cbd6580..85bf6fb78e 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -522,6 +522,26 @@  static inline TCGRegSet output_pref(const TCGOp *op, unsigned i)
     return i < ARRAY_SIZE(op->output_pref) ? op->output_pref[i] : 0;
 }
 
+/* Timestamps during translation  */
+typedef struct TCGTime {
+    uint64_t start;
+    uint64_t ir_done;
+    uint64_t opt_done;
+    uint64_t la_done;
+    uint64_t code_done;
+} TCGTime;
+
+/*
+ * The TCGProfile structure holds data for the lifetime of the translator.
+ */
+typedef struct TCGProfile {
+    /* exec time of this vcpu */
+    int64_t cpu_exec_time;
+
+    /* Lifetime count of TCGOps per TCGContext when tb_stats enabled */
+    size_t table_op_count[NB_OPS];
+} TCGProfile;
+
 struct TCGContext {
     uint8_t *pool_cur, *pool_end;
     TCGPool *pool_first, *pool_current, *pool_first_large;
@@ -548,6 +568,8 @@  struct TCGContext {
     tcg_insn_unit *code_buf;      /* pointer for start of tb */
     tcg_insn_unit *code_ptr;      /* pointer for running end of tb */
 
+    TCGProfile prof;
+
 #ifdef CONFIG_DEBUG_TCG
     int goto_tb_issue_mask;
     const TCGOpcode *vecop_list;
@@ -608,6 +630,7 @@  struct TCGContext {
 
     /* Exit to translator on overflow. */
     sigjmp_buf jmp_trans;
+    TranslationBlock *current_tb;
 };
 
 static inline bool temp_readonly(TCGTemp *ts)
@@ -827,7 +850,8 @@  void tcg_register_thread(void);
 void tcg_prologue_init(TCGContext *s);
 void tcg_func_start(TCGContext *s);
 
-int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start);
+int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start,
+                 TCGTime *tcg_time);
 
 void tb_target_set_jmp_target(const TranslationBlock *, int,
                               uintptr_t, uintptr_t);
@@ -885,7 +909,7 @@  static inline TCGv_ptr tcg_temp_new_ptr(void)
     return temp_tcgv_ptr(t);
 }
 
-int64_t tcg_cpu_exec_time(void);
+uint64_t tcg_cpu_exec_time(void);
 void tcg_dump_info(GString *buf);
 void tcg_dump_op_count(GString *buf);
 
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index bd50062ed0..37390799f1 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -728,9 +728,18 @@  static bool main_loop_should_exit(int *status)
 int qemu_main_loop(void)
 {
     int status = EXIT_SUCCESS;
+#ifdef CONFIG_TCG
+    uint64_t ti;
+#endif
 
     while (!main_loop_should_exit(&status)) {
+#ifdef CONFIG_TCG
+        ti = profile_getclock();
+#endif
         main_loop_wait(false);
+#ifdef CONFIG_TCG
+        dev_time += profile_getclock() - ti;
+#endif
     }
 
     return status;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 47befdfcae..dc6b3226e2 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -41,6 +41,7 @@ 
 #define NO_CPU_IO_DEFS
 
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 #include "tcg/tcg-op.h"
 
 #if UINTPTR_MAX == UINT32_MAX
@@ -2983,6 +2984,10 @@  void tcg_op_remove(TCGContext *s, TCGOp *op)
     QTAILQ_REMOVE(&s->ops, op, link);
     QTAILQ_INSERT_TAIL(&s->free_ops, op, link);
     s->nb_ops--;
+    /* ? won't this end up op_opt - op = del_op_count ? */
+    if (tb_stats_enabled(s->gen_tb, TB_JIT_STATS)) {
+        s->gen_tb->tb_stats->code.deleted_ops++;
+    }
 }
 
 void tcg_remove_ops_after(TCGOp *op)
@@ -4148,6 +4153,10 @@  static TCGReg tcg_reg_alloc(TCGContext *s, TCGRegSet required_regs,
     }
 
     /* We must spill something.  */
+    if (tb_stats_enabled(s->gen_tb, TB_JIT_STATS)) {
+        s->gen_tb->tb_stats->code.spills++;
+    }
+
     for (j = f; j < 2; j++) {
         TCGRegSet set = reg_ct[j];
 
@@ -5852,22 +5861,57 @@  static void tcg_out_st_helper_args(TCGContext *s, const TCGLabelQemuLdst *ldst,
     tcg_out_helper_load_common_args(s, ldst, parm, info, next_arg);
 }
 
-void tcg_dump_op_count(GString *buf)
+/* avoid copy/paste errors */
+#define PROF_ADD(to, from, field)                       \
+    do {                                                \
+        (to)->field += qatomic_read(&((from)->field));  \
+    } while (0)
+
+static void collect_tcg_profiler(TCGProfile *prof)
 {
-    g_string_append_printf(buf, "[TCG profiler not compiled]\n");
+    unsigned int n_ctxs = qatomic_read(&tcg_cur_ctxs);
+    unsigned int i;
+
+    for (i = 0; i < n_ctxs; i++) {
+        TCGContext *s = qatomic_read(&tcg_ctxs[i]);
+        const TCGProfile *orig = &s->prof;
+
+        PROF_ADD(prof, orig, cpu_exec_time);
+
+        for (i = 0; i < NB_OPS; i++) {
+            PROF_ADD(prof, orig, table_op_count[i]);
+        }
+    }
 }
 
-int64_t tcg_cpu_exec_time(void)
+uint64_t tcg_cpu_exec_time(void)
 {
-    error_report("%s: TCG profiler not compiled", __func__);
-    exit(EXIT_FAILURE);
+    unsigned int n_ctxs = qatomic_read(&tcg_cur_ctxs);
+    unsigned int i;
+    uint64_t ret = 0;
+
+    for (i = 0; i < n_ctxs; i++) {
+        const TCGContext *s = qatomic_read(&tcg_ctxs[i]);
+        const TCGProfile *prof = &s->prof;
+
+        ret += qatomic_read(&prof->cpu_exec_time);
+    }
+    return ret;
 }
 
-int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
+int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start,
+                 TCGTime *tcg_time)
 {
     int i, num_insns;
     TCGOp *op;
 
+    s->current_tb = tb;
+    /* save pre-optimisation op count */
+    if (tb_stats_enabled(tb, TB_JIT_STATS)) {
+        tb->tb_stats->code.num_tcg_ops += s->nb_ops;
+        tb->tb_stats->code.temps += s->nb_temps;
+    }
+
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
                  && qemu_log_in_addr_range(pc_start))) {
         FILE *logfile = qemu_log_trylock();
@@ -5879,6 +5923,7 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
         }
     }
 
+
 #ifdef CONFIG_DEBUG_TCG
     /* Ensure all labels referenced have been emitted.  */
     {
@@ -5898,6 +5943,10 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
 
     tcg_optimize(s);
 
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        tcg_time->opt_done = get_clock();
+    }
+
     reachable_code_pass(s);
     liveness_pass_0(s);
     liveness_pass_1(s);
@@ -5921,6 +5970,10 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
         }
     }
 
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        tcg_time->la_done = get_clock();
+    }
+
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
                  && qemu_log_in_addr_range(pc_start))) {
         FILE *logfile = qemu_log_trylock();
@@ -5955,6 +6008,13 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
     s->pool_labels = NULL;
 #endif
 
+    if (tb_stats_collection_enabled()) {
+        QTAILQ_FOREACH(op, &s->ops, link) {
+            TCGOpcode opc = op->opc;
+            s->prof.table_op_count[opc]++;
+        }
+    }
+
     num_insns = -1;
     QTAILQ_FOREACH(op, &s->ops, link) {
         TCGOpcode opc = op->opc;
@@ -6053,9 +6113,23 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
     return tcg_current_code_size(s);
 }
 
+void tcg_dump_op_count(GString *buf)
+{
+    TCGProfile prof = {};
+    int i;
+
+    collect_tcg_profiler(&prof);
+    for (i = 0; i < NB_OPS; i++) {
+        g_string_append_printf(buf, "%s %" PRId64 "\n",
+                tcg_op_defs[i].name, prof.table_op_count[i]);
+    }
+}
+
 void tcg_dump_info(GString *buf)
 {
-    g_string_append_printf(buf, "[TCG profiler not compiled]\n");
+    TCGProfile prof = {};
+    collect_tcg_profiler(&prof);
+    dump_jit_profile_info(&prof, buf);
 }
 
 #ifdef ELF_HOST_MACHINE
diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c
index 73a670e8fa..749aafe4da 100644
--- a/tests/qtest/qmp-cmd-test.c
+++ b/tests/qtest/qmp-cmd-test.c
@@ -46,6 +46,9 @@  static int query_error_class(const char *cmd)
         { "query-balloon", ERROR_CLASS_DEVICE_NOT_ACTIVE },
         { "query-hotpluggable-cpus", ERROR_CLASS_GENERIC_ERROR },
         { "query-vm-generation-id", ERROR_CLASS_GENERIC_ERROR },
+#ifndef CONFIG_TCG
+        { "x-query-profile", ERROR_CLASS_GENERIC_ERROR },
+#endif
         /* Only valid with a USB bus added */
         { "x-query-usb", ERROR_CLASS_GENERIC_ERROR },
         /* Only valid with accel=tcg */