
[v13,04/10] accel/tcg: add jit stats and time to TBStatistics

Message ID 20230529114947.2123652-5-fei2.wu@intel.com (mailing list archive)
State New, archived
Series TCG code quality tracking

Commit Message

Wu, Fei May 29, 2023, 11:49 a.m. UTC
This collects all the statistics for TBStatistics, not only for the
whole emulation but also for each individual TB.

Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Fei Wu <fei2.wu@intel.com>
---
 accel/tcg/monitor.c           |  20 ++++-
 accel/tcg/tb-stats.c          | 146 ++++++++++++++++++++++++++++++++++
 accel/tcg/tcg-accel-ops.c     |   7 ++
 accel/tcg/translate-all.c     |  76 +++++++++++++++++-
 accel/tcg/translator.c        |   5 +-
 include/exec/tb-stats-flags.h |   2 +
 include/exec/tb-stats.h       |  46 +++++++++++
 include/qemu/timer.h          |   6 ++
 include/tcg/tcg.h             |  45 ++++++++++-
 softmmu/runstate.c            |   9 +++
 tcg/tcg.c                     |  82 +++++++++++++++++--
 tests/qtest/qmp-cmd-test.c    |   3 +
 12 files changed, 435 insertions(+), 12 deletions(-)

Comments

Richard Henderson May 30, 2023, 4:07 a.m. UTC | #1
On 5/29/23 04:49, Fei Wu wrote:
> +/*
> + * The TCGProfile structure holds data for analysing the quality of
> + * the code generation. The data is split between stuff that is valid
> + * for the lifetime of a single translation and things that are valid
> + * for the lifetime of the translator. As the former is reset for each
> + * new translation so it should be copied elsewhere if you want to
> + * keep it.
> + *
> + * The structure is safe to access within the context of translation
> + * but accessing the data from elsewhere should be done with safe
> + * work.
> + */
> +typedef struct TCGProfile {
> +
> +    struct {
> +        int nb_guest_insns;
> +        int nb_spills;
> +        int nb_ops_pre_opt;
> +
> +        int del_op_count;
> +        int temp_count;
> +    } translation;
> +
> +    int64_t cpu_exec_time;
> +    int64_t op_count; /* total insn count */
> +    int64_t code_in_len;
> +    int64_t code_out_len;
> +    int64_t search_out_len;
> +
> +    /* Timestamps during translation  */
> +    uint64_t gen_start_time;
> +    uint64_t gen_ir_done_time;
> +    uint64_t gen_opt_done_time;
> +    uint64_t gen_la_done_time;
> +    uint64_t gen_code_done_time;
> +
> +    /* Lifetime count of TCGOps per TCGContext */
> +    uint64_t table_op_count[NB_OPS];
> +} TCGProfile;
> +

Why have you added this back?

The whole point of removing CONFIG_PROFILER to begin with was to have all new code,
not to remove it and then reintroduce it unchanged.

In tcg_gen_code, you have access to the TranslationBlock as s->gen_tb.  There is zero 
point to accumulating into TCGProfile, when you could be accumulating into TCGStatistics 
directly.


r~
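
For readers following along, a minimal sketch of the direction suggested here (illustrative only, not code from this series; the helper name is made up): since tcg_gen_code() can reach the TranslationBlock as s->gen_tb, per-translation counters could be written straight into its TBStatistics rather than staged in a TCGProfile first.

/*
 * Illustrative sketch, not the posted patch: accumulate directly into
 * the TB's TBStatistics from within tcg_gen_code(), using the fields
 * this series introduces (tb_stats, jit_stats_lock, code.*).
 */
static void tb_stats_note_translation(TCGContext *s)
{
    TranslationBlock *tb = s->gen_tb;
    TBStatistics *ts = tb ? tb->tb_stats : NULL;

    if (ts && (ts->stats_enabled & TB_JIT_STATS)) {
        qemu_mutex_lock(&ts->jit_stats_lock);
        ts->code.num_tcg_ops += s->nb_ops;   /* pre-optimisation op count */
        ts->code.temps += s->nb_temps;
        qemu_mutex_unlock(&ts->jit_stats_lock);
    }
}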
Wu, Fei May 30, 2023, 5:01 a.m. UTC | #2
On 5/30/2023 12:07 PM, Richard Henderson wrote:
> On 5/29/23 04:49, Fei Wu wrote:
>> +/*
>> + * The TCGProfile structure holds data for analysing the quality of
>> + * the code generation. The data is split between stuff that is valid
>> + * for the lifetime of a single translation and things that are valid
>> + * for the lifetime of the translator. As the former is reset for each
>> + * new translation so it should be copied elsewhere if you want to
>> + * keep it.
>> + *
>> + * The structure is safe to access within the context of translation
>> + * but accessing the data from elsewhere should be done with safe
>> + * work.
>> + */
>> +typedef struct TCGProfile {
>> +
>> +    struct {
>> +        int nb_guest_insns;
>> +        int nb_spills;
>> +        int nb_ops_pre_opt;
>> +
>> +        int del_op_count;
>> +        int temp_count;
>> +    } translation;
>> +
>> +    int64_t cpu_exec_time;
>> +    int64_t op_count; /* total insn count */
>> +    int64_t code_in_len;
>> +    int64_t code_out_len;
>> +    int64_t search_out_len;
>> +
>> +    /* Timestamps during translation  */
>> +    uint64_t gen_start_time;
>> +    uint64_t gen_ir_done_time;
>> +    uint64_t gen_opt_done_time;
>> +    uint64_t gen_la_done_time;
>> +    uint64_t gen_code_done_time;
>> +
>> +    /* Lifetime count of TCGOps per TCGContext */
>> +    uint64_t table_op_count[NB_OPS];
>> +} TCGProfile;
>> +
> 
> Why have you added this back?
> 
> The whole point of removing CONFIG_PROFILE to begin was to have all new
> code.  Not to remove it then reintroduce it unchanged.
> 
> In tcg_gen_code, you have access to the TranslationBlock as s->gen_tb. 
> There is zero point to accumulating into TCGProfile, when you could be
> accumulating into TCGStatistics directly.
> 
TCGProfile contains global (per-TCGContext) stats, but TBStatistics is
TB-specific; some info in TCGProfile, such as table_op_count, cannot be
summed up from TBStatistics. The per-translation stats in TCGProfile may
indeed be removed.
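
To make the split concrete, this series aggregates the two kinds of data along different axes; both snippets below are condensed from this patch:

/* Per-TB stats: walk every TBStatistics entry in the hash table
 * (accel/tcg/tb-stats.c, dump_jit_profile_info). */
qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, jpi);

/* Per-context stats such as cpu_exec_time and table_op_count: walk every
 * TCGContext instead (tcg/tcg.c, collect_tcg_profiler), since they have
 * no per-TB counterpart to sum from. */
for (i = 0; i < n_ctxs; i++) {
    TCGContext *s = qatomic_read(&tcg_ctxs[i]);
    PROF_ADD(prof, &s->prof, cpu_exec_time);
}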

Thanks,
Fei.

> 
> r~
Wu, Fei May 30, 2023, 6:16 a.m. UTC | #3
On 5/30/2023 1:01 PM, Wu, Fei wrote:
> On 5/30/2023 12:07 PM, Richard Henderson wrote:
>> On 5/29/23 04:49, Fei Wu wrote:
>>> +/*
>>> + * The TCGProfile structure holds data for analysing the quality of
>>> + * the code generation. The data is split between stuff that is valid
>>> + * for the lifetime of a single translation and things that are valid
>>> + * for the lifetime of the translator. As the former is reset for each
>>> + * new translation so it should be copied elsewhere if you want to
>>> + * keep it.
>>> + *
>>> + * The structure is safe to access within the context of translation
>>> + * but accessing the data from elsewhere should be done with safe
>>> + * work.
>>> + */
>>> +typedef struct TCGProfile {
>>> +
>>> +    struct {
>>> +        int nb_guest_insns;
>>> +        int nb_spills;
>>> +        int nb_ops_pre_opt;
>>> +
>>> +        int del_op_count;
>>> +        int temp_count;
>>> +    } translation;
>>> +
>>> +    int64_t cpu_exec_time;
>>> +    int64_t op_count; /* total insn count */
>>> +    int64_t code_in_len;
>>> +    int64_t code_out_len;
>>> +    int64_t search_out_len;
>>> +
>>> +    /* Timestamps during translation  */
>>> +    uint64_t gen_start_time;
>>> +    uint64_t gen_ir_done_time;
>>> +    uint64_t gen_opt_done_time;
>>> +    uint64_t gen_la_done_time;
>>> +    uint64_t gen_code_done_time;
>>> +
>>> +    /* Lifetime count of TCGOps per TCGContext */
>>> +    uint64_t table_op_count[NB_OPS];
>>> +} TCGProfile;
>>> +
>>
>> Why have you added this back?
>>
>> The whole point of removing CONFIG_PROFILE to begin was to have all new
>> code.  Not to remove it then reintroduce it unchanged.
>>
>> In tcg_gen_code, you have access to the TranslationBlock as s->gen_tb. 
>> There is zero point to accumulating into TCGProfile, when you could be
>> accumulating into TCGStatistics directly.
>>
> TCGProfile contains global wide (per TCGContext) stats, but TBStatistics
> is TB specific, some info in TCGProfile such as table_op_count is not
> able to be summed up from TBStatistics. The per-translation stats in
> TCGProfile may be removed indeed.
> 
After some local cleanup, this is what remains in TCGProfile (a sketch
follows this list):
* cpu_exec_time - not guarded by tb_stats_enabled; it could be moved out
as an individual variable?
* gen_xxx_time - effectively global variables shared across functions,
used to calculate ts->gen_times
* table_op_count - it is indeed guarded by tb_stats_enabled, but it's a
large array that might be too large to put into TBStatistics.
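
A hypothetical sketch of what TCGProfile could look like after that cleanup, keeping only the fields listed above (illustrative, not from the posted patch):

typedef struct TCGProfile {
    /* not guarded by tb_stats_enabled; could become a standalone variable */
    int64_t cpu_exec_time;

    /* per-translation timestamps used to derive ts->gen_times */
    uint64_t gen_start_time;
    uint64_t gen_ir_done_time;
    uint64_t gen_opt_done_time;
    uint64_t gen_la_done_time;
    uint64_t gen_code_done_time;

    /* lifetime TCGOp counts, kept per TCGContext because of the array size */
    uint64_t table_op_count[NB_OPS];
} TCGProfile;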

Thanks,
Fei.

> Thanks,
> Fei.
> 
>>
>> r~
>
Alex Bennée May 30, 2023, 10:08 a.m. UTC | #4
"Wu, Fei" <fei2.wu@intel.com> writes:

> On 5/30/2023 1:01 PM, Wu, Fei wrote:
>> On 5/30/2023 12:07 PM, Richard Henderson wrote:
>>> On 5/29/23 04:49, Fei Wu wrote:
>>>> +/*
>>>> + * The TCGProfile structure holds data for analysing the quality of
>>>> + * the code generation. The data is split between stuff that is valid
>>>> + * for the lifetime of a single translation and things that are valid
>>>> + * for the lifetime of the translator. As the former is reset for each
>>>> + * new translation so it should be copied elsewhere if you want to
>>>> + * keep it.
>>>> + *
>>>> + * The structure is safe to access within the context of translation
>>>> + * but accessing the data from elsewhere should be done with safe
>>>> + * work.
>>>> + */
<snip>
>>>> +    int64_t cpu_exec_time;
>>>> +    int64_t op_count; /* total insn count */
>>>> +    int64_t code_in_len;
>>>> +    int64_t code_out_len;
>>>> +    int64_t search_out_len;
>>>> +
>>>> +    /* Timestamps during translation  */
>>>> +    uint64_t gen_start_time;
>>>> +    uint64_t gen_ir_done_time;
>>>> +    uint64_t gen_opt_done_time;
>>>> +    uint64_t gen_la_done_time;
>>>> +    uint64_t gen_code_done_time;
>>>> +
>>>> +    /* Lifetime count of TCGOps per TCGContext */
>>>> +    uint64_t table_op_count[NB_OPS];
>>>> +} TCGProfile;
>>>> +
>>>
>>> Why have you added this back?
>>>
>>> The whole point of removing CONFIG_PROFILE to begin was to have all new
>>> code.  Not to remove it then reintroduce it unchanged.
>>>
>>> In tcg_gen_code, you have access to the TranslationBlock as s->gen_tb. 
>>> There is zero point to accumulating into TCGProfile, when you could be
>>> accumulating into TCGStatistics directly.
>>>
>> TCGProfile contains global wide (per TCGContext) stats, but TBStatistics
>> is TB specific, some info in TCGProfile such as table_op_count is not
>> able to be summed up from TBStatistics. The per-translation stats in
>> TCGProfile may be removed indeed.
>> 
> After some cleanup locally, these are the remains in TCGProfile:
> * cpu_exec_time - which is not guarded by tb_stats_enabled, it could be
> moved out as an individual variable?
> * gen_xxx_time - which in kinda global variables across functions to
> calc ts->gen_times

Given the work on JIT profiling I think there is an argument to drop the
time profile bits and pieces. I think you can get the same information
from a perf run although it does amortise the cost of generation over
all translations. Do we see any particular TBs that are particularly
expensive to translate by more than a standard deviation?

> * table_op_count - it's indeed guarded by tb_stats_enabled, but it's a
> large array, it might be too large to put into TBStaticstics.

Probably. This is probably more interesting information as an aggregate
than per TB.
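
(For reference, that aggregate view is what this patch already exposes through tcg_dump_op_count(), which backs x-query-opcount / 'info opcount'; condensed below.)

/* Condensed from tcg/tcg.c in this patch: sum table_op_count across all
 * TCG contexts, then print one total per opcode. */
void tcg_dump_op_count(GString *buf)
{
    TCGProfile prof = {};
    int i;

    collect_tcg_profiler(&prof);
    for (i = 0; i < NB_OPS; i++) {
        g_string_append_printf(buf, "%s %" PRId64 "\n",
                               tcg_op_defs[i].name, prof.table_op_count[i]);
    }
}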

>
> Thanks,
> Fei.
>
>> Thanks,
>> Fei.
>> 
>>>
>>> r~
>>
Wu, Fei May 31, 2023, 12:46 a.m. UTC | #5
On 5/30/2023 6:08 PM, Alex Bennée wrote:
> 
> "Wu, Fei" <fei2.wu@intel.com> writes:
> 
>> On 5/30/2023 1:01 PM, Wu, Fei wrote:
>>> On 5/30/2023 12:07 PM, Richard Henderson wrote:
>>>> On 5/29/23 04:49, Fei Wu wrote:
>>>>> +/*
>>>>> + * The TCGProfile structure holds data for analysing the quality of
>>>>> + * the code generation. The data is split between stuff that is valid
>>>>> + * for the lifetime of a single translation and things that are valid
>>>>> + * for the lifetime of the translator. As the former is reset for each
>>>>> + * new translation so it should be copied elsewhere if you want to
>>>>> + * keep it.
>>>>> + *
>>>>> + * The structure is safe to access within the context of translation
>>>>> + * but accessing the data from elsewhere should be done with safe
>>>>> + * work.
>>>>> + */
> <snip>
>>>>> +    int64_t cpu_exec_time;
>>>>> +    int64_t op_count; /* total insn count */
>>>>> +    int64_t code_in_len;
>>>>> +    int64_t code_out_len;
>>>>> +    int64_t search_out_len;
>>>>> +
>>>>> +    /* Timestamps during translation  */
>>>>> +    uint64_t gen_start_time;
>>>>> +    uint64_t gen_ir_done_time;
>>>>> +    uint64_t gen_opt_done_time;
>>>>> +    uint64_t gen_la_done_time;
>>>>> +    uint64_t gen_code_done_time;
>>>>> +
>>>>> +    /* Lifetime count of TCGOps per TCGContext */
>>>>> +    uint64_t table_op_count[NB_OPS];
>>>>> +} TCGProfile;
>>>>> +
>>>>
>>>> Why have you added this back?
>>>>
>>>> The whole point of removing CONFIG_PROFILE to begin was to have all new
>>>> code.  Not to remove it then reintroduce it unchanged.
>>>>
>>>> In tcg_gen_code, you have access to the TranslationBlock as s->gen_tb. 
>>>> There is zero point to accumulating into TCGProfile, when you could be
>>>> accumulating into TCGStatistics directly.
>>>>
>>> TCGProfile contains global wide (per TCGContext) stats, but TBStatistics
>>> is TB specific, some info in TCGProfile such as table_op_count is not
>>> able to be summed up from TBStatistics. The per-translation stats in
>>> TCGProfile may be removed indeed.
>>>
>> After some cleanup locally, these are the remains in TCGProfile:
>> * cpu_exec_time - which is not guarded by tb_stats_enabled, it could be
>> moved out as an individual variable?
>> * gen_xxx_time - which in kinda global variables across functions to
>> calc ts->gen_times
> 
> Given the work on JIT profiling I think there is an argument to drop the
> time profile bits and pieces. I think you can get the same information
> from a perf run although it does amortise the cost of generation over
> all translations. Do we see any particular TBs that are particularly
> expensive to translate by more than a standard deviation?
> 
I haven't seen this before, but it may be different if we sort the TBs
by these times. Perhaps we can get some idea of which kinds of code take
more time to optimize than others?

>> * table_op_count - it's indeed guarded by tb_stats_enabled, but it's a
>> large array, it might be too large to put into TBStaticstics.
> 
> Probably. This is probably more interesting information as an aggregate
> than per TB.
> 
Yes, I kept this here in v14.

Thanks,
Fei.

>>
>> Thanks,
>> Fei.
>>
>>> Thanks,
>>> Fei.
>>>
>>>>
>>>> r~
>>>
> 
>
Wu, Fei May 31, 2023, 2:05 a.m. UTC | #6
On 5/30/2023 6:08 PM, Alex Bennée wrote:
> 
> "Wu, Fei" <fei2.wu@intel.com> writes:
> 
>> On 5/30/2023 1:01 PM, Wu, Fei wrote:
>>> On 5/30/2023 12:07 PM, Richard Henderson wrote:
>>>> On 5/29/23 04:49, Fei Wu wrote:
>>>>> +/*
>>>>> + * The TCGProfile structure holds data for analysing the quality of
>>>>> + * the code generation. The data is split between stuff that is valid
>>>>> + * for the lifetime of a single translation and things that are valid
>>>>> + * for the lifetime of the translator. As the former is reset for each
>>>>> + * new translation so it should be copied elsewhere if you want to
>>>>> + * keep it.
>>>>> + *
>>>>> + * The structure is safe to access within the context of translation
>>>>> + * but accessing the data from elsewhere should be done with safe
>>>>> + * work.
>>>>> + */
> <snip>
>>>>> +    int64_t cpu_exec_time;
>>>>> +    int64_t op_count; /* total insn count */
>>>>> +    int64_t code_in_len;
>>>>> +    int64_t code_out_len;
>>>>> +    int64_t search_out_len;
>>>>> +
>>>>> +    /* Timestamps during translation  */
>>>>> +    uint64_t gen_start_time;
>>>>> +    uint64_t gen_ir_done_time;
>>>>> +    uint64_t gen_opt_done_time;
>>>>> +    uint64_t gen_la_done_time;
>>>>> +    uint64_t gen_code_done_time;
>>>>> +
>>>>> +    /* Lifetime count of TCGOps per TCGContext */
>>>>> +    uint64_t table_op_count[NB_OPS];
>>>>> +} TCGProfile;
>>>>> +
>>>>
>>>> Why have you added this back?
>>>>
>>>> The whole point of removing CONFIG_PROFILE to begin was to have all new
>>>> code.  Not to remove it then reintroduce it unchanged.
>>>>
>>>> In tcg_gen_code, you have access to the TranslationBlock as s->gen_tb. 
>>>> There is zero point to accumulating into TCGProfile, when you could be
>>>> accumulating into TCGStatistics directly.
>>>>
>>> TCGProfile contains global wide (per TCGContext) stats, but TBStatistics
>>> is TB specific, some info in TCGProfile such as table_op_count is not
>>> able to be summed up from TBStatistics. The per-translation stats in
>>> TCGProfile may be removed indeed.
>>>
>> After some cleanup locally, these are the remains in TCGProfile:
>> * cpu_exec_time - which is not guarded by tb_stats_enabled, it could be
>> moved out as an individual variable?
>> * gen_xxx_time - which in kinda global variables across functions to
>> calc ts->gen_times
> 
> Given the work on JIT profiling I think there is an argument to drop the
> time profile bits and pieces. I think you can get the same information
> from a perf run although it does amortise the cost of generation over
> all translations. Do we see any particular TBs that are particularly
> expensive to translate by more than a standard deviation?
> 
I added some code locally to sort by ir/code (opt & la are not printed
in 'info tb-list').

(qemu) info tb-list 2 ir
(qemu) TB id:1 | phys:0x2abe14 virt:0xffffffff800abe14 flags:0x01024001
0 inv/1
        | exec:99079/0 guest inst cov:0.03%
        | trans:1 ints: g:6 op:24 op_opt:21 spills:0
        | h/g (host bytes / guest insts): 20.000000
        | time to gen at 2.4GHz => code:840.00(ns) IR:75274.58(ns)

TB id:2 | phys:0x24cf94 virt:0xffffffff8004cf94 flags:0x01024001 0 inv/3
        | exec:99085/0 guest inst cov:0.53%
        | trans:3 ints: g:31 op:86 op_opt:74 spills:0
        | h/g (host bytes / guest insts): 42.322582
        | time to gen at 2.4GHz => code:10011.25(ns) IR:118656.25(ns)

(qemu) info tb-list 2 ir
(qemu) TB id:1 | phys:0x24cf94 virt:0xffffffff8004cf94 flags:0x01024001
0 inv/3
        | exec:131259/0 guest inst cov:0.59%
        | trans:3 ints: g:31 op:86 op_opt:74 spills:0
        | h/g (host bytes / guest insts): 42.322582
        | time to gen at 2.4GHz => code:10011.25(ns) IR:118656.25(ns)

TB id:2 | phys:0x2abe14 virt:0xffffffff800abe14 flags:0x01024001 1 inv/3
        | exec:131541/0 guest inst cov:0.11%
        | trans:3 ints: g:6 op:24 op_opt:21 spills:0
        | h/g (host bytes / guest insts): 20.000000
        | time to gen at 2.4GHz => code:4411.25(ns) IR:88288.33(ns)

(qemu) info tb 2
(qemu)
------------------------------

TB id:2 | phys:0x2abe14 virt:0xffffffff800abe14 flags:0x01024001 1 inv/3
        | exec:668754/0 guest inst cov:0.16%
        | trans:3 ints: g:6 op:24 op_opt:21 spills:0
        | h/g (host bytes / guest insts): 20.000000
        | time to gen at 2.4GHz => code:4411.25(ns) IR:88288.33(ns)

----------------
IN:
Priv: 1; Virt: 0

0xffffffff800abe14:  864a              mv                      a2,s2
0xffffffff800abe16:  85ce              mv                      a1,s3
0xffffffff800abe18:  8526              mv                      a0,s1
0xffffffff800abe1a:  46bd              addi                    a3,zero,15
0xffffffff800abe1c:  fffff097          auipc                   ra,-4096
              # 0xffffffff800aae1c
0xffffffff800abe20:  cc0080e7          jalr                    ra,ra,-832
------------------------------

Look at the TB with phys:0x2abe14: the first time, IR generation takes
75274ns, but in the second command we can see it takes much less time
(note that the IR time is accumulated).

So if the time for the same TB is not consistent, and the deviation
could be dominated by system events such as memory allocation rather
than by codegen itself (?), I think it's less useful.
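
For reference, when these per-TB times are folded into the aggregate dump, the series normalises them with the stat_per_translation() helper from include/exec/tb-stats.h, dividing the accumulated value by the number of translations of that TB:

/* From include/exec/tb-stats.h in this series: an accumulated per-TB
 * counter divided by how many times the TB has been translated. */
#define stat_per_translation(stat, name) \
    (stat->translations.total ? stat->name / stat->translations.total : 0)

/* e.g. the average IR-generation time for one TBStatistics entry */
uint64_t avg_ir = stat_per_translation(tbs, gen_times.ir);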

Alex, regarding dropping the time profiling, do you mean removing
TB_JIT_TIME completely?

Thanks,
Fei.

>> * table_op_count - it's indeed guarded by tb_stats_enabled, but it's a
>> large array, it might be too large to put into TBStaticstics.
> 
> Probably. This is probably more interesting information as an aggregate
> than per TB.
> 
>>
>> Thanks,
>> Fei.
>>
>>> Thanks,
>>> Fei.
>>>
>>>>
>>>> r~
>>>
> 
>
Alex Bennée June 1, 2023, 11:51 a.m. UTC | #7
"Wu, Fei" <fei2.wu@intel.com> writes:

> On 5/30/2023 6:08 PM, Alex Bennée wrote:
>> 
>> "Wu, Fei" <fei2.wu@intel.com> writes:
>> 
>>> On 5/30/2023 1:01 PM, Wu, Fei wrote:
>>>> On 5/30/2023 12:07 PM, Richard Henderson wrote:
>>>>> On 5/29/23 04:49, Fei Wu wrote:
<snip>
> ----------------
> IN:
> Priv: 1; Virt: 0
>
> 0xffffffff800abe14:  864a              mv                      a2,s2
> 0xffffffff800abe16:  85ce              mv                      a1,s3
> 0xffffffff800abe18:  8526              mv                      a0,s1
> 0xffffffff800abe1a:  46bd              addi                    a3,zero,15
> 0xffffffff800abe1c:  fffff097          auipc                   ra,-4096
>               # 0xffffffff800aae1c
> 0xffffffff800abe20:  cc0080e7          jalr                    ra,ra,-832
> ------------------------------
>
> Look at the tb with phys:0x2abe14, although the first time IR takes
> 75274ns, but in the second command we can see it takes much less time
> (Note IR time is accumulated).
>
> So if the time for the same TB is not consistent, and the deviation
> could be dominated by system events such as memory allocation instead of
> codegen itself (?), I think it's less useful.
>
> Alex, regarding dropping time profile, do you mean remove TB_JIT_TIME
> completely?

I think so - perf would do a better job of separating system events from
the core code as it has better visibility of the whole system.
Wu, Fei June 1, 2023, 12:48 p.m. UTC | #8
On 6/1/2023 7:51 PM, Alex Bennée wrote:
> 
> "Wu, Fei" <fei2.wu@intel.com> writes:
> 
>> On 5/30/2023 6:08 PM, Alex Bennée wrote:
>>>
>>> "Wu, Fei" <fei2.wu@intel.com> writes:
>>>
>>>> On 5/30/2023 1:01 PM, Wu, Fei wrote:
>>>>> On 5/30/2023 12:07 PM, Richard Henderson wrote:
>>>>>> On 5/29/23 04:49, Fei Wu wrote:
> <snip>
>> ----------------
>> IN:
>> Priv: 1; Virt: 0
>>
>> 0xffffffff800abe14:  864a              mv                      a2,s2
>> 0xffffffff800abe16:  85ce              mv                      a1,s3
>> 0xffffffff800abe18:  8526              mv                      a0,s1
>> 0xffffffff800abe1a:  46bd              addi                    a3,zero,15
>> 0xffffffff800abe1c:  fffff097          auipc                   ra,-4096
>>               # 0xffffffff800aae1c
>> 0xffffffff800abe20:  cc0080e7          jalr                    ra,ra,-832
>> ------------------------------
>>
>> Look at the tb with phys:0x2abe14, although the first time IR takes
>> 75274ns, but in the second command we can see it takes much less time
>> (Note IR time is accumulated).
>>
>> So if the time for the same TB is not consistent, and the deviation
>> could be dominated by system events such as memory allocation instead of
>> codegen itself (?), I think it's less useful.
>>
>> Alex, regarding dropping time profile, do you mean remove TB_JIT_TIME
>> completely?
> 
> I think so - perf would do a better job of separating system events from
> the core code as it has better visibility of the whole system. 
> 
OK, got it.

Richard, can we reach an agreement here? If yes, I will remove
TB_JIT_TIME and all the time-related stuff (dev_time, cpu_exec_time)
completely. We can still add it back later if it proves to be useful. The
only concern is that tbstats replaces CONFIG_PROFILER but drops this
functionality from it.

Thanks,
Fei.
Richard Henderson June 1, 2023, 3:26 p.m. UTC | #9
On 6/1/23 05:48, Wu, Fei wrote:
> On 6/1/2023 7:51 PM, Alex Bennée wrote:
>>
>> "Wu, Fei" <fei2.wu@intel.com> writes:
>>
>>> On 5/30/2023 6:08 PM, Alex Bennée wrote:
>>>>
>>>> "Wu, Fei" <fei2.wu@intel.com> writes:
>>>>
>>>>> On 5/30/2023 1:01 PM, Wu, Fei wrote:
>>>>>> On 5/30/2023 12:07 PM, Richard Henderson wrote:
>>>>>>> On 5/29/23 04:49, Fei Wu wrote:
>> <snip>
>>> ----------------
>>> IN:
>>> Priv: 1; Virt: 0
>>>
>>> 0xffffffff800abe14:  864a              mv                      a2,s2
>>> 0xffffffff800abe16:  85ce              mv                      a1,s3
>>> 0xffffffff800abe18:  8526              mv                      a0,s1
>>> 0xffffffff800abe1a:  46bd              addi                    a3,zero,15
>>> 0xffffffff800abe1c:  fffff097          auipc                   ra,-4096
>>>                # 0xffffffff800aae1c
>>> 0xffffffff800abe20:  cc0080e7          jalr                    ra,ra,-832
>>> ------------------------------
>>>
>>> Look at the tb with phys:0x2abe14, although the first time IR takes
>>> 75274ns, but in the second command we can see it takes much less time
>>> (Note IR time is accumulated).
>>>
>>> So if the time for the same TB is not consistent, and the deviation
>>> could be dominated by system events such as memory allocation instead of
>>> codegen itself (?), I think it's less useful.
>>>
>>> Alex, regarding dropping time profile, do you mean remove TB_JIT_TIME
>>> completely?
>>
>> I think so - perf would do a better job of separating system events from
>> the core code as it has better visibility of the whole system.
>>
> OK, got it.
> 
> Richard, can we reach the agreement here? If yes, I will remove
> TB_JIT_TIME and all the time stuffs (dev_time, cpu_exec_time)
> completely. We can still add it back if it proves to be useful. The only
> concern is that tbstats replaces CONFIG_PROFILER but drops this function
> from it.

Yes, I'm quite happy to drop all of the time stuff.


r~

Patch

diff --git a/accel/tcg/monitor.c b/accel/tcg/monitor.c
index e903dd1d2e..2bc87f2642 100644
--- a/accel/tcg/monitor.c
+++ b/accel/tcg/monitor.c
@@ -15,6 +15,7 @@ 
 #include "sysemu/cpus.h"
 #include "sysemu/cpu-timers.h"
 #include "sysemu/tcg.h"
+#include "exec/tb-stats.h"
 #include "internal.h"
 
 
@@ -69,6 +70,11 @@  HumanReadableText *qmp_x_query_opcount(Error **errp)
 {
     g_autoptr(GString) buf = g_string_new("");
 
+    if (!tb_stats_collection_enabled()) {
+        error_setg(errp, "TB information not being recorded.");
+        return NULL;
+    }
+
     if (!tcg_enabled()) {
         error_setg(errp,
                    "Opcode count information is only available with accel=tcg");
@@ -80,11 +86,23 @@  HumanReadableText *qmp_x_query_opcount(Error **errp)
     return human_readable_text_from_str(buf);
 }
 
+#ifdef CONFIG_TCG
+HumanReadableText *qmp_x_query_profile(Error **errp)
+{
+    g_autoptr(GString) buf = g_string_new("");
+
+    dump_jit_exec_time_info(dev_time, buf);
+    dev_time = 0;
+
+    return human_readable_text_from_str(buf);
+}
+#else
 HumanReadableText *qmp_x_query_profile(Error **errp)
 {
-    error_setg(errp, "Internal profiler not compiled");
+    error_setg(errp, "TCG should be enabled!");
     return NULL;
 }
+#endif
 
 static void hmp_tcg_register(void)
 {
diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
index 143a52ef5c..78a3104c7f 100644
--- a/accel/tcg/tb-stats.c
+++ b/accel/tcg/tb-stats.c
@@ -9,6 +9,11 @@ 
 #include "qemu/osdep.h"
 
 #include "disas/disas.h"
+#include "exec/exec-all.h"
+#include "tcg/tcg.h"
+
+#include "qemu/qemu-print.h"
+#include "qemu/timer.h"
 
 #include "exec/tb-stats.h"
 #include "tb-context.h"
@@ -24,6 +29,147 @@  enum TBStatsStatus {
 static enum TBStatsStatus tcg_collect_tb_stats;
 static uint32_t default_tbstats_flag;
 
+uint64_t dev_time;
+
+struct jit_profile_info {
+    uint64_t translations;
+    uint64_t aborted;
+    uint64_t ops;
+    unsigned ops_max;
+    uint64_t del_ops;
+    uint64_t temps;
+    unsigned temps_max;
+    uint64_t host;
+    uint64_t guest;
+    uint64_t search_data;
+
+    uint64_t interm_time;
+    uint64_t code_time;
+    uint64_t restore_count;
+    uint64_t restore_time;
+    uint64_t opt_time;
+    uint64_t la_time;
+};
+
+/* accumulate the statistics from all TBs */
+static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
+{
+    struct jit_profile_info *jpi = userp;
+    TBStatistics *tbs = p;
+
+    jpi->translations += tbs->translations.total;
+    jpi->ops += tbs->code.num_tcg_ops;
+    if (stat_per_translation(tbs, code.num_tcg_ops) > jpi->ops_max) {
+        jpi->ops_max = stat_per_translation(tbs, code.num_tcg_ops);
+    }
+    jpi->del_ops += tbs->code.deleted_ops;
+    jpi->temps += tbs->code.temps;
+    if (stat_per_translation(tbs, code.temps) > jpi->temps_max) {
+        jpi->temps_max = stat_per_translation(tbs, code.temps);
+    }
+    jpi->host += tbs->code.out_len;
+    jpi->guest += tbs->code.in_len;
+    jpi->search_data += tbs->code.search_out_len;
+
+    jpi->interm_time += stat_per_translation(tbs, gen_times.ir);
+    jpi->opt_time += stat_per_translation(tbs, gen_times.ir_opt);
+    jpi->la_time += stat_per_translation(tbs, gen_times.la);
+    jpi->code_time += stat_per_translation(tbs, gen_times.code);
+
+    /*
+     * The restore time covers how long we have spent restoring state
+     * from a given TB (e.g. recovering from a fault). It is therefore
+     * not related to the number of translations we have done.
+     */
+    jpi->restore_time += tbs->tb_restore_time;
+    jpi->restore_count += tbs->tb_restore_count;
+}
+
+void dump_jit_exec_time_info(uint64_t dev_time, GString *buf)
+{
+    static uint64_t last_cpu_exec_time;
+    uint64_t cpu_exec_time;
+    uint64_t delta;
+
+    cpu_exec_time = tcg_cpu_exec_time();
+    delta = cpu_exec_time - last_cpu_exec_time;
+
+    g_string_append_printf(buf, "async time  %" PRId64 " (%0.3f)\n",
+                           dev_time, dev_time / (double)NANOSECONDS_PER_SECOND);
+    g_string_append_printf(buf, "qemu time   %" PRId64 " (%0.3f)\n",
+                           delta, delta / (double)NANOSECONDS_PER_SECOND);
+    last_cpu_exec_time = cpu_exec_time;
+}
+
+/* dump JIT statistics using TCGProfile and TBStats */
+void dump_jit_profile_info(TCGProfile *s, GString *buf)
+{
+    if (!tb_stats_collection_enabled()) {
+        return;
+    }
+
+    struct jit_profile_info *jpi = g_new0(struct jit_profile_info, 1);
+
+    qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, jpi);
+
+    if (jpi->translations) {
+        g_string_append_printf(buf, "translated TBs      %" PRId64 "\n",
+                jpi->translations);
+        g_string_append_printf(buf, "avg ops/TB          %0.1f max=%d\n",
+                jpi->ops / (double) jpi->translations, jpi->ops_max);
+        g_string_append_printf(buf, "deleted ops/TB      %0.2f\n",
+                jpi->del_ops / (double) jpi->translations);
+        g_string_append_printf(buf, "avg temps/TB        %0.2f max=%d\n",
+                jpi->temps / (double) jpi->translations, jpi->temps_max);
+        g_string_append_printf(buf, "avg host code/TB    %0.1f\n",
+                jpi->host / (double) jpi->translations);
+        g_string_append_printf(buf, "avg search data/TB  %0.1f\n",
+                jpi->search_data / (double) jpi->translations);
+
+        uint64_t tot = jpi->interm_time + jpi->code_time;
+
+        g_string_append_printf(buf, "JIT cycles          %" PRId64
+                " (%0.3fs at 2.4 GHz)\n",
+                tot, tot / 2.4e9);
+        g_string_append_printf(buf, "  cycles/op           %0.1f\n",
+                jpi->ops ? (double)tot / jpi->ops : 0);
+        g_string_append_printf(buf, "  cycles/in byte      %0.1f\n",
+                jpi->guest ? (double)tot / jpi->guest : 0);
+        g_string_append_printf(buf, "  cycles/out byte     %0.1f\n",
+                jpi->host ? (double)tot / jpi->host : 0);
+        g_string_append_printf(buf, "  cycles/search byte  %0.1f\n",
+                jpi->search_data ? (double)tot / jpi->search_data : 0);
+        if (tot == 0) {
+            tot = 1;
+        }
+
+        g_string_append_printf(buf, "  gen_interm time     %0.1f%%\n",
+                (double)jpi->interm_time / tot * 100.0);
+        g_string_append_printf(buf, "  gen_code time       %0.1f%%\n",
+                (double)jpi->code_time / tot * 100.0);
+
+        g_string_append_printf(buf, "    optim./code time    %0.1f%%\n",
+                (double)jpi->opt_time / (jpi->code_time ? jpi->code_time : 1)
+                    * 100.0);
+        g_string_append_printf(buf, "    liveness/code time  %0.1f%%\n",
+                (double)jpi->la_time / (jpi->code_time ? jpi->code_time : 1)
+                    * 100.0);
+
+        g_string_append_printf(buf, "cpu_restore count   %" PRId64 "\n",
+                jpi->restore_count);
+        g_string_append_printf(buf, "  avg cycles        %0.1f\n",
+                jpi->restore_count ?
+                    (double)jpi->restore_time / jpi->restore_count : 0);
+
+        if (s) {
+            g_string_append_printf(buf, "cpu exec time  %" PRId64 " (%0.3fs)\n",
+                s->cpu_exec_time,
+                s->cpu_exec_time / (double) NANOSECONDS_PER_SECOND);
+        }
+    }
+    g_free(jpi);
+}
+
 void init_tb_stats_htable(void)
 {
     if (!tb_ctx.tb_stats.map && tb_stats_collection_enabled()) {
diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c
index 3973591508..749ad182f2 100644
--- a/accel/tcg/tcg-accel-ops.c
+++ b/accel/tcg/tcg-accel-ops.c
@@ -70,10 +70,17 @@  void tcg_cpus_destroy(CPUState *cpu)
 int tcg_cpus_exec(CPUState *cpu)
 {
     int ret;
+    uint64_t ti;
+
     assert(tcg_enabled());
+    ti = profile_getclock();
+
     cpu_exec_start(cpu);
     ret = cpu_exec(cpu);
     cpu_exec_end(cpu);
+
+    qatomic_add(&tcg_ctx->prof.cpu_exec_time, profile_getclock() - ti);
+
     return ret;
 }
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index dadf49954f..ec72b5121e 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -204,6 +204,12 @@  void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
                                uintptr_t host_pc)
 {
     uint64_t data[TARGET_INSN_START_WORDS];
+    uint64_t ti = 0;
+
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        ti = profile_getclock();
+    }
+
     int insns_left = cpu_unwind_data_from_tb(tb, host_pc, data);
 
     if (insns_left < 0) {
@@ -220,6 +226,15 @@  void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     }
 
     cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data);
+
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        TBStatistics *ts = tb->tb_stats;
+        uint64_t elapsed = profile_getclock() - ti;
+        qemu_mutex_lock(&ts->jit_stats_lock);
+        ts->tb_restore_time += elapsed;
+        ts->tb_restore_count++;
+        qemu_mutex_unlock(&ts->jit_stats_lock);
+    }
 }
 
 bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc)
@@ -267,8 +282,9 @@  void page_init(void)
  */
 static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
                            target_ulong pc, void *host_pc,
-                           int *max_insns, int64_t *ti)
+                           int *max_insns)
 {
+    TCGProfile *prof = &tcg_ctx->prof;
     int ret = sigsetjmp(tcg_ctx->jmp_trans, 0);
     if (unlikely(ret != 0)) {
         return ret;
@@ -282,6 +298,10 @@  static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
     tcg_ctx->cpu = NULL;
     *max_insns = tb->icount;
 
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        prof->gen_ir_done_time = profile_getclock();
+    }
+
     return tcg_gen_code(tcg_ctx, tb, pc);
 }
 
@@ -296,6 +316,8 @@  static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
     new_stats->cs_base = cs_base;
     new_stats->flags = flags;
     new_stats->stats_enabled = get_default_tbstats_flag();
+    new_stats->tbs = g_ptr_array_sized_new(4);
+    qemu_mutex_init(&new_stats->jit_stats_lock);
 
     /*
      * All initialisation must be complete before we insert into qht
@@ -309,6 +331,7 @@  static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
          * If there is already a TBStatistic for this TB from a previous flush
          * then just make the new TB point to the older TBStatistic
          */
+        g_ptr_array_free(new_stats->tbs, true);
         g_free(new_stats);
         return existing_stats;
     } else {
@@ -327,7 +350,7 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     tb_page_addr_t phys_pc;
     tcg_insn_unit *gen_code_buf;
     int gen_code_size, search_size, max_insns;
-    int64_t ti;
+    TCGProfile *prof = &tcg_ctx->prof;
     void *host_pc;
 
     assert_memory_lock();
@@ -389,11 +412,14 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     if (tb_stats_collection_enabled() &&
         qemu_log_in_addr_range(tb->pc)) {
         tb->tb_stats = tb_get_stats(phys_pc, pc, cs_base, flags);
+        if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+            prof->gen_start_time = profile_getclock();
+        }
     } else {
         tb->tb_stats = NULL;
     }
 
-    gen_code_size = setjmp_gen_code(env, tb, pc, host_pc, &max_insns, &ti);
+    gen_code_size = setjmp_gen_code(env, tb, pc, host_pc, &max_insns);
     if (unlikely(gen_code_size < 0)) {
         switch (gen_code_size) {
         case -1:
@@ -445,6 +471,10 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
      */
     perf_report_code(pc, tb, tcg_splitwx_to_rx(gen_code_buf));
 
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        prof->gen_code_done_time = profile_getclock();
+    }
+
     if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
         qemu_log_in_addr_range(pc)) {
         FILE *logfile = qemu_log_trylock();
@@ -547,6 +577,46 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
         return tb;
     }
 
+    /*
+     * Collect JIT stats when enabled. We batch them all up here to
+     * avoid spamming the cache with atomic accesses
+     */
+    if (tb_stats_enabled(tb, (TB_JIT_STATS | TB_JIT_TIME))) {
+        TBStatistics *ts = tb->tb_stats;
+        qemu_mutex_lock(&ts->jit_stats_lock);
+
+        if (tb_stats_enabled(tb, TB_JIT_STATS)) {
+            ts->code.num_guest_inst += prof->translation.nb_guest_insns;
+            ts->code.num_tcg_ops += prof->translation.nb_ops_pre_opt;
+            ts->code.num_tcg_ops_opt += tcg_ctx->nb_ops;
+            ts->code.spills += prof->translation.nb_spills;
+            ts->code.temps += prof->translation.temp_count;
+            ts->code.deleted_ops += prof->translation.del_op_count;
+            ts->code.in_len += tb->size;
+            ts->code.out_len += tb->tc.size;
+            ts->code.search_out_len += search_size;
+
+            ts->translations.total++;
+            if (tb_page_addr1(tb) != -1) {
+                ts->translations.spanning++;
+            }
+
+            g_ptr_array_add(ts->tbs, tb);
+        }
+
+        if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+            ts->gen_times.ir += prof->gen_ir_done_time - prof->gen_start_time;
+            ts->gen_times.ir_opt +=
+                prof->gen_opt_done_time - prof->gen_ir_done_time;
+            ts->gen_times.la +=
+                prof->gen_la_done_time - prof->gen_opt_done_time;
+            ts->gen_times.code +=
+                prof->gen_code_done_time - prof->gen_la_done_time;
+        }
+
+        qemu_mutex_unlock(&ts->jit_stats_lock);
+    }
+
     /*
      * Insert TB into the corresponding region tree before publishing it
      * through QHT. Otherwise rewinding happened in the TB might fail to
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 80ffbfb455..f7c8f3f1f4 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -19,7 +19,7 @@ 
 #include "exec/plugin-gen.h"
 #include "exec/replay-core.h"
 
-static void gen_tb_exec_count(TranslationBlock *tb)
+static inline void gen_tb_exec_count(TranslationBlock *tb)
 {
     if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
         TCGv_ptr ptr = tcg_temp_ebb_new_ptr();
@@ -147,6 +147,9 @@  void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
     tb->size = db->pc_next - db->pc_first;
     tb->icount = db->num_insns;
 
+    /* Save number of guest instructions for TB_JIT_STATS */
+    tcg_ctx->prof.translation.nb_guest_insns = db->num_insns;
+
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(db->pc_first)) {
         FILE *logfile = qemu_log_trylock();
diff --git a/include/exec/tb-stats-flags.h b/include/exec/tb-stats-flags.h
index fa71eb6f0c..f29eff7576 100644
--- a/include/exec/tb-stats-flags.h
+++ b/include/exec/tb-stats-flags.h
@@ -13,6 +13,8 @@ 
 
 #define TB_NOTHING    (1 << 0)
 #define TB_EXEC_STATS (1 << 1)
+#define TB_JIT_STATS  (1 << 2)
+#define TB_JIT_TIME   (1 << 3)
 
 /* TBStatistic collection controls */
 void enable_collect_tb_stats(void);
diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
index eb1fa92a4e..d93d42e085 100644
--- a/include/exec/tb-stats.h
+++ b/include/exec/tb-stats.h
@@ -34,6 +34,9 @@ 
 #define tb_stats_enabled(tb, JIT_STATS) \
     (tb && tb->tb_stats && (tb->tb_stats->stats_enabled & JIT_STATS))
 
+#define stat_per_translation(stat, name) \
+    (stat->translations.total ? stat->name / stat->translations.total : 0)
+
 typedef struct TBStatistics TBStatistics;
 
 /*
@@ -60,10 +63,53 @@  struct TBStatistics {
         unsigned long atomic;
     } executions;
 
+    /* JIT Stats - protected by lock */
+    QemuMutex jit_stats_lock;
+
+    /* Sum of all operations for all translations */
+    struct {
+        unsigned num_guest_inst;
+        unsigned num_tcg_ops;
+        unsigned num_tcg_ops_opt;
+        unsigned spills;
+
+        /* CONFIG_PROFILE */
+        unsigned temps;
+        unsigned deleted_ops;
+        unsigned in_len;
+        unsigned out_len;
+        unsigned search_out_len;
+    } code;
+
+    struct {
+        unsigned long total;
+        unsigned long uncached;
+        unsigned long spanning;
+    } translations;
+
+    /*
+     * All persistent (cached) TranslationBlocks using
+     * this TBStats structure. Has to be reset on a tb_flush.
+     */
+    GPtrArray *tbs;
+
+    /* Recover state from TB */
+    uint64_t tb_restore_time;
+    uint64_t tb_restore_count;
+
+    struct {
+        uint64_t ir;
+        uint64_t ir_opt;
+        uint64_t la;
+        uint64_t code;
+    } gen_times;
 };
 
 bool tb_stats_cmp(const void *ap, const void *bp);
 
 void init_tb_stats_htable(void);
 
+void dump_jit_profile_info(TCGProfile *s, GString *buf);
+void dump_jit_exec_time_info(uint64_t dev_time, GString *buf);
+
 #endif
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 9a91cb1248..ad0da18a5f 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -989,4 +989,10 @@  static inline int64_t cpu_get_host_ticks(void)
 }
 #endif
 
+static inline int64_t profile_getclock(void)
+{
+    return get_clock();
+}
+
+extern uint64_t dev_time;
 #endif
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 083cbd6580..fbd633322b 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -522,6 +522,46 @@  static inline TCGRegSet output_pref(const TCGOp *op, unsigned i)
     return i < ARRAY_SIZE(op->output_pref) ? op->output_pref[i] : 0;
 }
 
+/*
+ * The TCGProfile structure holds data for analysing the quality of
+ * the code generation. The data is split between stuff that is valid
+ * for the lifetime of a single translation and things that are valid
+ * for the lifetime of the translator. As the former is reset for each
+ * new translation, it should be copied elsewhere if you want to
+ * keep it.
+ *
+ * The structure is safe to access within the context of translation
+ * but accessing the data from elsewhere should be done with safe
+ * work.
+ */
+typedef struct TCGProfile {
+
+    struct {
+        int nb_guest_insns;
+        int nb_spills;
+        int nb_ops_pre_opt;
+
+        int del_op_count;
+        int temp_count;
+    } translation;
+
+    int64_t cpu_exec_time;
+    int64_t op_count; /* total insn count */
+    int64_t code_in_len;
+    int64_t code_out_len;
+    int64_t search_out_len;
+
+    /* Timestamps during translation  */
+    uint64_t gen_start_time;
+    uint64_t gen_ir_done_time;
+    uint64_t gen_opt_done_time;
+    uint64_t gen_la_done_time;
+    uint64_t gen_code_done_time;
+
+    /* Lifetime count of TCGOps per TCGContext */
+    uint64_t table_op_count[NB_OPS];
+} TCGProfile;
+
 struct TCGContext {
     uint8_t *pool_cur, *pool_end;
     TCGPool *pool_first, *pool_current, *pool_first_large;
@@ -548,6 +588,8 @@  struct TCGContext {
     tcg_insn_unit *code_buf;      /* pointer for start of tb */
     tcg_insn_unit *code_ptr;      /* pointer for running end of tb */
 
+    TCGProfile prof;
+
 #ifdef CONFIG_DEBUG_TCG
     int goto_tb_issue_mask;
     const TCGOpcode *vecop_list;
@@ -608,6 +650,7 @@  struct TCGContext {
 
     /* Exit to translator on overflow. */
     sigjmp_buf jmp_trans;
+    TranslationBlock *current_tb;
 };
 
 static inline bool temp_readonly(TCGTemp *ts)
@@ -885,7 +928,7 @@  static inline TCGv_ptr tcg_temp_new_ptr(void)
     return temp_tcgv_ptr(t);
 }
 
-int64_t tcg_cpu_exec_time(void);
+uint64_t tcg_cpu_exec_time(void);
 void tcg_dump_info(GString *buf);
 void tcg_dump_op_count(GString *buf);
 
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index bd50062ed0..37390799f1 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -728,9 +728,18 @@  static bool main_loop_should_exit(int *status)
 int qemu_main_loop(void)
 {
     int status = EXIT_SUCCESS;
+#ifdef CONFIG_TCG
+    uint64_t ti;
+#endif
 
     while (!main_loop_should_exit(&status)) {
+#ifdef CONFIG_TCG
+        ti = profile_getclock();
+#endif
         main_loop_wait(false);
+#ifdef CONFIG_TCG
+        dev_time += profile_getclock() - ti;
+#endif
     }
 
     return status;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 47befdfcae..9ae0e093e1 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -41,6 +41,7 @@ 
 #define NO_CPU_IO_DEFS
 
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 #include "tcg/tcg-op.h"
 
 #if UINTPTR_MAX == UINT32_MAX
@@ -1511,6 +1512,9 @@  void tcg_func_start(TCGContext *s)
     s->nb_labels = 0;
     s->current_frame_offset = s->frame_start;
 
+    s->prof.translation.nb_spills = 0;
+    s->prof.translation.del_op_count = 0;
+    s->prof.translation.temp_count = 0;
 #ifdef CONFIG_DEBUG_TCG
     s->goto_tb_issue_mask = 0;
 #endif
@@ -2983,6 +2987,8 @@  void tcg_op_remove(TCGContext *s, TCGOp *op)
     QTAILQ_REMOVE(&s->ops, op, link);
     QTAILQ_INSERT_TAIL(&s->free_ops, op, link);
     s->nb_ops--;
+    /* ? won't this end up op_opt - op = del_op_count ? */
+    s->prof.translation.del_op_count++;
 }
 
 void tcg_remove_ops_after(TCGOp *op)
@@ -4148,6 +4154,7 @@  static TCGReg tcg_reg_alloc(TCGContext *s, TCGRegSet required_regs,
     }
 
     /* We must spill something.  */
+    s->prof.translation.nb_spills++;
     for (j = f; j < 2; j++) {
         TCGRegSet set = reg_ct[j];
 
@@ -5852,22 +5859,55 @@  static void tcg_out_st_helper_args(TCGContext *s, const TCGLabelQemuLdst *ldst,
     tcg_out_helper_load_common_args(s, ldst, parm, info, next_arg);
 }
 
-void tcg_dump_op_count(GString *buf)
+/* avoid copy/paste errors */
+#define PROF_ADD(to, from, field)                       \
+    do {                                                \
+        (to)->field += qatomic_read(&((from)->field));  \
+    } while (0)
+
+static void collect_tcg_profiler(TCGProfile *prof)
 {
-    g_string_append_printf(buf, "[TCG profiler not compiled]\n");
+    unsigned int n_ctxs = qatomic_read(&tcg_cur_ctxs);
+    unsigned int i, j;
+
+    for (i = 0; i < n_ctxs; i++) {
+        TCGContext *s = qatomic_read(&tcg_ctxs[i]);
+        const TCGProfile *orig = &s->prof;
+
+        PROF_ADD(prof, orig, cpu_exec_time);
+
+        for (j = 0; j < NB_OPS; j++) {
+            PROF_ADD(prof, orig, table_op_count[j]);
+        }
+    }
 }
 
-int64_t tcg_cpu_exec_time(void)
+uint64_t tcg_cpu_exec_time(void)
 {
-    error_report("%s: TCG profiler not compiled", __func__);
-    exit(EXIT_FAILURE);
+    unsigned int n_ctxs = qatomic_read(&tcg_cur_ctxs);
+    unsigned int i;
+    uint64_t ret = 0;
+
+    for (i = 0; i < n_ctxs; i++) {
+        const TCGContext *s = qatomic_read(&tcg_ctxs[i]);
+        const TCGProfile *prof = &s->prof;
+
+        ret += qatomic_read(&prof->cpu_exec_time);
+    }
+    return ret;
 }
 
 int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
 {
+    TCGProfile *prof = &s->prof;
     int i, num_insns;
     TCGOp *op;
 
+    s->current_tb = tb;
+    /* save pre-optimisation op count */
+    prof->translation.nb_ops_pre_opt = s->nb_ops;
+    prof->translation.temp_count = s->nb_temps;
+
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
                  && qemu_log_in_addr_range(pc_start))) {
         FILE *logfile = qemu_log_trylock();
@@ -5879,6 +5919,7 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
         }
     }
 
+
 #ifdef CONFIG_DEBUG_TCG
     /* Ensure all labels referenced have been emitted.  */
     {
@@ -5898,6 +5939,10 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
 
     tcg_optimize(s);
 
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        prof->gen_opt_done_time = profile_getclock();
+    }
+
     reachable_code_pass(s);
     liveness_pass_0(s);
     liveness_pass_1(s);
@@ -5921,6 +5966,10 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
         }
     }
 
+    if (tb_stats_enabled(tb, TB_JIT_TIME)) {
+        prof->gen_la_done_time = profile_getclock();
+    }
+
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
                  && qemu_log_in_addr_range(pc_start))) {
         FILE *logfile = qemu_log_trylock();
@@ -5955,6 +6004,13 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
     s->pool_labels = NULL;
 #endif
 
+    if (tb_stats_collection_enabled()) {
+        QTAILQ_FOREACH(op, &s->ops, link) {
+            TCGOpcode opc = op->opc;
+            s->prof.table_op_count[opc]++;
+        }
+    }
+
     num_insns = -1;
     QTAILQ_FOREACH(op, &s->ops, link) {
         TCGOpcode opc = op->opc;
@@ -6053,9 +6109,23 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
     return tcg_current_code_size(s);
 }
 
+void tcg_dump_op_count(GString *buf)
+{
+    TCGProfile prof = {};
+    int i;
+
+    collect_tcg_profiler(&prof);
+    for (i = 0; i < NB_OPS; i++) {
+        g_string_append_printf(buf, "%s %" PRId64 "\n",
+                tcg_op_defs[i].name, prof.table_op_count[i]);
+    }
+}
+
 void tcg_dump_info(GString *buf)
 {
-    g_string_append_printf(buf, "[TCG profiler not compiled]\n");
+    TCGProfile prof = {};
+    collect_tcg_profiler(&prof);
+    dump_jit_profile_info(&prof, buf);
 }
 
 #ifdef ELF_HOST_MACHINE
diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c
index 73a670e8fa..749aafe4da 100644
--- a/tests/qtest/qmp-cmd-test.c
+++ b/tests/qtest/qmp-cmd-test.c
@@ -46,6 +46,9 @@  static int query_error_class(const char *cmd)
         { "query-balloon", ERROR_CLASS_DEVICE_NOT_ACTIVE },
         { "query-hotpluggable-cpus", ERROR_CLASS_GENERIC_ERROR },
         { "query-vm-generation-id", ERROR_CLASS_GENERIC_ERROR },
+#ifndef CONFIG_TCG
+        { "x-query-profile", ERROR_CLASS_GENERIC_ERROR },
+#endif
         /* Only valid with a USB bus added */
         { "x-query-usb", ERROR_CLASS_GENERIC_ERROR },
         /* Only valid with accel=tcg */