diff mbox series

[v11,02/14] accel: collecting TB execution count

Message ID 20230421132421.1617479-3-fei2.wu@intel.com (mailing list archive)
State New, archived
Headers show
Series TCG code quality tracking | expand

Commit Message

Wu, Fei April 21, 2023, 1:24 p.m. UTC
From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>

If a TB has a TBS (TBStatistics) with the TB_EXEC_STATS
enabled, then we instrument the start code of this TB
to atomically count the number of times it is executed.
We count both the number of "normal" executions and atomic
executions of a TB.

The execution count of the TB is stored in its respective
TBS.

All TBStatistics are created by default with the flags from
default_tbstats_flag.

Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
Message-Id: <20190829173437.5926-3-vandersonmr2@gmail.com>
[AJB: Fix author]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
---
 accel/tcg/cpu-exec.c      |  6 ++++++
 accel/tcg/tb-stats.c      |  6 ++++++
 accel/tcg/tcg-runtime.c   |  8 ++++++++
 accel/tcg/tcg-runtime.h   |  2 ++
 accel/tcg/translate-all.c |  7 +++++--
 accel/tcg/translator.c    | 10 ++++++++++
 include/exec/gen-icount.h |  1 +
 include/exec/tb-stats.h   | 18 ++++++++++++++++++
 8 files changed, 56 insertions(+), 2 deletions(-)

Comments

Richard Henderson May 3, 2023, 8:28 a.m. UTC | #1
On 4/21/23 14:24, Fei Wu wrote:
> From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>
> 
> If a TB has a TBS (TBStatistics) with the TB_EXEC_STATS
> enabled, then we instrument the start code of this TB
> to atomically count the number of times it is executed.
> We count both the number of "normal" executions and atomic
> executions of a TB.
> 
> The execution count of the TB is stored in its respective
> TBS.
> 
> All TBStatistics are created by default with the flags from
> default_tbstats_flag.
> 
> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
> Message-Id: <20190829173437.5926-3-vandersonmr2@gmail.com>
> [AJB: Fix author]
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> ---
>   accel/tcg/cpu-exec.c      |  6 ++++++
>   accel/tcg/tb-stats.c      |  6 ++++++
>   accel/tcg/tcg-runtime.c   |  8 ++++++++
>   accel/tcg/tcg-runtime.h   |  2 ++
>   accel/tcg/translate-all.c |  7 +++++--
>   accel/tcg/translator.c    | 10 ++++++++++
>   include/exec/gen-icount.h |  1 +
>   include/exec/tb-stats.h   | 18 ++++++++++++++++++
>   8 files changed, 56 insertions(+), 2 deletions(-)
> 
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index c815f2dbfd..d89f9fe493 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -25,6 +25,7 @@
>   #include "trace.h"
>   #include "disas/disas.h"
>   #include "exec/exec-all.h"
> +#include "exec/tb-stats.h"
>   #include "tcg/tcg.h"
>   #include "qemu/atomic.h"
>   #include "qemu/rcu.h"
> @@ -564,7 +565,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
>               mmap_unlock();
>           }
>   
> +        if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
> +            tb->tb_stats->executions.atomic++;
> +        }

The write is protected by the exclusive lock, but the read might be accessible from the 
monitor, iiuc.  Which means you should use atomic_set(), for non-tearable write after 
non-atomic increment.

> @@ -148,3 +149,10 @@ void HELPER(exit_atomic)(CPUArchState *env)
>   {
>       cpu_loop_exit_atomic(env_cpu(env), GETPC());
>   }
> +
> +void HELPER(inc_exec_freq)(void *ptr)
> +{
> +    TBStatistics *stats = (TBStatistics *) ptr;
> +    tcg_debug_assert(stats);
> +    qatomic_inc(&stats->executions.normal);
> +}

Ug.  Do we really need an atomic update?

If we have multiple threads executing through the same TB, we'll get significant slow-down 
at the cost of not missing increments.  If we allow a non-atomic update, we'll get much 
less slow-down at the cost of missing a few increments.  But this is statistical only, so 
how much does it really matter?


r~
Wu, Fei May 8, 2023, 10:02 a.m. UTC | #2
On 5/3/2023 4:28 PM, Richard Henderson wrote:
> On 4/21/23 14:24, Fei Wu wrote:
>> From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>
>>
>> If a TB has a TBS (TBStatistics) with the TB_EXEC_STATS
>> enabled, then we instrument the start code of this TB
>> to atomically count the number of times it is executed.
>> We count both the number of "normal" executions and atomic
>> executions of a TB.
>>
>> The execution count of the TB is stored in its respective
>> TBS.
>>
>> All TBStatistics are created by default with the flags from
>> default_tbstats_flag.
>>
>> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
>> Message-Id: <20190829173437.5926-3-vandersonmr2@gmail.com>
>> [AJB: Fix author]
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> ---
>>   accel/tcg/cpu-exec.c      |  6 ++++++
>>   accel/tcg/tb-stats.c      |  6 ++++++
>>   accel/tcg/tcg-runtime.c   |  8 ++++++++
>>   accel/tcg/tcg-runtime.h   |  2 ++
>>   accel/tcg/translate-all.c |  7 +++++--
>>   accel/tcg/translator.c    | 10 ++++++++++
>>   include/exec/gen-icount.h |  1 +
>>   include/exec/tb-stats.h   | 18 ++++++++++++++++++
>>   8 files changed, 56 insertions(+), 2 deletions(-)
>>
>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>> index c815f2dbfd..d89f9fe493 100644
>> --- a/accel/tcg/cpu-exec.c
>> +++ b/accel/tcg/cpu-exec.c
>> @@ -25,6 +25,7 @@
>>   #include "trace.h"
>>   #include "disas/disas.h"
>>   #include "exec/exec-all.h"
>> +#include "exec/tb-stats.h"
>>   #include "tcg/tcg.h"
>>   #include "qemu/atomic.h"
>>   #include "qemu/rcu.h"
>> @@ -564,7 +565,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
>>               mmap_unlock();
>>           }
>>   +        if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>> +            tb->tb_stats->executions.atomic++;
>> +        }
> 
> The write is protected by the exclusive lock, but the read might be
> accessible from the monitor, iiuc.  Which means you should use
> atomic_set(), for non-tearable write after non-atomic increment.
> 
The writes are serialized, 'atomic' is an aligned integer (unsigned
long), the read in parallel with write should not be a problem? It
returns the value either before increment or after, not part of.

>> @@ -148,3 +149,10 @@ void HELPER(exit_atomic)(CPUArchState *env)
>>   {
>>       cpu_loop_exit_atomic(env_cpu(env), GETPC());
>>   }
>> +
>> +void HELPER(inc_exec_freq)(void *ptr)
>> +{
>> +    TBStatistics *stats = (TBStatistics *) ptr;
>> +    tcg_debug_assert(stats);
>> +    qatomic_inc(&stats->executions.normal);
>> +}
> 
> Ug.  Do we really need an atomic update?
> 
> If we have multiple threads executing through the same TB, we'll get
> significant slow-down at the cost of not missing increments.  If we
> allow a non-atomic update, we'll get much less slow-down at the cost of
> missing a few increments.  But this is statistical only, so how much
> does it really matter?
> 
This sounds reasonable to me. Alex, what's your point here?

Richard, could you please review all this series? I just saw your
reviews on patch 01 and 02.

Thanks,
Fei.
> 
> r~
diff mbox series

Patch

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index c815f2dbfd..d89f9fe493 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -25,6 +25,7 @@ 
 #include "trace.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 #include "tcg/tcg.h"
 #include "qemu/atomic.h"
 #include "qemu/rcu.h"
@@ -564,7 +565,12 @@  void cpu_exec_step_atomic(CPUState *cpu)
             mmap_unlock();
         }
 
+        if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
+            tb->tb_stats->executions.atomic++;
+        }
+
         cpu_exec_enter(cpu);
+
         /* execute the generated code */
         trace_exec_tb(tb, pc);
         cpu_tb_exec(cpu, tb, &tb_exit);
diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
index f78a21f522..b1d4341b6f 100644
--- a/accel/tcg/tb-stats.c
+++ b/accel/tcg/tb-stats.c
@@ -22,6 +22,7 @@  enum TBStatsStatus {
 };
 
 static enum TBStatsStatus tcg_collect_tb_stats;
+static uint32_t default_tbstats_flag;
 
 void init_tb_stats_htable(void)
 {
@@ -56,3 +57,8 @@  bool tb_stats_collection_paused(void)
 {
     return tcg_collect_tb_stats == TB_STATS_PAUSED;
 }
+
+uint32_t get_default_tbstats_flag(void)
+{
+    return default_tbstats_flag;
+}
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index e4e030043f..44a349b334 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -27,6 +27,7 @@ 
 #include "exec/helper-proto.h"
 #include "exec/cpu_ldst.h"
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 #include "disas/disas.h"
 #include "exec/log.h"
 #include "tcg/tcg.h"
@@ -148,3 +149,10 @@  void HELPER(exit_atomic)(CPUArchState *env)
 {
     cpu_loop_exit_atomic(env_cpu(env), GETPC());
 }
+
+void HELPER(inc_exec_freq)(void *ptr)
+{
+    TBStatistics *stats = (TBStatistics *) ptr;
+    tcg_debug_assert(stats);
+    qatomic_inc(&stats->executions.normal);
+}
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index e141a6ab24..66954aa210 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -39,6 +39,8 @@  DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
 DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)
 #endif /* IN_HELPER_PROTO */
 
+DEF_HELPER_FLAGS_1(inc_exec_freq, TCG_CALL_NO_RWG, void, ptr)
+
 DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
                    i32, env, tl, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG,
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 0fcde2e84a..afc89d5692 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -308,6 +308,7 @@  static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
     new_stats->pc = pc;
     new_stats->cs_base = cs_base;
     new_stats->flags = flags;
+    new_stats->stats_enabled = get_default_tbstats_flag();
 
     /*
      * All initialisation must be complete before we insert into qht
@@ -397,9 +398,11 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     /*
      * We want to fetch the stats structure before we start code
      * generation so we can count interesting things about this
-     * generation.
+     * generation. If dfilter is in effect we will only collect stats
+     * for the specified range.
      */
-    if (tb_stats_collection_enabled()) {
+    if (tb_stats_collection_enabled() &&
+        qemu_log_in_addr_range(tb->pc)) {
         tb->tb_stats = tb_get_stats(phys_pc, pc, cs_base, flags);
     } else {
         tb->tb_stats = NULL;
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 7bda43ff61..165072c8c3 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -18,6 +18,15 @@ 
 #include "exec/plugin-gen.h"
 #include "exec/replay-core.h"
 
+static inline void gen_tb_exec_count(TranslationBlock *tb)
+{
+    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
+        TCGv_ptr ptr = tcg_temp_new_ptr();
+        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
+        gen_helper_inc_exec_freq(ptr);
+    }
+}
+
 bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
 {
     /* Suppress goto_tb if requested. */
@@ -56,6 +65,7 @@  void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
 
     /* Start translating.  */
     gen_tb_start(db->tb);
+    gen_tb_exec_count(tb);
     ops->tb_start(db, cpu);
     tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
 
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index f6de79a6b4..20e7835ff0 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -2,6 +2,7 @@ 
 #define GEN_ICOUNT_H
 
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 
 /* Helpers for instruction counting code generation.  */
 
diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
index 606f643723..27a233b7f8 100644
--- a/include/exec/tb-stats.h
+++ b/include/exec/tb-stats.h
@@ -30,6 +30,9 @@ 
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 
+#define tb_stats_enabled(tb, JIT_STATS) \
+    (tb && tb->tb_stats && (tb->tb_stats->stats_enabled & JIT_STATS))
+
 typedef struct TBStatistics TBStatistics;
 
 /*
@@ -46,16 +49,31 @@  struct TBStatistics {
     uint32_t     flags;
     /* cs_base isn't included in the hash but we do check for matches */
     target_ulong cs_base;
+
+    /* which stats are enabled for this TBStats */
+    uint32_t stats_enabled;
+
+    /* Execution stats */
+    struct {
+        unsigned long normal;
+        unsigned long atomic;
+    } executions;
+
 };
 
 bool tb_stats_cmp(const void *ap, const void *bp);
 
 void init_tb_stats_htable(void);
 
+#define TB_NOTHING    (1 << 0)
+#define TB_EXEC_STATS (1 << 1)
+
 void enable_collect_tb_stats(void);
 void disable_collect_tb_stats(void);
 void pause_collect_tb_stats(void);
 bool tb_stats_collection_enabled(void);
 bool tb_stats_collection_paused(void);
 
+uint32_t get_default_tbstats_flag(void);
+
 #endif