Message ID | 20230421132421.1617479-3-fei2.wu@intel.com (mailing list archive) |
---|---|
State | New, archived |
Series | TCG code quality tracking |
On 4/21/23 14:24, Fei Wu wrote:
> From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>
>
> If a TB has a TBS (TBStatistics) with the TB_EXEC_STATS
> enabled, then we instrument the start code of this TB
> to atomically count the number of times it is executed.
> We count both the number of "normal" executions and atomic
> executions of a TB.
>
> The execution count of the TB is stored in its respective
> TBS.
>
> All TBStatistics are created by default with the flags from
> default_tbstats_flag.
>
> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
> Message-Id: <20190829173437.5926-3-vandersonmr2@gmail.com>
> [AJB: Fix author]
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> ---
>  accel/tcg/cpu-exec.c      |  6 ++++++
>  accel/tcg/tb-stats.c      |  6 ++++++
>  accel/tcg/tcg-runtime.c   |  8 ++++++++
>  accel/tcg/tcg-runtime.h   |  2 ++
>  accel/tcg/translate-all.c |  7 +++++--
>  accel/tcg/translator.c    | 10 ++++++++++
>  include/exec/gen-icount.h |  1 +
>  include/exec/tb-stats.h   | 18 ++++++++++++++++++
>  8 files changed, 56 insertions(+), 2 deletions(-)
>
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index c815f2dbfd..d89f9fe493 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -25,6 +25,7 @@
>  #include "trace.h"
>  #include "disas/disas.h"
>  #include "exec/exec-all.h"
> +#include "exec/tb-stats.h"
>  #include "tcg/tcg.h"
>  #include "qemu/atomic.h"
>  #include "qemu/rcu.h"
> @@ -564,7 +565,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
>          mmap_unlock();
>      }
>
> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
> +        tb->tb_stats->executions.atomic++;
> +    }

The write is protected by the exclusive lock, but the read might be
accessible from the monitor, iiuc.  Which means you should use
atomic_set(), for non-tearable write after non-atomic increment.

> @@ -148,3 +149,10 @@ void HELPER(exit_atomic)(CPUArchState *env)
>  {
>      cpu_loop_exit_atomic(env_cpu(env), GETPC());
>  }
> +
> +void HELPER(inc_exec_freq)(void *ptr)
> +{
> +    TBStatistics *stats = (TBStatistics *) ptr;
> +    tcg_debug_assert(stats);
> +    qatomic_inc(&stats->executions.normal);
> +}

Ug.  Do we really need an atomic update?

If we have multiple threads executing through the same TB, we'll get
significant slow-down at the cost of not missing increments.  If we
allow a non-atomic update, we'll get much less slow-down at the cost of
missing a few increments.  But this is statistical only, so how much
does it really matter?


r~
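Concretely, the change Richard is asking for in the cpu_exec_step_atomic() hunk would look roughly like the sketch below. This is not code from the posted series, and it uses qatomic_set() (the current spelling of atomic_set() in QEMU): the increment itself stays non-atomic because the exclusive lock already serialises the writers, while the store becomes non-tearable for a concurrent reader such as the monitor.

    /* Sketch only: inside cpu_exec_step_atomic(), under the exclusive lock. */
    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
        /* Non-atomic increment, published with a single non-tearing store. */
        qatomic_set(&tb->tb_stats->executions.atomic,
                    tb->tb_stats->executions.atomic + 1);
    }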
On 5/3/2023 4:28 PM, Richard Henderson wrote:
> On 4/21/23 14:24, Fei Wu wrote:
>> From: "Vanderson M. do Rosario" <vandersonmr2@gmail.com>
>>
>> If a TB has a TBS (TBStatistics) with the TB_EXEC_STATS
>> enabled, then we instrument the start code of this TB
>> to atomically count the number of times it is executed.
>> We count both the number of "normal" executions and atomic
>> executions of a TB.
>>
>> The execution count of the TB is stored in its respective
>> TBS.
>>
>> All TBStatistics are created by default with the flags from
>> default_tbstats_flag.
>>
>> Signed-off-by: Vanderson M. do Rosario <vandersonmr2@gmail.com>
>> Message-Id: <20190829173437.5926-3-vandersonmr2@gmail.com>
>> [AJB: Fix author]
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> ---
>>  accel/tcg/cpu-exec.c      |  6 ++++++
>>  accel/tcg/tb-stats.c      |  6 ++++++
>>  accel/tcg/tcg-runtime.c   |  8 ++++++++
>>  accel/tcg/tcg-runtime.h   |  2 ++
>>  accel/tcg/translate-all.c |  7 +++++--
>>  accel/tcg/translator.c    | 10 ++++++++++
>>  include/exec/gen-icount.h |  1 +
>>  include/exec/tb-stats.h   | 18 ++++++++++++++++++
>>  8 files changed, 56 insertions(+), 2 deletions(-)
>>
>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>> index c815f2dbfd..d89f9fe493 100644
>> --- a/accel/tcg/cpu-exec.c
>> +++ b/accel/tcg/cpu-exec.c
>> @@ -25,6 +25,7 @@
>>  #include "trace.h"
>>  #include "disas/disas.h"
>>  #include "exec/exec-all.h"
>> +#include "exec/tb-stats.h"
>>  #include "tcg/tcg.h"
>>  #include "qemu/atomic.h"
>>  #include "qemu/rcu.h"
>> @@ -564,7 +565,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
>>          mmap_unlock();
>>      }
>> +    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
>> +        tb->tb_stats->executions.atomic++;
>> +    }
>
> The write is protected by the exclusive lock, but the read might be
> accessible from the monitor, iiuc.  Which means you should use
> atomic_set(), for non-tearable write after non-atomic increment.
>
The writes are serialized, and 'atomic' is an aligned integer (unsigned
long), so a read in parallel with the write should not be a problem?
The reader sees the value either before or after the increment, never a
partially written one.

>> @@ -148,3 +149,10 @@ void HELPER(exit_atomic)(CPUArchState *env)
>>  {
>>      cpu_loop_exit_atomic(env_cpu(env), GETPC());
>>  }
>> +
>> +void HELPER(inc_exec_freq)(void *ptr)
>> +{
>> +    TBStatistics *stats = (TBStatistics *) ptr;
>> +    tcg_debug_assert(stats);
>> +    qatomic_inc(&stats->executions.normal);
>> +}
>
> Ug.  Do we really need an atomic update?
>
> If we have multiple threads executing through the same TB, we'll get
> significant slow-down at the cost of not missing increments.  If we
> allow a non-atomic update, we'll get much less slow-down at the cost of
> missing a few increments.  But this is statistical only, so how much
> does it really matter?
>
This sounds reasonable to me. Alex, what is your take on this?

Richard, could you please review the rest of this series? So far I have
only seen your reviews on patches 01 and 02.

Thanks,
Fei.

>
> r~
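Applied to the helper, the relaxed update Richard describes (and Fei agrees with above) could be sketched as follows; this is an illustration, not the actual follow-up patch. The qatomic_read()/qatomic_set() pair keeps each individual load and store non-tearing while dropping read-modify-write atomicity, so concurrent vCPUs may occasionally lose an increment of this purely statistical counter:

    void HELPER(inc_exec_freq)(void *ptr)
    {
        TBStatistics *stats = (TBStatistics *) ptr;
        tcg_debug_assert(stats);
        /* Plain read-modify-write: cheaper than qatomic_inc(), but may
         * drop a few counts when several vCPUs hit the same TB at once. */
        qatomic_set(&stats->executions.normal,
                    qatomic_read(&stats->executions.normal) + 1);
    }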
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index c815f2dbfd..d89f9fe493 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -25,6 +25,7 @@
 #include "trace.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 #include "tcg/tcg.h"
 #include "qemu/atomic.h"
 #include "qemu/rcu.h"
@@ -564,7 +565,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
         mmap_unlock();
     }
 
+    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
+        tb->tb_stats->executions.atomic++;
+    }
+
     cpu_exec_enter(cpu);
+
     /* execute the generated code */
     trace_exec_tb(tb, pc);
     cpu_tb_exec(cpu, tb, &tb_exit);
diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c
index f78a21f522..b1d4341b6f 100644
--- a/accel/tcg/tb-stats.c
+++ b/accel/tcg/tb-stats.c
@@ -22,6 +22,7 @@ enum TBStatsStatus {
 };
 
 static enum TBStatsStatus tcg_collect_tb_stats;
+static uint32_t default_tbstats_flag;
 
 void init_tb_stats_htable(void)
 {
@@ -56,3 +57,8 @@ bool tb_stats_collection_paused(void)
 {
     return tcg_collect_tb_stats == TB_STATS_PAUSED;
 }
+
+uint32_t get_default_tbstats_flag(void)
+{
+    return default_tbstats_flag;
+}
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index e4e030043f..44a349b334 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -27,6 +27,7 @@
 #include "exec/helper-proto.h"
 #include "exec/cpu_ldst.h"
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 #include "disas/disas.h"
 #include "exec/log.h"
 #include "tcg/tcg.h"
@@ -148,3 +149,10 @@ void HELPER(exit_atomic)(CPUArchState *env)
 {
     cpu_loop_exit_atomic(env_cpu(env), GETPC());
 }
+
+void HELPER(inc_exec_freq)(void *ptr)
+{
+    TBStatistics *stats = (TBStatistics *) ptr;
+    tcg_debug_assert(stats);
+    qatomic_inc(&stats->executions.normal);
+}
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index e141a6ab24..66954aa210 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -39,6 +39,8 @@ DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
 DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)
 #endif /* IN_HELPER_PROTO */
 
+DEF_HELPER_FLAGS_1(inc_exec_freq, TCG_CALL_NO_RWG, void, ptr)
+
 DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
                    i32, env, tl, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG,
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 0fcde2e84a..afc89d5692 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -308,6 +308,7 @@ static TBStatistics *tb_get_stats(tb_page_addr_t phys_pc, target_ulong pc,
     new_stats->pc = pc;
     new_stats->cs_base = cs_base;
     new_stats->flags = flags;
+    new_stats->stats_enabled = get_default_tbstats_flag();
 
     /*
      * All initialisation must be complete before we insert into qht
@@ -397,9 +398,11 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     /*
      * We want to fetch the stats structure before we start code
      * generation so we can count interesting things about this
-     * generation.
+     * generation. If dfilter is in effect we will only collect stats
+     * for the specified range.
      */
-    if (tb_stats_collection_enabled()) {
+    if (tb_stats_collection_enabled() &&
+        qemu_log_in_addr_range(tb->pc)) {
         tb->tb_stats = tb_get_stats(phys_pc, pc, cs_base, flags);
     } else {
         tb->tb_stats = NULL;
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 7bda43ff61..165072c8c3 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -18,6 +18,15 @@
 #include "exec/plugin-gen.h"
 #include "exec/replay-core.h"
 
+static inline void gen_tb_exec_count(TranslationBlock *tb)
+{
+    if (tb_stats_enabled(tb, TB_EXEC_STATS)) {
+        TCGv_ptr ptr = tcg_temp_new_ptr();
+        tcg_gen_movi_ptr(ptr, (intptr_t)tb->tb_stats);
+        gen_helper_inc_exec_freq(ptr);
+    }
+}
+
 bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
 {
     /* Suppress goto_tb if requested. */
@@ -56,6 +65,7 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
 
     /* Start translating. */
     gen_tb_start(db->tb);
+    gen_tb_exec_count(tb);
     ops->tb_start(db, cpu);
     tcg_debug_assert(db->is_jmp == DISAS_NEXT); /* no early exit */
 
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index f6de79a6b4..20e7835ff0 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -2,6 +2,7 @@
 #define GEN_ICOUNT_H
 
 #include "exec/exec-all.h"
+#include "exec/tb-stats.h"
 
 /* Helpers for instruction counting code generation. */
 
diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h
index 606f643723..27a233b7f8 100644
--- a/include/exec/tb-stats.h
+++ b/include/exec/tb-stats.h
@@ -30,6 +30,9 @@
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 
+#define tb_stats_enabled(tb, JIT_STATS) \
+    (tb && tb->tb_stats && (tb->tb_stats->stats_enabled & JIT_STATS))
+
 typedef struct TBStatistics TBStatistics;
 
 /*
@@ -46,16 +49,31 @@ struct TBStatistics {
     uint32_t flags;
     /* cs_base isn't included in the hash but we do check for matches */
     target_ulong cs_base;
+
+    /* which stats are enabled for this TBStats */
+    uint32_t stats_enabled;
+
+    /* Execution stats */
+    struct {
+        unsigned long normal;
+        unsigned long atomic;
+    } executions;
+
 };
 
 bool tb_stats_cmp(const void *ap, const void *bp);
 
 void init_tb_stats_htable(void);
 
+#define TB_NOTHING    (1 << 0)
+#define TB_EXEC_STATS (1 << 1)
+
 void enable_collect_tb_stats(void);
 void disable_collect_tb_stats(void);
 void pause_collect_tb_stats(void);
 bool tb_stats_collection_enabled(void);
 bool tb_stats_collection_paused(void);
 
+uint32_t get_default_tbstats_flag(void);
+
 #endif
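The monitor-side reader that the review worries about is not part of this patch. A later consumer of these counters would presumably pair the execution-side stores with qatomic_read(); the sketch below is purely illustrative, and tb_stats_dump_exec_counts() is an invented name rather than anything in the series:

    #include "qemu/osdep.h"
    #include "qemu/qemu-print.h"
    #include "exec/tb-stats.h"

    /* Hypothetical monitor-side helper, for illustration only. */
    static void tb_stats_dump_exec_counts(TBStatistics *stats)
    {
        /* qatomic_read() documents that these loads race with the
         * writers and must not be torn. */
        unsigned long normal = qatomic_read(&stats->executions.normal);
        unsigned long atomic = qatomic_read(&stats->executions.atomic);

        qemu_printf("executions: normal=%lu atomic=%lu\n", normal, atomic);
    }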