--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -262,6 +262,7 @@ struct tb_desc {
CPUArchState *env;
tb_page_addr_t phys_page1;
uint32_t flags;
+ TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
};
static bool tb_cmp(const void *p, const void *d)
@@ -273,6 +274,7 @@ static bool tb_cmp(const void *p, const void *d)
tb->page_addr[0] == desc->phys_page1 &&
tb->cs_base == desc->cs_base &&
tb->flags == desc->flags &&
+ tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
!atomic_read(&tb->invalid)) {
/* check next page if needed */
if (tb->page_addr[1] == -1) {
@@ -294,7 +296,8 @@ static bool tb_cmp(const void *p, const void *d)
static TranslationBlock *tb_htable_lookup(CPUState *cpu,
target_ulong pc,
target_ulong cs_base,
- uint32_t flags)
+ uint32_t flags,
+ uint32_t trace_vcpu_dstate)
{
tb_page_addr_t phys_pc;
struct tb_desc desc;
@@ -303,10 +306,11 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu,
desc.env = (CPUArchState *)cpu->env_ptr;
desc.cs_base = cs_base;
desc.flags = flags;
+ desc.trace_vcpu_dstate = trace_vcpu_dstate;
desc.pc = pc;
phys_pc = get_page_addr_code(desc.env, pc);
desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
- h = tb_hash_func(phys_pc, pc, flags);
+ h = tb_hash_func(phys_pc, pc, flags, trace_vcpu_dstate);
return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
}
@@ -318,16 +322,24 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
TranslationBlock *tb;
target_ulong cs_base, pc;
uint32_t flags;
+ unsigned long trace_vcpu_dstate_bitmap;
+ TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
bool have_tb_lock = false;
+ bitmap_copy(&trace_vcpu_dstate_bitmap, cpu->trace_dstate,
+ trace_get_vcpu_event_count());
+ memcpy(&trace_vcpu_dstate, &trace_vcpu_dstate_bitmap,
+ sizeof(trace_vcpu_dstate));
+
/* we record a subset of the CPU state. It will
always be the same before a given translated block
is executed. */
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
- tb->flags != flags)) {
- tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+ tb->flags != flags ||
+ tb->trace_vcpu_dstate != trace_vcpu_dstate)) {
+ tb = tb_htable_lookup(cpu, pc, cs_base, flags, trace_vcpu_dstate);
if (!tb) {
/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
@@ -341,7 +353,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
/* There's a chance that our desired tb has been translated while
* taking the locks so we check again inside the lock.
*/
- tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+ tb = tb_htable_lookup(cpu, pc, cs_base, flags, trace_vcpu_dstate);
if (!tb) {
/* if no translated code available, then translate it now */
tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
@@ -465,6 +477,7 @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
if (unlikely(atomic_read(&cpu->trace_dstate_delayed_req))) {
bitmap_copy(cpu->trace_dstate, cpu->trace_dstate_delayed,
trace_get_vcpu_event_count());
+ tb_flush_jmp_cache_all(cpu);
}
return true;
@@ -660,6 +673,9 @@ int cpu_exec(CPUState *cpu)
if (unlikely(atomic_read(&cpu->trace_dstate_delayed_req))) {
bitmap_copy(cpu->trace_dstate, cpu->trace_dstate_delayed,
trace_get_vcpu_event_count());
+ tb_flush_jmp_cache_all(cpu);
+ /* avoid chaining TBs with different dstates */
+ last_tb = NULL;
}
/* Try to align the host and virtual clocks
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -200,6 +200,10 @@ static inline void tlb_flush_by_mmuidx(CPUState *cpu, ...)
#define USE_DIRECT_JUMP
#endif
+/**
+ * TranslationBlock:
+ * @trace_vcpu_dstate: Per-vCPU dynamic tracing state used to generate this TB.
+ */
struct TranslationBlock {
target_ulong pc; /* simulated PC corresponding to this block (EIP + CS base) */
target_ulong cs_base; /* CS base for this block */
@@ -215,6 +219,7 @@ struct TranslationBlock {
#define CF_IGNORE_ICOUNT 0x40000 /* Do not generate icount code */
uint16_t invalid;
+ TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
void *tc_ptr; /* pointer to the translated code */
uint8_t *tc_search; /* pointer to search data */
--- a/include/exec/tb-hash-xx.h
+++ b/include/exec/tb-hash-xx.h
@@ -35,6 +35,7 @@
#define EXEC_TB_HASH_XX_H
#include "qemu/bitops.h"
+#include "qemu-common.h"
#define PRIME32_1 2654435761U
#define PRIME32_2 2246822519U
@@ -49,7 +50,8 @@
* contiguous in memory.
*/
static inline
-uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
+uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e,
+ TRACE_QHT_VCPU_DSTATE_TYPE f)
{
uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2;
uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2;
@@ -83,6 +85,10 @@ uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
h32 += e * PRIME32_3;
h32 = rol32(h32, 17) * PRIME32_4;
+ QEMU_BUILD_BUG_ON(sizeof(TRACE_QHT_VCPU_DSTATE_TYPE) != sizeof(uint32_t));
+ h32 += f * PRIME32_3;
+ h32 = rol32(h32, 17) * PRIME32_4;
+
h32 ^= h32 >> 15;
h32 *= PRIME32_2;
h32 ^= h32 >> 13;
--- a/include/exec/tb-hash.h
+++ b/include/exec/tb-hash.h
@@ -46,9 +46,10 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
}
static inline
-uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags)
+uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc,
+ uint32_t flags, uint32_t trace_vcpu_dstate)
{
- return tb_hash_func5(phys_pc, pc, flags);
+ return tb_hash_func6(phys_pc, pc, flags, trace_vcpu_dstate);
}
#endif
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -151,4 +151,7 @@ void page_size_init(void);
* returned. */
bool dump_in_progress(void);
+/* Use a macro to allow safe changes to its size in the future */
+#define TRACE_QHT_VCPU_DSTATE_TYPE uint32_t
+
#endif
--- a/tests/qht-bench.c
+++ b/tests/qht-bench.c
@@ -103,7 +103,7 @@ static bool is_equal(const void *obj, const void *userp)
static inline uint32_t h(unsigned long v)
{
- return tb_hash_func5(v, 0, 0);
+ return tb_hash_func6(v, 0, 0, 0);
}
/*
--- a/trace/control-target.c
+++ b/trace/control-target.c
@@ -82,7 +82,10 @@ void trace_event_set_vcpu_state_dynamic(CPUState *vcpu,
}
(*ev->dstate)--;
}
+ /* Make sure next translated/executed TB uses the new dstate */
atomic_set(&vcpu->trace_dstate_delayed_req, true);
+ /* NOTE: checked by all TBs in gen_tb_start() */
+ atomic_set(&vcpu->tcg_exit_req, 1);
}
}
--- a/trace/control.h
+++ b/trace/control.h
@@ -165,6 +165,9 @@ void trace_event_set_state_dynamic(TraceEvent *ev, bool state);
* Set the dynamic tracing state of an event for the given vCPU.
*
* Pre-condition: trace_event_get_vcpu_state_static(ev) == true
+ *
+ * Note: Changes for execution-time events with the 'tcg' property will not be
+ * propagated until the next TB is executed (iff executing in TCG mode).
*/
void trace_event_set_vcpu_state_dynamic(CPUState *vcpu,
TraceEvent *ev, bool state);
--- a/translate-all.c
+++ b/translate-all.c
@@ -54,6 +54,7 @@
#include "exec/tb-hash.h"
#include "translate-all.h"
#include "qemu/bitmap.h"
+#include "qemu/error-report.h"
#include "qemu/timer.h"
#include "exec/log.h"
@@ -813,6 +814,12 @@ static void tb_htable_init(void)
{
unsigned int mode = QHT_MODE_AUTO_RESIZE;
+ /* Ensure TB hash function covers the bitmap size */
+ if (DIV_ROUND_UP(trace_get_vcpu_event_count(), BITS_PER_BYTE) >
+ sizeof(TRACE_QHT_VCPU_DSTATE_TYPE)) {
+ error_report("too many 'vcpu' events for the TB hash function");
+ }
+
qht_init(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE, mode);
}
@@ -1106,7 +1113,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
/* remove the TB from the hash list */
phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
- h = tb_hash_func(phys_pc, tb->pc, tb->flags);
+ h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
qht_remove(&tcg_ctx.tb_ctx.htable, tb, h);
/* remove the TB from the page list */
@@ -1251,7 +1258,7 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
}
/* add in the hash table */
- h = tb_hash_func(phys_pc, tb->pc, tb->flags);
+ h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
#ifdef DEBUG_TB_CHECK
@@ -1270,6 +1277,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
target_ulong virt_page2;
tcg_insn_unit *gen_code_buf;
int gen_code_size, search_size;
+ unsigned long trace_vcpu_dstate_bitmap;
#ifdef CONFIG_PROFILER
int64_t ti;
#endif
@@ -1294,6 +1302,10 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
tb->cs_base = cs_base;
tb->flags = flags;
tb->cflags = cflags;
+ bitmap_copy(&trace_vcpu_dstate_bitmap, cpu->trace_dstate,
+ trace_get_vcpu_event_count());
+ memcpy(&tb->trace_vcpu_dstate, &trace_vcpu_dstate_bitmap,
+ sizeof(tb->trace_vcpu_dstate));
#ifdef CONFIG_PROFILER
tcg_ctx.tb_count1++; /* includes aborted translations because of
                       exceptions */

Every vCPU now uses a separate set of TBs for each set of dynamic tracing
event state values. Each set of TBs can be used by any number of vCPUs to
maximize TB reuse when vCPUs have the same tracing state.

This feature is later used by tracetool to optimize tracing of guest code
events.

The maximum number of TB sets is defined as 2^E, where E is the number of
events that have the 'vcpu' property (their state is stored in
CPUState->trace_dstate).

For this to work, a change to the dynamic tracing state of a vCPU will force
it to flush its virtual TB cache (which is only indexed by address), and fall
back to the physical TB cache (which now contains the vCPU's dynamic tracing
state as part of the hashing function).

Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
---
 cpu-exec.c                | 26 +++++++++++++++++++++-----
 include/exec/exec-all.h   |  5 +++++
 include/exec/tb-hash-xx.h |  8 +++++++-
 include/exec/tb-hash.h    |  5 +++--
 include/qemu-common.h     |  3 +++
 tests/qht-bench.c         |  2 +-
 trace/control-target.c    |  3 +++
 trace/control.h           |  3 +++
 translate-all.c           | 16 ++++++++++++++--
 9 files changed, 60 insertions(+), 11 deletions(-)
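
[Not part of the commit] Below is a minimal standalone sketch of the mechanism
described above, using a toy hash in place of tb_hash_func6(): the per-vCPU
dstate bitmap is packed into a 32-bit key and mixed into the TB hash, so vCPUs
with equal tracing state select TBs from the same set. All identifiers here
(NR_VCPU_EVENTS, dstate_to_key, toy_tb_hash) are invented for illustration and
do not exist in QEMU; the hash constants are arbitrary.

/*
 * Illustrative only: not QEMU code.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NR_VCPU_EVENTS 3   /* E: number of events with the 'vcpu' property */

/* The 32-bit key must hold one bit per 'vcpu' event; the patch performs a
 * comparable runtime check in tb_htable_init(). */
_Static_assert(NR_VCPU_EVENTS <= 32, "dstate must fit in the 32-bit hash key");

/* Pack the low 32 bits of the per-vCPU dstate bitmap into a hash key,
 * mirroring the bitmap_copy()+memcpy() sequence used in tb_find(). */
static uint32_t dstate_to_key(const unsigned long *dstate_bitmap)
{
    uint32_t key;
    memcpy(&key, dstate_bitmap, sizeof(key));
    return key;
}

/* Toy stand-in for tb_hash_func6(): the only point is that the dstate key
 * participates in the hash alongside phys_pc, pc and flags. */
static uint32_t toy_tb_hash(uint64_t phys_pc, uint64_t pc, uint32_t flags,
                            uint32_t dstate_key)
{
    uint32_t h = (uint32_t)(phys_pc ^ pc) * 2654435761u;
    h ^= flags * 2246822519u;
    h ^= dstate_key * 3266489917u;
    return h;
}

int main(void)
{
    unsigned long dstate[1] = { 0 };   /* all 'vcpu' events disabled */
    uint32_t off = toy_tb_hash(0x1000, 0x400080, 0, dstate_to_key(dstate));

    dstate[0] |= 1ul << 1;             /* enable event #1 on this vCPU */
    uint32_t on = toy_tb_hash(0x1000, 0x400080, 0, dstate_to_key(dstate));

    /* Same pc and flags, different tracing state: the lookup lands in a
     * different bucket, i.e. a different one of the 2^E TB sets. */
    printf("hash(off)=%#x hash(on)=%#x\n", (unsigned)off, (unsigned)on);
    return 0;
}

Two vCPUs with identical trace_dstate compute the same key and therefore hash
into the same set of TBs, which is the reuse property described above; flipping
any 'vcpu' event bit selects a different set.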
Every vCPU now uses a separate set of TBs for each set of dynamic tracing event state values. Each set of TBs can be used by any number of vCPUs to maximize TB reuse when vCPUs have the same tracing state. This feature is later used by tracetool to optimize tracing of guest code events. The maximum number of TB sets is defined as 2^E, where E is the number of events that have the 'vcpu' property (their state is stored in CPUState->trace_dstate). For this to work, a change on the dynamic tracing state of a vCPU will force it to flush its virtual TB cache (which is only indexed by address), and fall back to the physical TB cache (which now contains the vCPU's dynamic tracing state as part of the hashing function). Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu> --- cpu-exec.c | 26 +++++++++++++++++++++----- include/exec/exec-all.h | 5 +++++ include/exec/tb-hash-xx.h | 8 +++++++- include/exec/tb-hash.h | 5 +++-- include/qemu-common.h | 3 +++ tests/qht-bench.c | 2 +- trace/control-target.c | 3 +++ trace/control.h | 3 +++ translate-all.c | 16 ++++++++++++++-- 9 files changed, 60 insertions(+), 11 deletions(-)