diff mbox

[v6,4/7] exec: [tcg] Use different TBs according to the vCPU's dynamic tracing state

Message ID 148295047597.19871.17997092463779305691.stgit@fimbulvetr.bsc.es (mailing list archive)
State New, archived
Headers show

Commit Message

Lluís Vilanova Dec. 28, 2016, 6:41 p.m. UTC
Every vCPU now uses a separate set of TBs for each set of dynamic
tracing event state values. Each set of TBs can be used by any number of
vCPUs to maximize TB reuse when vCPUs have the same tracing state.

This feature is later used by tracetool to optimize tracing of guest
code events.

The maximum number of TB sets is defined as 2^E, where E is the number
of events that have the 'vcpu' property (their state is stored in
CPUState->trace_dstate).

For this to work, a change to the dynamic tracing state of a vCPU will
force it to flush its virtual TB cache (which is indexed only by
address) and fall back to the physical TB cache (whose hash function
now includes the vCPU's dynamic tracing state).

Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
---
 cpu-exec.c                |   26 +++++++++++++++++++++-----
 include/exec/exec-all.h   |    5 +++++
 include/exec/tb-hash-xx.h |    8 +++++++-
 include/exec/tb-hash.h    |    5 +++--
 include/qemu-common.h     |    3 +++
 tests/qht-bench.c         |    2 +-
 trace/control-target.c    |    3 +++
 trace/control.h           |    3 +++
 translate-all.c           |   16 ++++++++++++++--
 9 files changed, 60 insertions(+), 11 deletions(-)

Comments

Richard Henderson Jan. 10, 2017, 8:10 p.m. UTC | #1
On 12/28/2016 10:41 AM, Lluís Vilanova wrote:
> Every vCPU now uses a separate set of TBs for each set of dynamic
> tracing event state values. Each set of TBs can be used by any number of
> vCPUs to maximize TB reuse when vCPUs have the same tracing state.
>
> This feature is later used by tracetool to optimize tracing of guest
> code events.
>
> The maximum number of TB sets is defined as 2^E, where E is the number
> of events that have the 'vcpu' property (their state is stored in
> CPUState->trace_dstate).
>
> For this to work, a change on the dynamic tracing state of a vCPU will
> force it to flush its virtual TB cache (which is only indexed by
> address), and fall back to the physical TB cache (which now contains the
> vCPU's dynamic tracing state as part of the hashing function).
>
> Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
> ---
>  cpu-exec.c                |   26 +++++++++++++++++++++-----
>  include/exec/exec-all.h   |    5 +++++
>  include/exec/tb-hash-xx.h |    8 +++++++-
>  include/exec/tb-hash.h    |    5 +++--
>  include/qemu-common.h     |    3 +++
>  tests/qht-bench.c         |    2 +-
>  trace/control-target.c    |    3 +++
>  trace/control.h           |    3 +++
>  translate-all.c           |   16 ++++++++++++++--
>  9 files changed, 60 insertions(+), 11 deletions(-)


Reviewed-by: Richard Henderson <rth@twiddle.net>


r~
diff mbox

Patch

diff --git a/cpu-exec.c b/cpu-exec.c
index 1b7366efb0..a377505b9c 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -262,6 +262,7 @@  struct tb_desc {
     CPUArchState *env;
     tb_page_addr_t phys_page1;
     uint32_t flags;
+    TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
 };
 
 static bool tb_cmp(const void *p, const void *d)
@@ -273,6 +274,7 @@  static bool tb_cmp(const void *p, const void *d)
         tb->page_addr[0] == desc->phys_page1 &&
         tb->cs_base == desc->cs_base &&
         tb->flags == desc->flags &&
+        tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
         !atomic_read(&tb->invalid)) {
         /* check next page if needed */
         if (tb->page_addr[1] == -1) {
@@ -294,7 +296,8 @@  static bool tb_cmp(const void *p, const void *d)
 static TranslationBlock *tb_htable_lookup(CPUState *cpu,
                                           target_ulong pc,
                                           target_ulong cs_base,
-                                          uint32_t flags)
+                                          uint32_t flags,
+                                          uint32_t trace_vcpu_dstate)
 {
     tb_page_addr_t phys_pc;
     struct tb_desc desc;
@@ -303,10 +306,11 @@  static TranslationBlock *tb_htable_lookup(CPUState *cpu,
     desc.env = (CPUArchState *)cpu->env_ptr;
     desc.cs_base = cs_base;
     desc.flags = flags;
+    desc.trace_vcpu_dstate = trace_vcpu_dstate;
     desc.pc = pc;
     phys_pc = get_page_addr_code(desc.env, pc);
     desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
-    h = tb_hash_func(phys_pc, pc, flags);
+    h = tb_hash_func(phys_pc, pc, flags, trace_vcpu_dstate);
     return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
 }
 
@@ -318,16 +322,24 @@  static inline TranslationBlock *tb_find(CPUState *cpu,
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags;
+    unsigned long trace_vcpu_dstate_bitmap;
+    TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
     bool have_tb_lock = false;
 
+    bitmap_copy(&trace_vcpu_dstate_bitmap, cpu->trace_dstate,
+                trace_get_vcpu_event_count());
+    memcpy(&trace_vcpu_dstate, &trace_vcpu_dstate_bitmap,
+           sizeof(trace_vcpu_dstate));
+
     /* we record a subset of the CPU state. It will
        always be the same before a given translated block
        is executed. */
     cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
     tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
-                 tb->flags != flags)) {
-        tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+                 tb->flags != flags ||
+                 tb->trace_vcpu_dstate != trace_vcpu_dstate)) {
+        tb = tb_htable_lookup(cpu, pc, cs_base, flags, trace_vcpu_dstate);
         if (!tb) {
 
             /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
@@ -341,7 +353,7 @@  static inline TranslationBlock *tb_find(CPUState *cpu,
             /* There's a chance that our desired tb has been translated while
              * taking the locks so we check again inside the lock.
              */
-            tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+            tb = tb_htable_lookup(cpu, pc, cs_base, flags, trace_vcpu_dstate);
             if (!tb) {
                 /* if no translated code available, then translate it now */
                 tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
@@ -465,6 +477,7 @@  static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
         if (unlikely(atomic_read(&cpu->trace_dstate_delayed_req))) {
             bitmap_copy(cpu->trace_dstate, cpu->trace_dstate_delayed,
                         trace_get_vcpu_event_count());
+            tb_flush_jmp_cache_all(cpu);
         }
 
         return true;
@@ -660,6 +673,9 @@  int cpu_exec(CPUState *cpu)
                 if (unlikely(atomic_read(&cpu->trace_dstate_delayed_req))) {
                     bitmap_copy(cpu->trace_dstate, cpu->trace_dstate_delayed,
                                 trace_get_vcpu_event_count());
+                    tb_flush_jmp_cache_all(cpu);
+                    /* avoid chaining TBs with different dstates */
+                    last_tb = NULL;
                 }
 
                 /* Try to align the host and virtual clocks
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 57cd978578..ae74f61ea2 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -200,6 +200,10 @@  static inline void tlb_flush_by_mmuidx(CPUState *cpu, ...)
 #define USE_DIRECT_JUMP
 #endif
 
+/**
+ * TranslationBlock:
+ * @trace_vcpu_dstate: Per-vCPU dynamic tracing state used to generate this TB.
+ */
 struct TranslationBlock {
     target_ulong pc;   /* simulated PC corresponding to this block (EIP + CS base) */
     target_ulong cs_base; /* CS base for this block */
@@ -215,6 +219,7 @@  struct TranslationBlock {
 #define CF_IGNORE_ICOUNT 0x40000 /* Do not generate icount code */
 
     uint16_t invalid;
+    TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
 
     void *tc_ptr;    /* pointer to the translated code */
     uint8_t *tc_search;  /* pointer to search data */
diff --git a/include/exec/tb-hash-xx.h b/include/exec/tb-hash-xx.h
index 2c40b5c466..0a18801fd3 100644
--- a/include/exec/tb-hash-xx.h
+++ b/include/exec/tb-hash-xx.h
@@ -35,6 +35,7 @@ 
 #define EXEC_TB_HASH_XX_H
 
 #include "qemu/bitops.h"
+#include "qemu-common.h"
 
 #define PRIME32_1   2654435761U
 #define PRIME32_2   2246822519U
@@ -49,7 +50,8 @@ 
  * contiguous in memory.
  */
 static inline
-uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
+uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e,
+                       TRACE_QHT_VCPU_DSTATE_TYPE f)
 {
     uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2;
     uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2;
@@ -83,6 +85,10 @@  uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
     h32 += e * PRIME32_3;
     h32  = rol32(h32, 17) * PRIME32_4;
 
+    QEMU_BUILD_BUG_ON(sizeof(TRACE_QHT_VCPU_DSTATE_TYPE) != sizeof(uint32_t));
+    h32 += f * PRIME32_3;
+    h32  = rol32(h32, 17) * PRIME32_4;
+
     h32 ^= h32 >> 15;
     h32 *= PRIME32_2;
     h32 ^= h32 >> 13;
diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
index 2c27490cb8..a042f24c97 100644
--- a/include/exec/tb-hash.h
+++ b/include/exec/tb-hash.h
@@ -46,9 +46,10 @@  static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
 }
 
 static inline
-uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags)
+uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc,
+                      uint32_t flags, uint32_t trace_vcpu_dstate)
 {
-    return tb_hash_func5(phys_pc, pc, flags);
+    return tb_hash_func6(phys_pc, pc, flags, trace_vcpu_dstate);
 }
 
 #endif
diff --git a/include/qemu-common.h b/include/qemu-common.h
index 1430390eb6..aaaa73a6fe 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -151,4 +151,7 @@  void page_size_init(void);
  * returned. */
 bool dump_in_progress(void);
 
+/* Use a macro to allow safe changes to its size in the future */
+#define TRACE_QHT_VCPU_DSTATE_TYPE uint32_t
+
 #endif
diff --git a/tests/qht-bench.c b/tests/qht-bench.c
index 2afa09d859..11c1cec766 100644
--- a/tests/qht-bench.c
+++ b/tests/qht-bench.c
@@ -103,7 +103,7 @@  static bool is_equal(const void *obj, const void *userp)
 
 static inline uint32_t h(unsigned long v)
 {
-    return tb_hash_func5(v, 0, 0);
+    return tb_hash_func6(v, 0, 0, 0);
 }
 
 /*
diff --git a/trace/control-target.c b/trace/control-target.c
index aba8db55de..61e6a4545b 100644
--- a/trace/control-target.c
+++ b/trace/control-target.c
@@ -82,7 +82,10 @@  void trace_event_set_vcpu_state_dynamic(CPUState *vcpu,
             }
             (*ev->dstate)--;
         }
+        /* Make sure next translated/executed TB uses the new dstate */
         atomic_set(&vcpu->trace_dstate_delayed_req, true);
+        /* NOTE: checked by all TBs in gen_tb_start() */
+        atomic_set(&vcpu->tcg_exit_req, 1);
     }
 }
 
diff --git a/trace/control.h b/trace/control.h
index 80d326c4d1..cab84a0308 100644
--- a/trace/control.h
+++ b/trace/control.h
@@ -165,6 +165,9 @@  void trace_event_set_state_dynamic(TraceEvent *ev, bool state);
  * Set the dynamic tracing state of an event for the given vCPU.
  *
  * Pre-condition: trace_event_get_vcpu_state_static(ev) == true
+ *
+ * Note: Changes for execution-time events with the 'tcg' property will not be
+ *       propagated until the next TB is executed (iff executing in TCG mode).
  */
 void trace_event_set_vcpu_state_dynamic(CPUState *vcpu,
                                         TraceEvent *ev, bool state);
diff --git a/translate-all.c b/translate-all.c
index 29ccb9e546..6e1b1d474c 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -54,6 +54,7 @@ 
 #include "exec/tb-hash.h"
 #include "translate-all.h"
 #include "qemu/bitmap.h"
+#include "qemu/error-report.h"
 #include "qemu/timer.h"
 #include "exec/log.h"
 
@@ -813,6 +814,12 @@  static void tb_htable_init(void)
 {
     unsigned int mode = QHT_MODE_AUTO_RESIZE;
 
+    /* Ensure TB hash function covers the bitmap size */
+    if (DIV_ROUND_UP(trace_get_vcpu_event_count(), BITS_PER_BYTE) >
+        sizeof(TRACE_QHT_VCPU_DSTATE_TYPE)) {
+        error_report("too many 'vcpu' events for the TB hash function");
+    }
+
     qht_init(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE, mode);
 }
 
@@ -1106,7 +1113,7 @@  void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
 
     /* remove the TB from the hash list */
     phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
-    h = tb_hash_func(phys_pc, tb->pc, tb->flags);
+    h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
     qht_remove(&tcg_ctx.tb_ctx.htable, tb, h);
 
     /* remove the TB from the page list */
@@ -1251,7 +1258,7 @@  static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
     }
 
     /* add in the hash table */
-    h = tb_hash_func(phys_pc, tb->pc, tb->flags);
+    h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
     qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
 
 #ifdef DEBUG_TB_CHECK
@@ -1270,6 +1277,7 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     target_ulong virt_page2;
     tcg_insn_unit *gen_code_buf;
     int gen_code_size, search_size;
+    unsigned long trace_vcpu_dstate_bitmap;
 #ifdef CONFIG_PROFILER
     int64_t ti;
 #endif
@@ -1294,6 +1302,10 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
     tb->cs_base = cs_base;
     tb->flags = flags;
     tb->cflags = cflags;
+    bitmap_copy(&trace_vcpu_dstate_bitmap, cpu->trace_dstate,
+                trace_get_vcpu_event_count());
+    memcpy(&tb->trace_vcpu_dstate, &trace_vcpu_dstate_bitmap,
+           sizeof(tb->trace_vcpu_dstate));
 
 #ifdef CONFIG_PROFILER
     tcg_ctx.tb_count1++; /* includes aborted translations because of