@@ -67,37 +67,15 @@ typedef uint64_t target_ulong;
#define CPU_TLB_ENTRY_BITS 5
#endif
-/* TCG_TARGET_TLB_DISPLACEMENT_BITS is used in CPU_TLB_BITS to ensure that
- * the TLB is not unnecessarily small, but still small enough for the
- * TLB lookup instruction sequence used by the TCG target.
- *
- * TCG will have to generate an operand as large as the distance between
- * env and the tlb_table[NB_MMU_MODES - 1][0].addend. For simplicity,
- * the TCG targets just round everything up to the next power of two, and
- * count bits. This works because: 1) the size of each TLB is a largish
- * power of two, 2) and because the limit of the displacement is really close
- * to a power of two, 3) the offset of tlb_table[0][0] inside env is smaller
- * than the size of a TLB.
- *
- * For example, the maximum displacement 0xFFF0 on PPC and MIPS, but TCG
- * just says "the displacement is 16 bits". TCG_TARGET_TLB_DISPLACEMENT_BITS
- * then ensures that tlb_table at least 0x8000 bytes large ("not unnecessarily
- * small": 2^15). The operand then will come up smaller than 0xFFF0 without
- * any particular care, because the TLB for a single MMU mode is larger than
- * 0x10000-0xFFF0=16 bytes. In the end, the maximum value of the operand
- * could be something like 0xC000 (the offset of the last TLB table) plus
- * 0x18 (the offset of the addend field in each TLB entry) plus the offset
- * of tlb_table inside env (which is non-trivial but not huge).
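+/*
+ * Each per-MMU-index TLB starts out with 2**DEFAULT_CPU_TLB_BITS (256)
+ * entries and is never shrunk below 2**MIN_CPU_TLB_BITS (64) entries; with
+ * 4K pages these cover 1M and 256K of guest address space, respectively.
+ */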
+#define MIN_CPU_TLB_BITS 6
+#define DEFAULT_CPU_TLB_BITS 8
+/*
+ * Assuming TARGET_PAGE_BITS==12, with 2**22 entries we can cover 2**(22+12) ==
+ * 2**34 == 16G of address space. This is roughly what one would expect a
+ * TLB to cover in a modern (as of 2018) x86_64 CPU; for instance, Intel
+ * Skylake's Level-2 STLB has 16 entries for 1G pages.
*/
-#define CPU_TLB_BITS \
- MIN(8, \
- TCG_TARGET_TLB_DISPLACEMENT_BITS - CPU_TLB_ENTRY_BITS - \
- (NB_MMU_MODES <= 1 ? 0 : \
- NB_MMU_MODES <= 2 ? 1 : \
- NB_MMU_MODES <= 4 ? 2 : \
- NB_MMU_MODES <= 8 ? 3 : 4))
-
-#define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
+#define MAX_CPU_TLB_BITS 22
typedef struct CPUTLBEntry {
/* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
@@ -143,6 +121,7 @@ typedef struct CPUIOTLBEntry {
typedef struct CPUTLBDesc {
size_t n_used_entries;
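+    /* flushes that found the TLB sparsely used; drives the shrink heuristic */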
+ size_t n_flushes_low_rate;
} CPUTLBDesc;
#define CPU_COMMON_TLB \
@@ -80,12 +80,13 @@ void tlb_init(CPUState *cpu)
qemu_spin_init(&env->tlb_lock);
for (i = 0; i < NB_MMU_MODES; i++) {
- size_t n_entries = CPU_TLB_SIZE;
+ size_t n_entries = 1 << DEFAULT_CPU_TLB_BITS;
env->tlb_desc[i].n_used_entries = 0;
+ env->tlb_desc[i].n_flushes_low_rate = 0;
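+    /* tlb_mask is kept shifted by CPU_TLB_ENTRY_BITS, i.e. it masks byte
+     * offsets into tlb_table rather than entry indices.
+     */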
env->tlb_mask[i] = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
env->tlb_table[i] = g_new(CPUTLBEntry, n_entries);
- env->iotlb[i] = g_new0(CPUIOTLBEntry, n_entries);
+ env->iotlb[i] = g_new(CPUIOTLBEntry, n_entries);
}
}
@@ -121,6 +122,40 @@ size_t tlb_flush_count(void)
return count;
}
+/* Call with tlb_lock held */
+static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
+{
+ CPUTLBDesc *desc = &env->tlb_desc[mmu_idx];
+ size_t old_size = tlb_n_entries(env, mmu_idx);
+ size_t rate = desc->n_used_entries * 100 / old_size;
+ size_t new_size = old_size;
+
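+    /*
+     * Grow aggressively (4x) when the table is completely full, and more
+     * conservatively (2x) when more than 70% of entries are in use. Shrink
+     * by half only after 100 flushes have seen less than 30% use since the
+     * last resize, so that short-lived dips in usage do not cause thrashing.
+     */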
+ if (rate == 100) {
+ new_size = MIN(old_size << 2, 1 << MAX_CPU_TLB_BITS);
+ } else if (rate > 70) {
+ new_size = MIN(old_size << 1, 1 << MAX_CPU_TLB_BITS);
+ } else if (rate < 30) {
+ desc->n_flushes_low_rate++;
+ if (desc->n_flushes_low_rate == 100) {
+ new_size = MAX(old_size >> 1, 1 << MIN_CPU_TLB_BITS);
+ desc->n_flushes_low_rate = 0;
+ }
+ }
+
+ if (new_size == old_size) {
+ return;
+ }
+
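+    /* Callers wipe the whole table right after resizing, so the old
+     * contents can simply be dropped instead of being copied over.
+     */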
+ g_free(env->tlb_table[mmu_idx]);
+ g_free(env->iotlb[mmu_idx]);
+
+ /* desc->n_used_entries is cleared by the caller */
+ desc->n_flushes_low_rate = 0;
+ env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
+ env->tlb_table[mmu_idx] = g_new(CPUTLBEntry, new_size);
+ env->iotlb[mmu_idx] = g_new(CPUIOTLBEntry, new_size);
+}
+
/* This is OK because CPU architectures generally permit an
* implementation to drop entries from the TLB at any time, so
* flushing more entries than required is only an efficiency issue,
@@ -150,6 +185,7 @@ static void tlb_flush_nocheck(CPUState *cpu)
*/
qemu_spin_lock(&env->tlb_lock);
for (i = 0; i < NB_MMU_MODES; i++) {
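+        /* resize under tlb_lock before wiping the (possibly reallocated) table */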
+ tlb_mmu_resize_locked(env, i);
memset(env->tlb_table[i], -1, sizeof_tlb(env, i));
env->tlb_desc[i].n_used_entries = 0;
}
@@ -213,6 +249,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
tlb_debug("%d\n", mmu_idx);
+ tlb_mmu_resize_locked(env, mmu_idx);
memset(env->tlb_table[mmu_idx], -1, sizeof_tlb(env, mmu_idx));
memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
env->tlb_desc[mmu_idx].n_used_entries = 0;
@@ -1626,7 +1626,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
}
if (TCG_TYPE_PTR == TCG_TYPE_I64) {
hrexw = P_REXW;
- if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
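+    /* The TLB can be resized at run time, so decide whether 64-bit
+     * operations are needed based on the largest table we may ever use.
+     */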
+ if (TARGET_PAGE_BITS + MAX_CPU_TLB_BITS > 32) {
tlbtype = TCG_TYPE_I64;
tlbrexw = P_REXW;
}