
[v3,11/11] translate-all: add tb hash bucket info to 'info jit' dump

Message ID 571A82B8.5080908@twiddle.net (mailing list archive)
State New, archived

Commit Message

Richard Henderson April 22, 2016, 7:59 p.m. UTC
On 04/22/2016 10:41 AM, Richard Henderson wrote:
> On 04/19/2016 04:07 PM, Emilio G. Cota wrote:
>> +    ht_avg_len = qht_avg_bucket_chain_length(&tcg_ctx.tb_ctx.htable, &ht_heads);
>> +    cpu_fprintf(f, "TB hash avg chain   %0.5f buckets\n", ht_avg_len);
>> +    cpu_fprintf(f, "TB hash size        %zu head buckets\n", ht_heads);
> 
> 
> I think the accounting is questionable here.
> 
> Consider the following data:
> 
> TB count             230467/671088
> TB invalidate count  25915
> TB hash avg chain    1.03073 buckets
> TB hash size         131072 head buckets
> 
> This means that we've got 230467 - 25915 = 204552 active TBs, installed into a
> hash table with 131072 heads.  For a perfectly uniform distribution of TBs,
> that would be an average chain length of 204552 / 131072 = 1.56.
> 
> In order to get the average down to 1.03, there must be a substantial number
> of heads with zero entries.
> 
> I think perhaps it might be more enlightening to separately account for empty
> and non-empty heads.  E.g.
> 
> TB hash buckets used  xxxx/131072
> TB hash avg chain     yyyy
> 
> where xxxx is the number of non-empty heads, and yyyy = |TBs| / xxxx.
> 
> I also wonder if it wouldn't be better to size the hash table as appropriate
> for the maximum number of allowable TBs.
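
To pin down the proposed accounting, here is a minimal C sketch of those two
numbers (the names, and the used-heads value in main(), are illustrative, not
taken from the QEMU tree):

#include <stdio.h>

/* Illustrative only: compute the proposed stats from raw counts. */
static void report_tb_hash(unsigned active_tbs, unsigned used_heads,
                           unsigned total_heads)
{
    /* Chain length a perfectly uniform distribution would give. */
    double uniform = (double)active_tbs / total_heads;
    /* Proposed metric: average over non-empty heads only. */
    double avg_chain = used_heads ? (double)active_tbs / used_heads : 0.0;

    printf("TB hash buckets used  %u/%u\n", used_heads, total_heads);
    printf("TB hash avg chain     %.3f (%.3f if uniform)\n",
           avg_chain, uniform);
}

int main(void)
{
    /* Counts quoted above: 230467 - 25915 active TBs, 131072 heads.
     * The used-heads figure here is made up for illustration. */
    report_tb_hash(230467 - 25915, 120000, 131072);
    return 0;
}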

FWIW, so that I could get an idea of how the stats change as we improve the
hashing, I inserted the attachment 1 patch between patches 5 and 6, and with
attachment 2 attempting to fix the accounting for patches 9 and 10.

For booting an alpha kernel to login prompt:

Before hashing changes (@5/11)

TB count             175363/671088
TB invalidate count  3996
TB hash buckets      31731/32768
TB hash avg chain    5.289 max=59

After xxhash patch (@7/11)

TB hash buckets      32582/32768
TB hash avg chain    5.260 max=18

So far so good!

After qht patches (@11/11)

TB hash buckets      94360/131072
TB hash avg chain    1.774 max=8

Do note that those last numbers are off: 1.774 avg * 94360 used buckets =
167394 total entries, which is far from 171367, the correct number of total
entries.

I'm tempted to pull over gcc's non-chaining hash table implementation
(libiberty/hashtab.c, still gplv2+) and compare...



r~
From cdc7b3631fd78bd2e31d2823f7543e2a56681149 Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@twiddle.net>
Date: Fri, 22 Apr 2016 11:28:52 -0700
Subject: translate-all: Add hashtable accounting to info jit

Dump hash table occupancy numbers with "info jit".

Signed-off-by: Richard Henderson <rth@twiddle.net>
From 7f1d677f3d085b5891e1adbd5f602185d68ba81a Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@twiddle.net>
Date: Fri, 22 Apr 2016 12:50:00 -0700
Subject: fixup to {09,10,11}/11


diff --git a/include/qemu/qht.h b/include/qemu/qht.h
index a0a1aa8..2d0b58f 100644
--- a/include/qemu/qht.h
+++ b/include/qemu/qht.h
@@ -49,6 +49,14 @@ void qht_grow(struct qht *ht);
 
 void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp,
                  uint32_t hash);
-double qht_avg_bucket_chain_length(struct qht *ht, size_t *n_head_buckets);
+
+struct qht_stats {
+    size_t used_buckets;
+    size_t max_buckets;
+    size_t used_entries;
+    size_t max_chain;
+};
+
+struct qht_stats qht_statistics(struct qht *ht);
 
 #endif /* QHT_H */
diff --git a/translate-all.c b/translate-all.c
index 3b73b46..a9ceb0a 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -1664,8 +1664,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
 {
     size_t target_code_size, max_target_code_size;
     unsigned direct_jmp_count, direct_jmp2_count, cross_page;
-    unsigned used_buckets, max_chain, hash_tbs;
-    TranslationBlock *tb;
+    struct qht_stats hinfo;
     int i;
 
     target_code_size = 0;
@@ -1673,35 +1672,23 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     cross_page = 0;
     direct_jmp_count = 0;
     direct_jmp2_count = 0;
-    used_buckets = 0;
-    hash_tbs = 0;
-    max_chain = 0;
-
-    for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) {
-        if (tcg_ctx.tb_ctx.tb_phys_hash[i]) {
-            unsigned this_chain = 0;
-            for (tb = tcg_ctx.tb_ctx.tb_phys_hash[i]; tb != NULL;
-                 tb = tb->phys_hash_next) {
-                this_chain++;
-                hash_tbs++;
-                target_code_size += tb->size;
-                if (tb->page_addr[1] != -1) {
-                    cross_page++;
-                }
-                if (tb->tb_next_offset[0] != 0xffff) {
-                    direct_jmp_count++;
-                    if (tb->tb_next_offset[1] != 0xffff) {
-                        direct_jmp2_count++;
-                    }
-                }
-            }
-            if (this_chain > max_chain) {
-                max_chain = this_chain;
+
+    for (i = 0; i < tcg_ctx.tb_ctx.nb_tbs; i++) {
+        const TranslationBlock *tb = &tcg_ctx.tb_ctx.tbs[i];
+        target_code_size += tb->size;
+        if (tb->page_addr[1] != -1) {
+            cross_page++;
+        }
+        if (tb->tb_next_offset[0] != 0xffff) {
+            direct_jmp_count++;
+            if (tb->tb_next_offset[1] != 0xffff) {
+                direct_jmp2_count++;
             }
-            used_buckets++;
         }
     }
-    assert(hash_tbs ==
+
+    hinfo = qht_statistics(&tcg_ctx.tb_ctx.htable);
+    assert(hinfo.used_entries ==
            tcg_ctx.tb_ctx.nb_tbs - tcg_ctx.tb_ctx.tb_phys_invalidate_count);
 
     cpu_fprintf(f, "Translation buffer state:\n");
@@ -1731,11 +1718,12 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
                 direct_jmp2_count,
                 tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp2_count * 100) /
                         tcg_ctx.tb_ctx.nb_tbs : 0);
-    cpu_fprintf(f, "TB hash buckets     %u/%d\n",
-                used_buckets, CODE_GEN_PHYS_HASH_SIZE);
-    cpu_fprintf(f, "TB hash avg chain   %0.3f max=%u\n",
-                used_buckets ? (double)hash_tbs / used_buckets : 0.0,
-                max_chain);
+    cpu_fprintf(f, "TB hash buckets     %zu/%zu\n",
+                hinfo.used_buckets, hinfo.max_buckets);
+    cpu_fprintf(f, "TB hash avg chain   %0.3f max=%zu\n",
+                hinfo.used_buckets
+                ? (double)hinfo.used_entries / hinfo.used_buckets : 0.0,
+                hinfo.max_chain);
     cpu_fprintf(f, "\nStatistics:\n");
     cpu_fprintf(f, "TB flush count      %d\n", tcg_ctx.tb_ctx.tb_flush_count);
     cpu_fprintf(f, "TB invalidate count %d\n",
diff --git a/util/qht.c b/util/qht.c
index 05ea5e8..535057b 100644
--- a/util/qht.c
+++ b/util/qht.c
@@ -556,35 +556,45 @@ void qht_grow(struct qht *ht)
  * value should be close to 1.
  * Note that each bucket tracks up to QHT_BUCKET_ENTRIES items.
  */
-double qht_avg_bucket_chain_length(struct qht *ht, size_t *n_head_buckets)
+struct qht_stats qht_statistics(struct qht *ht)
 {
     struct qht_map *map;
-    size_t count = 0;
-    size_t i;
+    struct qht_stats s = {};
+    size_t i, n;
 
     map = atomic_read(&ht->map);
     /* paired with smp_wmb() before setting ht->map */
     smp_rmb();
+    s.max_buckets = n = map->n;
 
-    for (i = 0; i < map->n; i++) {
+    for (i = 0; i < n; i++) {
         struct qht_bucket *head = &map->buckets[i];
         struct qht_bucket *b;
-        size_t bucket_count;
+        size_t this_chain;
         uint32_t version;
 
         do {
             version = seqlock_read_begin(&head->sequence);
-            bucket_count = 0;
+            this_chain = 0;
             b = head;
             do {
-                bucket_count++;
+                int j;
+                for (j = 0; j < QHT_BUCKET_ENTRIES; j++) {
+                    if (b->hashes[j]) {
+                        this_chain++;
+                    }
+                }
                 b = b->next;
             } while (b);
         } while (seqlock_read_retry(&head->sequence, version));
-        count += bucket_count;
-    }
-    if (n_head_buckets) {
-        *n_head_buckets = map->n;
+        if (this_chain != 0) {
+            s.used_entries += this_chain;
+            if (s.max_chain < this_chain) {
+                s.max_chain = this_chain;
+            }
+            s.used_buckets++;
+        }
     }
-    return (double)count / map->n;
+
+    return s;
 }

Comments

Emilio Cota April 22, 2016, 11:57 p.m. UTC | #1
On Fri, Apr 22, 2016 at 12:59:52 -0700, Richard Henderson wrote:
> FWIW, so that I could get an idea of how the stats change as we improve the
> hashing, I inserted the attachment 1 patch between patches 5 and 6, and with
> attachment 2 attempting to fix the accounting for patches 9 and 10.

For qht, I dislike the approach of reporting "avg chain" per-element,
instead of per-bucket. Performance for a bucket whose entries are
all valid is virtually the same as that of a bucket that only
has one valid element; thus, with per-bucket reporting, we'd say that
the chain length is 1 in both cases, i.e. "perfect". With per-element
reporting, we'd report 4 (on a 64-bit host, since that's the value of
QHT_BUCKET_ENTRIES) when the bucket is full, which IMO gives the
wrong idea (users would think they're in trouble, when they're not).

Using the avg-bucket-chain metric you can test how good the hashing is.
For instance, the metric is 1.01 for xxhash with phys_pc, pc and flags
(i.e. func5), and 1.21 if func5 takes only a valid phys_pc (the other two are 0).

I think reporting fully empty buckets as well as the longest chain
(of buckets for qht) in addition to this metric is a good idea, though.
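
A minimal sketch of the two counting schemes being contrasted here;
"toy_bucket" is an illustrative stand-in for a qht bucket, assuming
QHT_BUCKET_ENTRIES == 4 as on 64-bit hosts:

#include <stdio.h>
#include <stdint.h>

#define QHT_BUCKET_ENTRIES 4   /* 64-bit host, per the discussion */

/* Simplified stand-in for a qht bucket: entry hashes plus overflow link. */
struct toy_bucket {
    uint32_t hashes[QHT_BUCKET_ENTRIES];
    struct toy_bucket *next;
};

int main(void)
{
    /* One head bucket, all four slots valid, no chained bucket. */
    struct toy_bucket full = { .hashes = { 1, 2, 3, 4 }, .next = NULL };

    unsigned buckets = 0, entries = 0;
    for (struct toy_bucket *b = &full; b; b = b->next) {
        buckets++;
        for (int i = 0; i < QHT_BUCKET_ENTRIES; i++) {
            entries += !!b->hashes[i];
        }
    }

    /* Per-bucket reporting says "chain of 1"; per-element says "chain of 4",
     * even though both describe the same single-cache-line lookup. */
    printf("per-bucket chain:  %u\n", buckets);
    printf("per-element chain: %u\n", entries);
    return 0;
}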

> For booting an alpha kernel to login prompt:
> 
> Before hashing changes (@5/11)
> 
> TB count             175363/671088
> TB invalidate count  3996
> TB hash buckets      31731/32768
> TB hash avg chain    5.289 max=59
> 
> After xxhash patch (@7/11)
> 
> TB hash buckets      32582/32768
> TB hash avg chain    5.260 max=18
> 
> So far so good!
> 
> After qht patches (@11/11)
> 
> TB hash buckets      94360/131072
> TB hash avg chain    1.774 max=8
> 
> Do note that those last numbers are off: 1.774 avg * 94360 used buckets =
> 167394 total entries, which is far from 171367, the correct number of total
> entries.

If those numbers are off, then either this
    assert(hinfo.used_entries ==
           tcg_ctx.tb_ctx.nb_tbs - tcg_ctx.tb_ctx.tb_phys_invalidate_count);
should trigger, or the accounting isn't right.

Another option is that the "TB count - invalidate_count" is different
for each test you ran. I think this is what's going on, otherwise we couldn't
explain why the first report ("before 5/11") is also "wrong":

  5.289*31731=167825.259

Only the second report ("after 7/11") seems good (taking into account
the limited precision of just 3 decimal places):
  5.26*32582=171381.32 ~= 171367
which leads me to believe that you've used the TB and invalidate
counts from that test.

I just tested your patches (on an ARM bootup) and the assert doesn't trigger,
and the stats are spot on for "after 11/11":

TB count            643610/2684354
TB hash buckets     369534/524288
TB hash avg chain   1.729 max=8
TB flush count      0
TB invalidate count 4718

1.729*369534=638924.286, which is ~= 643610-4718 = 638892.

> I'm tempted to pull over gcc's non-chaining hash table implementation
> (libiberty/hashtab.c, still gplv2+) and compare...

You can try, but I think performance wouldn't be great, because
the comparison function would be called way too often due to the
ht using open addressing. The problem there is not only the comparisons
themselves, but all the cache lines needed to read the fields being
compared. I haven't tested libiberty's htable, but I did test
the htable in concurrencykit[1], which also uses open addressing.

With ck's ht, performance was not good when booting ARM: IIRC ~30% of
runtime was spent on tb_cmp(); I also added the full hash to each TB so
that it would be compared first, but it didn't make a difference since
the delay was due to loading the cache line (I saw this with perf(1)'s
annotated code, which showed that ~80% of the time spent in tb_cmp()
was in performing the first load of the TB's fields).

This led me to a design that had buckets with a small set of
hash & pointer pairs, all in the same cache line as the head (then
I discovered somebody else had thought of this, and that's why there's
a link to the CLHT paper in qht.c).
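
A sketch of that bucket layout: the hashes[], sequence and next fields match
what the qht.c diff above touches, but the pointer array, field sizes and
64-byte alignment are assumptions for illustration, not the exact definition:

#include <stdbool.h>
#include <stdint.h>

#define QHT_BUCKET_ENTRIES 4             /* 64-bit host, per the discussion */

struct sketch_bucket {
    uint32_t seq;                        /* stand-in for the seqlock        */
    uint32_t hashes[QHT_BUCKET_ENTRIES]; /* compared first, cheaply         */
    void *pointers[QHT_BUCKET_ENTRIES];  /* TBs, dereferenced only on match */
    struct sketch_bucket *next;          /* overflow chain                  */
} __attribute__((aligned(64)));          /* head fits in one cache line     */

/* Lookup sketch (seqlock retry loop omitted): reject entries on the 32-bit
 * hash alone, so the expensive comparison function -- and the cache line
 * holding the TB's fields -- is only touched on a hash match. */
static void *sketch_lookup(struct sketch_bucket *head, uint32_t hash,
                           bool (*cmp)(const void *obj, const void *userp),
                           const void *userp)
{
    for (struct sketch_bucket *b = head; b; b = b->next) {
        for (int i = 0; i < QHT_BUCKET_ENTRIES; i++) {
            if (b->hashes[i] == hash && b->pointers[i] &&
                cmp(b->pointers[i], userp)) {
                return b->pointers[i];
            }
        }
    }
    return NULL;
}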

BTW I tested ck's htable also because of a requirement we have for MTTCG,
which is to support lock-free concurrent lookups. AFAICT libiberty's ht
doesn't support this, so it might be a bit faster than ck's.

Thanks,

		Emilio

[1] http://concurrencykit.org/
    More info on their htable implementation here:
    http://backtrace.io/blog/blog/2015/03/13/workload-specialization/
Richard Henderson April 24, 2016, 7:46 p.m. UTC | #2
On 04/22/2016 04:57 PM, Emilio G. Cota wrote:
> On Fri, Apr 22, 2016 at 12:59:52 -0700, Richard Henderson wrote:
>> FWIW, so that I could get an idea of how the stats change as we improve the
>> hashing, I inserted the attachment 1 patch between patches 5 and 6, and with
>> attachment 2 attempting to fix the accounting for patches 9 and 10.
>
> For qht, I dislike the approach of reporting "avg chain" per-element,
> instead of per-bucket. Performance for a bucket whose entries are
> all valid is virtually the same as that of a bucket that only
> has one valid element; thus, with per-bucket reporting, we'd say that
> the chain length is 1 in both cases, i.e. "perfect". With per-element
> reporting, we'd report 4 (on a 64-bit host, since that's the value of
> QHT_BUCKET_ENTRIES) when the bucket is full, which IMO gives the
> wrong idea (users would think they're in trouble, when they're not).

But otherwise you have no way of knowing how full the buckets are.  The bucket 
size is just something that you have to keep in mind.

> If those numbers are off, then either this
>      assert(hinfo.used_entries ==
>             tcg_ctx.tb_ctx.nb_tbs - tcg_ctx.tb_ctx.tb_phys_invalidate_count);
> should trigger, or the accounting isn't right.

I think I used an NDEBUG build, so these weren't effective.

> Only the second report ("after 7/11") seems good (taking into account
> the limited precision of just 3 decimal places):
>    5.26*32582=171381.32 ~= 171367
> which leads me to believe that you've used the TB and invalidate
> counts from that test.

The TB and invalidate numbers are repeatable; the same every time.

> You can try, but I think performance wouldn't be great, because
> the comparison function would be called way too often due to the
> ht using open addressing. The problem there is not only the comparisons
> themselves, but all the cache lines needed to read the fields being
> compared. I haven't tested libiberty's htable, but I did test
> the htable in concurrencykit[1], which also uses open addressing.

You are right that having the full hash for primary comparison is a big win, 
especially with how complex our comparison functions are.  And you're right 
that we have to have two of them.

> This led me to a design that had buckets with a small set of
> hash & pointer pairs, all in the same cache line as the head (then
> I discovered somebody else had thought of this, and that's why there's
> a link to the CLHT paper in qht.c).

Fair.  It's a good design.


r~
Emilio Cota April 24, 2016, 10:06 p.m. UTC | #3
On Sun, Apr 24, 2016 at 12:46:08 -0700, Richard Henderson wrote:
> On 04/22/2016 04:57 PM, Emilio G. Cota wrote:
> >On Fri, Apr 22, 2016 at 12:59:52 -0700, Richard Henderson wrote:
> >>FWIW, so that I could get an idea of how the stats change as we improve the
> >>hashing, I inserted the attachment 1 patch between patches 5 and 6, and with
> >>attachment 2 attempting to fix the accounting for patches 9 and 10.
> >
> >For qht, I dislike the approach of reporting "avg chain" per-element,
> >instead of per-bucket. Performance for a bucket whose entries are
> >all valid is virtually the same as that of a bucket that only
> >has one valid element; thus, with per-bucket reporting, we'd say that
> >the chain length is 1 in both cases, i.e. "perfect". With per-element
> >reporting, we'd report 4 (on a 64-bit host, since that's the value of
> >QHT_BUCKET_ENTRIES) when the bucket is full, which IMO gives the
> >wrong idea (users would think they're in trouble, when they're not).
> 
> But otherwise you have no way of knowing how full the buckets are.  The
> bucket size is just something that you have to keep in mind.

I'll make some changes in v4 that I think will address both your and
my concerns:
- Report the number of empty buckets
- Do not count empty buckets when reporting avg bucket chain length
- Report average bucket occupancy (in %, so that QHT_BUCKET_ENTRIES
  does not have to be reported.)

> >If those numbers are off, then either this
> >     assert(hinfo.used_entries ==
> >            tcg_ctx.tb_ctx.nb_tbs - tcg_ctx.tb_ctx.tb_phys_invalidate_count);
> >should trigger, or the accounting isn't right.
> 
> I think I used an NDEBUG build, so these weren't effective.
> 
> >Only the second report ("after 7/11") seems good (taking into account
> >the limited precision of just 3 decimal places):
> >   5.26*32582=171381.32 ~= 171367
> >which leads me to believe that you've used the TB and invalidate
> >counts from that test.
> 
> The TB and invalidate numbers are repeatable; the same every time.

Then something else is going on, because both the 1st and 3rd tests are
way off. I'd re-test with assertions enabled.

Thanks,

		Emilio
Emilio Cota April 27, 2016, 2:43 a.m. UTC | #4
On Sun, Apr 24, 2016 at 18:06:51 -0400, Emilio G. Cota wrote:
> On Sun, Apr 24, 2016 at 12:46:08 -0700, Richard Henderson wrote:
> > On 04/22/2016 04:57 PM, Emilio G. Cota wrote:
> > >On Fri, Apr 22, 2016 at 12:59:52 -0700, Richard Henderson wrote:
> > >>FWIW, so that I could get an idea of how the stats change as we improve the
> > >>hashing, I inserted the attachment 1 patch between patches 5 and 6, and with
> > >>attachment 2 attempting to fix the accounting for patches 9 and 10.
> > >
> > >For qht, I dislike the approach of reporting "avg chain" per-element,
> > >instead of per-bucket. Performance for a bucket whose entries are
> > >all valid is virtually the same as that of a bucket that only
> > >has one valid element; thus, with per-bucket reporting, we'd say that
> > >the chain length is 1 in both cases, i.e. "perfect". With per-element
> > >reporting, we'd report 4 (on a 64-bit host, since that's the value of
> > >QHT_BUCKET_ENTRIES) when the bucket is full, which IMO gives the
> > >wrong idea (users would think they're in trouble, when they're not).
> > 
> > But otherwise you have no way of knowing how full the buckets are.  The
> > bucket size is just something that you have to keep in mind.
> 
> I'll make some changes in v4 that I think will address both your and
> my concerns:
> - Report the number of empty buckets
> - Do not count empty buckets when reporting avg bucket chain length
> - Report average bucket occupancy (in %, so that QHT_BUCKET_ENTRIES
>   does not have to be reported.)

How does the following look?

Example with good hashing, i.e. func5(phys_pc, pc, flags):
TB count            704242/1342156
[...]
TB hash buckets     386484/524288 (73.72% used)
TB hash occupancy   32.57% avg chain occupancy. Histogram: 0-10% [bars] 90-100%
TB hash avg chain   1.02 buckets. Histogram: 1 [bars] 3

Example with bad hashing, i.e. func5(phys_pc, 0, 0):
TB count            710748/1342156
[...]
TB hash buckets     113569/524288 (21.66% used)
TB hash occupancy   10.24% avg chain occupancy. Histogram: 0-10% [bars] 90-100%
TB hash avg chain   2.11 buckets. Histogram: 1 [bars] 93

Note that:

- "TB hash avg chain" does _not_ count empty buckets. This gives
  an idea of how many buckets a typical hit goes through.

- "TB hash occupancy" _does_ count empty buckets. It is called
  "avg chain occupancy" and not "avg occupancy" because the
  counts are only valid per-chain due to the seqlock protecting
  each chain.
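
For clarity, here is a sketch of how the two averages described above could be
accumulated per head-bucket chain; the helper and its signature are
illustrative, not the actual v4 code:

#include <stddef.h>

#define QHT_BUCKET_ENTRIES 4    /* 64-bit host, per the discussion */

struct chain_info {
    size_t buckets;    /* buckets in this chain, >= 1 */
    size_t entries;    /* valid entries across the chain */
};

static void account_chain(const struct chain_info *c,
                          double *occ_sum, size_t *n_heads,
                          double *chain_sum, size_t *n_used_heads)
{
    /* Occupancy counts every head bucket; empty ones contribute 0%. */
    double occ = c->entries
        ? (double)c->entries / (c->buckets * QHT_BUCKET_ENTRIES)
        : 0.0;
    *occ_sum += occ;
    (*n_heads)++;

    /* Avg chain only counts heads that hold at least one entry. */
    if (c->entries) {
        *chain_sum += c->buckets;
        (*n_used_heads)++;
    }
}

/*
 * Reported values would then be:
 *   TB hash buckets     n_used_heads / n_heads
 *   TB hash occupancy   100.0 * occ_sum / n_heads
 *   TB hash avg chain   chain_sum / n_used_heads
 */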

Thanks,

		Emilio
Richard Henderson April 28, 2016, 4:37 p.m. UTC | #5
On 04/26/2016 07:43 PM, Emilio G. Cota wrote:
> How does the following look?
> 
> Example with good hashing, i.e. func5(phys_pc, pc, flags):
> TB count            704242/1342156
> [...]
> TB hash buckets     386484/524288 (73.72% used)
> TB hash occupancy   32.57% avg chain occupancy. Histogram: 0-10% [bars] 90-100%
> TB hash avg chain   1.02 buckets. Histogram: 1 [bars] 3
> 
> Example with bad hashing, i.e. func5(phys_pc, 0, 0):
> TB count            710748/1342156
> [...]
> TB hash buckets     113569/524288 (21.66% used)
> TB hash occupancy   10.24% avg chain occupancy. Histogram: 0-10% [bars] 90-100%
> TB hash avg chain   2.11 buckets. Histogram: 1 [bars] 93
> 
> Note that:
> 
> - "TB hash avg chain" does _not_ count empty buckets. This gives
>   an idea of how many buckets a typical hit goes through.
> 
> - "TB hash occupancy" _does_ count empty buckets. It is called
>   "avg chain occupancy" and not "avg occupancy" because the
>   counts are only valid per-chain due to the seqlock protecting
>   each chain.

Looks really good.


r~

Patch

diff --git a/translate-all.c b/translate-all.c
index 1a8f68b..ed296d5 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -1671,39 +1671,55 @@  void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr)
 
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
 {
-    int i, target_code_size, max_target_code_size;
-    int direct_jmp_count, direct_jmp2_count, cross_page;
+    size_t target_code_size, max_target_code_size;
+    unsigned direct_jmp_count, direct_jmp2_count, cross_page;
+    unsigned used_buckets, max_chain, hash_tbs;
     TranslationBlock *tb;
+    int i;
 
     target_code_size = 0;
     max_target_code_size = 0;
     cross_page = 0;
     direct_jmp_count = 0;
     direct_jmp2_count = 0;
-    for (i = 0; i < tcg_ctx.tb_ctx.nb_tbs; i++) {
-        tb = &tcg_ctx.tb_ctx.tbs[i];
-        target_code_size += tb->size;
-        if (tb->size > max_target_code_size) {
-            max_target_code_size = tb->size;
-        }
-        if (tb->page_addr[1] != -1) {
-            cross_page++;
-        }
-        if (tb->tb_next_offset[0] != 0xffff) {
-            direct_jmp_count++;
-            if (tb->tb_next_offset[1] != 0xffff) {
-                direct_jmp2_count++;
+    used_buckets = 0;
+    hash_tbs = 0;
+    max_chain = 0;
+
+    for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) {
+        if (tcg_ctx.tb_ctx.tb_phys_hash[i]) {
+            unsigned this_chain = 0;
+            for (tb = tcg_ctx.tb_ctx.tb_phys_hash[i]; tb != NULL;
+                 tb = tb->phys_hash_next) {
+                this_chain++;
+                hash_tbs++;
+                target_code_size += tb->size;
+                if (tb->page_addr[1] != -1) {
+                    cross_page++;
+                }
+                if (tb->tb_next_offset[0] != 0xffff) {
+                    direct_jmp_count++;
+                    if (tb->tb_next_offset[1] != 0xffff) {
+                        direct_jmp2_count++;
+                    }
+                }
             }
+            if (this_chain > max_chain) {
+                max_chain = this_chain;
+            }
+            used_buckets++;
         }
     }
-    /* XXX: avoid using doubles ? */
+    assert(hash_tbs ==
+           tcg_ctx.tb_ctx.nb_tbs - tcg_ctx.tb_ctx.tb_phys_invalidate_count);
+
     cpu_fprintf(f, "Translation buffer state:\n");
     cpu_fprintf(f, "gen code size       %td/%zd\n",
                 tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer,
                 tcg_ctx.code_gen_highwater - tcg_ctx.code_gen_buffer);
     cpu_fprintf(f, "TB count            %d/%d\n",
             tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.code_gen_max_blocks);
-    cpu_fprintf(f, "TB avg target size  %d max=%d bytes\n",
+    cpu_fprintf(f, "TB avg target size  %zd max=%zd bytes\n",
             tcg_ctx.tb_ctx.nb_tbs ? target_code_size /
                     tcg_ctx.tb_ctx.nb_tbs : 0,
             max_target_code_size);
@@ -1717,13 +1733,18 @@  void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     cpu_fprintf(f, "cross page TB count %d (%d%%)\n", cross_page,
             tcg_ctx.tb_ctx.nb_tbs ? (cross_page * 100) /
                                     tcg_ctx.tb_ctx.nb_tbs : 0);
-    cpu_fprintf(f, "direct jump count   %d (%d%%) (2 jumps=%d %d%%)\n",
+    cpu_fprintf(f, "direct jump count   %u (%u%%) (2 jumps=%u %u%%)\n",
                 direct_jmp_count,
                 tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp_count * 100) /
                         tcg_ctx.tb_ctx.nb_tbs : 0,
                 direct_jmp2_count,
                 tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp2_count * 100) /
                         tcg_ctx.tb_ctx.nb_tbs : 0);
+    cpu_fprintf(f, "TB hash buckets     %u/%d\n",
+                used_buckets, CODE_GEN_PHYS_HASH_SIZE);
+    cpu_fprintf(f, "TB hash avg chain   %0.3f max=%u\n",
+                used_buckets ? (double)hash_tbs / used_buckets : 0.0,
+                max_chain);
     cpu_fprintf(f, "\nStatistics:\n");
     cpu_fprintf(f, "TB flush count      %d\n", tcg_ctx.tb_ctx.tb_flush_count);
     cpu_fprintf(f, "TB invalidate count %d\n",