diff mbox series

[2/2] tcg: use QTree instead of GTree

Message ID 20230111035536.309885-3-cota@braap.org (mailing list archive)
State New, archived
Headers show
Series fix for #285 | expand

Commit Message

Emilio Cota Jan. 11, 2023, 3:55 a.m. UTC
qemu-user can hang in a multi-threaded fork. One common
reason is that when creating a TB, between fork and exec
we manipulate a GTree whose memory allocator (GSlice) is
not fork-safe.

Although POSIX does not mandate it, the system's allocator
(e.g. tcmalloc, libc malloc) is probably fork-safe.

Fix some of these hangs by using QTree, which uses
the system's allocator.

For more details, see:
  https://gitlab.com/qemu-project/qemu/-/issues/285

Performance impact on linux-user:
- ~2% slowdown in spec06
- 1.05% slowdown in Nbench-int
- 4.51% slowdown in Nbench-fp

                                        x86_64 spec06
                      Host: AMD Ryzen 7 PRO 5850U with Radeon Graphics
  1.04 +--------------------------------------------------------------------------------------------------+
       |                                                                                    |             |
       |                                                                          qtree-gcc1|.int         |
       |                                                +-+                                 |             |
  1.02 |-+...............................................|..................................|...........+-|
       |                                                 |                                  |             |
       |                          +-+                    |                                  |             |
     1 |-+..........+-+............|............+-+......|..........................+-+.....|...........+-|
       |           **|**           |            *|**     |    *+-+                   |      |             |
       |           *+-+*           |            +-+*     |    *  *    +-+          **|**    |             |
       |           *   *   +-+    *|**          *  *   **|*   *  *     |           *+-+*    |     +-+     |
  0.98 |-+.........*...*....|.....*|.*..........*..*...*.|*...*..*...**|*..........*...*....|....**|**..+-|
       |           *   *  **|**   *| *   +-+    *  *   * |*   *  *   *+-+    +-+   *   *  **|**  *+-+*    |
       |    *+-+*  *   *  * | *   +-+*    |     *  *   * |*   *  *   *  *     |    *   *  * | *  *   *    |
       |    *+-+*  *   *  *+-+*   *  *    |     *  *   * |*   *  *   *  *     |    *   *  * | *  *   *    |
  0.96 |-+..*...*..*...*..*...*...*..*...*|**...*..*...*.|*...*..*...*..*.....|....*...*..*.|.*..*...*..+-|
       |    *   *  *   *  *   *   *  *   *| *   *  *   * |*   *  *   *  *   **|*   *   *  * | *  *   *    |
       |    *   *  *   *  *   *   *  *   +-+*   *  *   *+-+   *  *   *  *   * |*   *   *  * | *  *   *    |
       |    *   *  *   *  *   *   *  *   *  *   *  *   *  *   *  *   *  *   * |*   *   *  * | *  *   *    |
  0.94 |-+..*...*..*...*..*...*...*..*...*..*...*..*...*..*...*..*...*..*...*.|*...*...*..*.|.*..*...*..+-|
       |    *   *  *   *  *   *   *  *   *  *   *  *   *  *   *  *   *  *   * |*   *   *  * | *  *   *    |
       |    *   *  *   *  *   *   *  *   *  *   *  *   *  *   *  *   *  *   *+-+   *   *  * | *  *   *    |
  0.92 |-+..*...*..*...*..*...*...*..*...*..*...*..*...*..*...*..*...*..*...*..*...*...*..*.|.*..*...*..+-|
       |    *   *  *   *  *   *   *  *   *  *   *  *   *  *   *  *   *  *   *  *   *   *  *+-+*  *   *    |
       |    *   *  *   *  *   *   *  *   *  *   *  *   *  *   *  *   *  *   *  *   *   *  *   *  *   *    |
       |    *   *  *   *  *   *   *  *   *  *   *  *   *  *   *  *   *  *   *  *   *   *  *   *  *   *    |
   0.9 +--------------------------------------------------------------------------------------------------+
 400.perlben401.bzip2403.gcc429.m445.gob456.hmme45462.libqua464.h26471.omnet473483.xalancbmkgeomean

                         aarch64 NBench Integer Performance
                  Host: AMD Ryzen 7 PRO 5850U with Radeon Graphics
     81.2 +----------------------------------------------------------------+
          |                     +                    +                     |
          |                    ***                                         |
       81 |-+                   B                                        +-|
          |                    **##                                        |
     80.8 |-+                      ##                                    +-|
          |                          ##                                    |
          |                            ##                                  |
     80.6 |-+                            ##                              +-|
          |                                ##                              |
          |                                  ##                            |
     80.4 |-+                                  ##                        +-|
          |                                      ##                        |
     80.2 |-+                                      ##**                  +-|
          |                                          B                     |
          |                     +                    *                     |
       80 +----------------------------------------------------------------+
                          master-gcc12          qtree-gcc12
                                    QEMU version

                      aarch64 NBench Floating Point Performance
                  Host: AMD Ryzen 7 PRO 5850U with Radeon Graphics
     13.8 +----------------------------------------------------------------+
          |                    *B*                   +                     |
     13.7 |-+                  **##                                      +-|
          |                        ##                                      |
     13.6 |-+                        #                                   +-|
          |                           ##                                   |
     13.5 |-+                           #                                +-|
          |                              ##                                |
     13.4 |-+                              ##                            +-|
          |                                  #                             |
     13.3 |-+                                 ##                         +-|
          |                                     #                          |
     13.2 |-+                                    ##                      +-|
          |                                        ##**                    |
     13.1 |-+                                        B                   +-|
          |                     +                   ***                    |
       13 +----------------------------------------------------------------+
                          master-gcc12          qtree-gcc12
                                    QEMU version

Signed-off-by: Emilio Cota <cota@braap.org>
---
 accel/tcg/tb-maint.c | 17 +++++++++--------
 tcg/region.c         | 19 ++++++++++---------
 2 files changed, 19 insertions(+), 17 deletions(-)

Comments

Daniel P. Berrangé Jan. 11, 2023, 12:10 p.m. UTC | #1
On Tue, Jan 10, 2023 at 10:55:36PM -0500, Emilio Cota wrote:
> qemu-user can hang in a multi-threaded fork. One common
> reason is that when creating a TB, between fork and exec
> we manipulate a GTree whose memory allocator (GSlice) is
> not fork-safe.
> 
> Although POSIX does not mandate it, the system's allocator
> (e.g. tcmalloc, libc malloc) is probably fork-safe.
> 
> Fix some of these hangs by using QTree, which uses
> the system's allocator.
> 
> For more details, see:
>   https://gitlab.com/qemu-project/qemu/-/issues/285
> 
> Performance impact on linux-user:
> - ~2% slowdown in spec06
> - 1.05% slowdown in Nbench-int
> - 4.51% slowdown in Nbench-fp

What do you get *before* applying this patch, if you just run
linux-user with G_SLICE=always-malloc set ?

Also what libc impl were you testing with ? glibc or musl or something
else ?

With regards,
Daniel
Daniel P. Berrangé Jan. 11, 2023, 12:34 p.m. UTC | #2
On Tue, Jan 10, 2023 at 10:55:36PM -0500, Emilio Cota wrote:
> qemu-user can hang in a multi-threaded fork. One common
> reason is that when creating a TB, between fork and exec
> we manipulate a GTree whose memory allocator (GSlice) is
> not fork-safe.

BTW, I just checked latest glib status

  https://gitlab.gnome.org/GNOME/glib/-/issues/1079

it appears they're pretty close to deciding to delete the
GSlice impl and always use system malloc.

So if we do take this patch series it'll hopefully be a time
limited thing to carry. 


With regards,
Daniel
Daniel P. Berrangé Jan. 25, 2023, 3:58 p.m. UTC | #3
On Wed, Jan 11, 2023 at 12:34:29PM +0000, Daniel P. Berrangé wrote:
> On Tue, Jan 10, 2023 at 10:55:36PM -0500, Emilio Cota wrote:
> > qemu-user can hang in a multi-threaded fork. One common
> > reason is that when creating a TB, between fork and exec
> > we manipulate a GTree whose memory allocator (GSlice) is
> > not fork-safe.
> 
> BTW, I just checked latest glib status
> 
>   https://gitlab.gnome.org/GNOME/glib/-/issues/1079
> 
> it appears they're pretty close to deciding to delete the
> GSlice impl and always use system malloc.

They have now merged the code to delete the GSlice custom allocator.

So glib >= 2.76.0 should not exhibit a hang

> So if we do take this patch series it'll hopefully be a time
> limited thing to carry. 

So the question is whether the issue is critical enough that we want
to carry a workaround for a while, vs telling users to upgrade to
newer glib  (once 2.76 actually gets released)


With regards,
Daniel
Emilio Cota Jan. 29, 2023, 10:32 p.m. UTC | #4
On Wed, Jan 11, 2023 at 12:10:53 +0000, Daniel P. Berrangé wrote:
> On Tue, Jan 10, 2023 at 10:55:36PM -0500, Emilio Cota wrote:
> > Performance impact on linux-user:
> > - ~2% slowdown in spec06
> > - 1.05% slowdown in Nbench-int
> > - 4.51% slowdown in Nbench-fp
> 
> What do you get *before* applying this patch, if you just run
> linux-user with G_SLICE=always-malloc set ?
> 
> Also what libc impl were you testing with ? glibc or musl or something
> else ?

I've now done a few runs of nbench and the measurements I'm getting
are within noise. So I'd say no perf difference.
For SPEC I don't have time right now although I could do more runs
if you think they're needed.

I'm tempted for v2 to just remove these macrobenchmark numbers, since
with perf we can see that little time is spent on gtree anyway,
and at the moment I don't have time to do proper benchmarking.

Thanks,
		E.
Emilio Cota Jan. 29, 2023, 10:38 p.m. UTC | #5
On Wed, Jan 25, 2023 at 15:58:25 +0000, Daniel P. Berrangé wrote:
> On Wed, Jan 11, 2023 at 12:34:29PM +0000, Daniel P. Berrangé wrote:
> > On Tue, Jan 10, 2023 at 10:55:36PM -0500, Emilio Cota wrote:
> > > qemu-user can hang in a multi-threaded fork. One common
> > > reason is that when creating a TB, between fork and exec
> > > we manipulate a GTree whose memory allocator (GSlice) is
> > > not fork-safe.
> > 
> > BTW, I just checked latest glib status
> > 
> >   https://gitlab.gnome.org/GNOME/glib/-/issues/1079
> > 
> > it appears they're pretty close to deciding to delete the
> > GSlice impl and always use system malloc.
> 
> They have now merged the code to delete the GSlice custom allocator.
> 
> So glib >= 2.76.0 should not exhibit a hang
> 
> > So if we do take this patch series it'll hopefully be a time
> > limited thing to carry. 
> 
> So the question is whether the issue is critical enough that we want
> to carry a workaround for a while, vs telling users to upgrade to
> newer glib  (once 2.76 actually gets released)

That is great news!

Since this is a correctness issue, I think we should ship with qtree
and use it when configuring with glib <2.76.0. For later glib versions
we would just use gtree, e.g. via typedef + inline functions.

Once the minimum glib required by the configure script is >= 2.76.0,
then we'd remove qtree.

If that sounds like a good plan, I can send a v2.

Thanks,
		Emilio
Daniel P. Berrangé Jan. 30, 2023, 9:27 a.m. UTC | #6
On Sun, Jan 29, 2023 at 05:38:08PM -0500, Emilio Cota wrote:
> On Wed, Jan 25, 2023 at 15:58:25 +0000, Daniel P. Berrangé wrote:
> > On Wed, Jan 11, 2023 at 12:34:29PM +0000, Daniel P. Berrangé wrote:
> > > On Tue, Jan 10, 2023 at 10:55:36PM -0500, Emilio Cota wrote:
> > > > qemu-user can hang in a multi-threaded fork. One common
> > > > reason is that when creating a TB, between fork and exec
> > > > we manipulate a GTree whose memory allocator (GSlice) is
> > > > not fork-safe.
> > > 
> > > BTW, I just checked latest glib status
> > > 
> > >   https://gitlab.gnome.org/GNOME/glib/-/issues/1079
> > > 
> > > it appears they're pretty close to deciding to delete the
> > > GSlice impl and always use system malloc.
> > 
> > They have now merged the code to delete the GSlice custom allocator.
> > 
> > So glib >= 2.76.0 should not exhibit a hang
> > 
> > > So if we do take this patch series it'll hopefully be a time
> > > limited thing to carry. 
> > 
> > So the question is whether the issue is critical enough that we want
> > to carry a workaround for a while, vs telling users to upgrade to
> > newer glib  (once 2.76 actually gets released)
> 
> That is great news!
> 
> Since this is a correctness issue, I think we should ship with qtree
> and use it when configuring with glib <2.76.0. For later glib versions
> we would just use gtree, e.g. via typedef + inline functions.
> 
> Once the minimum glib required by the configure script is >= 2.76.0,
> then we'd remove qtree.
> 
> If that sounds like a good plan, I can send a v2.

I'm fine with it, but be good to have an opinion here from the TCG
subsystem maintainers, CC'ing them


With regards,
Daniel
Richard Henderson Jan. 30, 2023, 7:09 p.m. UTC | #7
On 1/29/23 23:27, Daniel P. Berrangé wrote:
> On Sun, Jan 29, 2023 at 05:38:08PM -0500, Emilio Cota wrote:
>> Since this is a correctness issue, I think we should ship with qtree
>> and use it when configuring with glib <2.76.0. For later glib versions
>> we would just use gtree, e.g. via typedef + inline functions.
>>
>> Once the minimum glib required by the configure script is >= 2.76.0,
>> then we'd remove qtree.
>>
>> If that sounds like a good plan, I can send a v2.
> 
> I'm fine with it, but be good to have an opinion here from the TCG
> subsystem maintainers, CC'ing them

I agree the correctness issue wants the fix now,
and typedef + inlines sounds like a good way moving forward.


r~
Emilio Cota Feb. 5, 2023, 4:41 p.m. UTC | #8
On Mon, Jan 30, 2023 at 09:09:47 -1000, Richard Henderson wrote:
> On 1/29/23 23:27, Daniel P. Berrangé wrote:
> > On Sun, Jan 29, 2023 at 05:38:08PM -0500, Emilio Cota wrote:
> > > Since this is a correctness issue, I think we should ship with qtree
> > > and use it when configuring with glib <2.76.0. For later glib versions
> > > we would just use gtree, e.g. via typedef + inline functions.
> > > 
> > > Once the minimum glib required by the configure script is >= 2.76.0,
> > > then we'd remove qtree.
> > > 
> > > If that sounds like a good plan, I can send a v2.
> > 
> > I'm fine with it, but be good to have an opinion here from the TCG
> > subsystem maintainers, CC'ing them
> 
> I agree the correctness issue wants the fix now,
> and typedef + inlines sounds like a good way moving forward.

Thanks, just sent a v2 implementing the above.

		Emilio
diff mbox series

Patch

diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
index b3d6529ae2..e6370ddc71 100644
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -19,6 +19,7 @@ 
 
 #include "qemu/osdep.h"
 #include "qemu/interval-tree.h"
+#include "qemu/qtree.h"
 #include "exec/cputlb.h"
 #include "exec/log.h"
 #include "exec/exec-all.h"
@@ -313,7 +314,7 @@  struct page_entry {
  * See also: page_collection_lock().
  */
 struct page_collection {
-    GTree *tree;
+    QTree *tree;
     struct page_entry *max;
 };
 
@@ -466,7 +467,7 @@  static bool page_trylock_add(struct page_collection *set, tb_page_addr_t addr)
     struct page_entry *pe;
     PageDesc *pd;
 
-    pe = g_tree_lookup(set->tree, &index);
+    pe = q_tree_lookup(set->tree, &index);
     if (pe) {
         return false;
     }
@@ -477,7 +478,7 @@  static bool page_trylock_add(struct page_collection *set, tb_page_addr_t addr)
     }
 
     pe = page_entry_new(pd, index);
-    g_tree_insert(set->tree, &pe->index, pe);
+    q_tree_insert(set->tree, &pe->index, pe);
 
     /*
      * If this is either (1) the first insertion or (2) a page whose index
@@ -524,13 +525,13 @@  static struct page_collection *page_collection_lock(tb_page_addr_t start,
     end   >>= TARGET_PAGE_BITS;
     g_assert(start <= end);
 
-    set->tree = g_tree_new_full(tb_page_addr_cmp, NULL, NULL,
+    set->tree = q_tree_new_full(tb_page_addr_cmp, NULL, NULL,
                                 page_entry_destroy);
     set->max = NULL;
     assert_no_pages_locked();
 
  retry:
-    g_tree_foreach(set->tree, page_entry_lock, NULL);
+    q_tree_foreach(set->tree, page_entry_lock, NULL);
 
     for (index = start; index <= end; index++) {
         TranslationBlock *tb;
@@ -541,7 +542,7 @@  static struct page_collection *page_collection_lock(tb_page_addr_t start,
             continue;
         }
         if (page_trylock_add(set, index << TARGET_PAGE_BITS)) {
-            g_tree_foreach(set->tree, page_entry_unlock, NULL);
+            q_tree_foreach(set->tree, page_entry_unlock, NULL);
             goto retry;
         }
         assert_page_locked(pd);
@@ -550,7 +551,7 @@  static struct page_collection *page_collection_lock(tb_page_addr_t start,
                 (tb_page_addr1(tb) != -1 &&
                  page_trylock_add(set, tb_page_addr1(tb)))) {
                 /* drop all locks, and reacquire in order */
-                g_tree_foreach(set->tree, page_entry_unlock, NULL);
+                q_tree_foreach(set->tree, page_entry_unlock, NULL);
                 goto retry;
             }
         }
@@ -561,7 +562,7 @@  static struct page_collection *page_collection_lock(tb_page_addr_t start,
 static void page_collection_unlock(struct page_collection *set)
 {
     /* entries are unlocked and freed via page_entry_destroy */
-    g_tree_destroy(set->tree);
+    q_tree_destroy(set->tree);
     g_free(set);
 }
 
diff --git a/tcg/region.c b/tcg/region.c
index 88d6bb273f..bef4c4756f 100644
--- a/tcg/region.c
+++ b/tcg/region.c
@@ -28,6 +28,7 @@ 
 #include "qemu/mprotect.h"
 #include "qemu/memalign.h"
 #include "qemu/cacheinfo.h"
+#include "qemu/qtree.h"
 #include "qapi/error.h"
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
@@ -36,7 +37,7 @@ 
 
 struct tcg_region_tree {
     QemuMutex lock;
-    GTree *tree;
+    QTree *tree;
     /* padding to avoid false sharing is computed at run-time */
 };
 
@@ -163,7 +164,7 @@  static void tcg_region_trees_init(void)
         struct tcg_region_tree *rt = region_trees + i * tree_size;
 
         qemu_mutex_init(&rt->lock);
-        rt->tree = g_tree_new_full(tb_tc_cmp, NULL, NULL, tb_destroy);
+        rt->tree = q_tree_new_full(tb_tc_cmp, NULL, NULL, tb_destroy);
     }
 }
 
@@ -202,7 +203,7 @@  void tcg_tb_insert(TranslationBlock *tb)
 
     g_assert(rt != NULL);
     qemu_mutex_lock(&rt->lock);
-    g_tree_insert(rt->tree, &tb->tc, tb);
+    q_tree_insert(rt->tree, &tb->tc, tb);
     qemu_mutex_unlock(&rt->lock);
 }
 
@@ -212,7 +213,7 @@  void tcg_tb_remove(TranslationBlock *tb)
 
     g_assert(rt != NULL);
     qemu_mutex_lock(&rt->lock);
-    g_tree_remove(rt->tree, &tb->tc);
+    q_tree_remove(rt->tree, &tb->tc);
     qemu_mutex_unlock(&rt->lock);
 }
 
@@ -232,7 +233,7 @@  TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr)
     }
 
     qemu_mutex_lock(&rt->lock);
-    tb = g_tree_lookup(rt->tree, &s);
+    tb = q_tree_lookup(rt->tree, &s);
     qemu_mutex_unlock(&rt->lock);
     return tb;
 }
@@ -267,7 +268,7 @@  void tcg_tb_foreach(GTraverseFunc func, gpointer user_data)
     for (i = 0; i < region.n; i++) {
         struct tcg_region_tree *rt = region_trees + i * tree_size;
 
-        g_tree_foreach(rt->tree, func, user_data);
+        q_tree_foreach(rt->tree, func, user_data);
     }
     tcg_region_tree_unlock_all();
 }
@@ -281,7 +282,7 @@  size_t tcg_nb_tbs(void)
     for (i = 0; i < region.n; i++) {
         struct tcg_region_tree *rt = region_trees + i * tree_size;
 
-        nb_tbs += g_tree_nnodes(rt->tree);
+        nb_tbs += q_tree_nnodes(rt->tree);
     }
     tcg_region_tree_unlock_all();
     return nb_tbs;
@@ -296,8 +297,8 @@  static void tcg_region_tree_reset_all(void)
         struct tcg_region_tree *rt = region_trees + i * tree_size;
 
         /* Increment the refcount first so that destroy acts as a reset */
-        g_tree_ref(rt->tree);
-        g_tree_destroy(rt->tree);
+        q_tree_ref(rt->tree);
+        q_tree_destroy(rt->tree);
     }
     tcg_region_tree_unlock_all();
 }