diff mbox

[v4,11/12] cpu-exec-common: Introduce async_safe_run_on_cpu()

Message ID 20160715185726.10181-12-sergey.fedorov@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

sergey.fedorov@linaro.org July 15, 2016, 6:57 p.m. UTC
From: Sergey Fedorov <serge.fdrv@gmail.com>

This patch is based on the ideas found in work of KONRAD Frederic [1],
Alex Bennée [2], and Alvise Rigo [3].

This mechanism allows to perform an operation safely in a quiescent
state. Quiescent state means: (1) no vCPU is running and (2) BQL in
system-mode or 'exclusive_lock' in user-mode emulation is held while
performing the operation. This functionality is required e.g. for
performing translation buffer flush safely in multi-threaded user-mode
emulation.

The existing CPU work queue is used to schedule such safe operations. A
new 'safe' flag is added into struct qemu_work_item to designate the
special requirements of the safe work. An operation in a quiescent sate
can be scheduled by using async_safe_run_on_cpu() function which is
actually the same as sync_run_on_cpu() except that it marks the queued
work item with the 'safe' flag set to true. Given this flag set
queue_work_on_cpu() atomically increments 'safe_work_pending' global
counter and kicks all the CPUs instead of just the target CPU as in case
of normal CPU work. This allows to force other CPUs to exit their
execution loops and wait in wait_safe_cpu_work() function for the safe
work to finish. When a CPU drains its work queue, if it encounters a
work item marked as safe, it first waits for other CPUs to exit their
execution loops, then called the work item function, and finally
decrements 'safe_work_pending' counter with signalling other CPUs to let
them continue execution as soon as all pending safe work items have been
processed. The 'tcg_pending_threads' protected by 'exclusive_lock' in
user-mode or by 'qemu_global_mutex' in system-mode emulation is used to
determine if there is any CPU run and wait for it to exit the execution
loop. The fairness of all the CPU work queues is ensured by draining all
the pending safe work items before any CPU can run.

[1] http://lists.nongnu.org/archive/html/qemu-devel/2015-08/msg01128.html
[2] http://lists.nongnu.org/archive/html/qemu-devel/2016-04/msg02531.html
[3] http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04792.html

Signed-off-by: Sergey Fedorov <serge.fdrv@gmail.com>
Signed-off-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

---
Changes in v3:
 - bsd-user supported
Changes in v2:
 - some conditional varialbes moved to cpu-exec-common.c
 - documentation commend for new public API added
---
 bsd-user/main.c         |  3 ++-
 cpu-exec-common.c       | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 cpus.c                  | 20 ++++++++++++++++++++
 include/exec/exec-all.h | 14 ++++++++++++++
 include/qom/cpu.h       | 14 ++++++++++++++
 linux-user/main.c       | 13 +++++++------
 6 files changed, 105 insertions(+), 8 deletions(-)

Comments

Alex Bennée July 26, 2016, 3:37 p.m. UTC | #1
Sergey Fedorov <sergey.fedorov@linaro.org> writes:

> From: Sergey Fedorov <serge.fdrv@gmail.com>
>
> This patch is based on the ideas found in work of KONRAD Frederic [1],
> Alex Bennée [2], and Alvise Rigo [3].
>
> This mechanism allows to perform an operation safely in a quiescent
> state. Quiescent state means: (1) no vCPU is running and (2) BQL in
> system-mode or 'exclusive_lock' in user-mode emulation is held while
> performing the operation. This functionality is required e.g. for
> performing translation buffer flush safely in multi-threaded user-mode
> emulation.
<snip>
>
>  QemuCond qemu_work_cond;
> +QemuCond qemu_safe_work_cond;
> +QemuCond qemu_exclusive_cond;
> +
> +static int safe_work_pending;
> +
> +void wait_safe_cpu_work(void)
> +{
> +    while (atomic_mb_read(&safe_work_pending) > 0) {
> +        qemu_cond_wait(&qemu_safe_work_cond, qemu_get_cpu_work_mutex());
> +    }
> +}

Testing on system mode I got a dead-lock with one thread sitting here (I
assume after doing its work) and the other threads all waiting on the
exclusive condition. I think we need this:

    void wait_safe_cpu_work(void)
    {
        while (atomic_mb_read(&safe_work_pending) > 0) {
            /*
             * If there is pending safe work and no pending threads we
             * need to signal another thread to start its work.
             */
            if (tcg_pending_threads == 0) {
                qemu_cond_signal(&qemu_exclusive_cond);
            }
            qemu_cond_wait(&qemu_safe_work_cond, qemu_get_cpu_work_mutex());
        }
    }

>
>  static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
>  {
> @@ -91,9 +103,18 @@ static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
>      cpu->queued_work_last = wi;
>      wi->next = NULL;
>      wi->done = false;
> +    if (wi->safe) {
> +        atomic_inc(&safe_work_pending);
> +    }
>      qemu_mutex_unlock(&cpu->work_mutex);
>
> -    qemu_cpu_kick(cpu);
> +    if (!wi->safe) {
> +        qemu_cpu_kick(cpu);
> +    } else {
> +        CPU_FOREACH(cpu) {
> +            qemu_cpu_kick(cpu);
> +        }
> +    }
>  }
>
>  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> @@ -108,6 +129,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>      wi.func = func;
>      wi.data = data;
>      wi.free = false;
> +    wi.safe = false;
>
>      queue_work_on_cpu(cpu, &wi);
>      while (!atomic_mb_read(&wi.done)) {
> @@ -131,6 +153,20 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>      wi->func = func;
>      wi->data = data;
>      wi->free = true;
> +    wi->safe = false;
> +
> +    queue_work_on_cpu(cpu, wi);
> +}
> +
> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> +{
> +    struct qemu_work_item *wi;
> +
> +    wi = g_malloc0(sizeof(struct qemu_work_item));
> +    wi->func = func;
> +    wi->data = data;
> +    wi->free = true;
> +    wi->safe = true;
>
>      queue_work_on_cpu(cpu, wi);
>  }
> @@ -150,9 +186,20 @@ void process_queued_cpu_work(CPUState *cpu)
>          if (!cpu->queued_work_first) {
>              cpu->queued_work_last = NULL;
>          }
> +        if (wi->safe) {
> +            while (tcg_pending_threads) {
> +                qemu_cond_wait(&qemu_exclusive_cond,
> +                               qemu_get_cpu_work_mutex());
> +            }
> +        }
>          qemu_mutex_unlock(&cpu->work_mutex);
>          wi->func(cpu, wi->data);
>          qemu_mutex_lock(&cpu->work_mutex);
> +        if (wi->safe) {
> +            if (!atomic_dec_fetch(&safe_work_pending)) {
> +                qemu_cond_broadcast(&qemu_safe_work_cond);
> +            }
> +        }
>          if (wi->free) {
>              g_free(wi);
>          } else {
> diff --git a/cpus.c b/cpus.c
> index 282d7e399902..b7122043f650 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -903,6 +903,8 @@ void qemu_init_cpu_loop(void)
>      qemu_cond_init(&qemu_cpu_cond);
>      qemu_cond_init(&qemu_pause_cond);
>      qemu_cond_init(&qemu_work_cond);
> +    qemu_cond_init(&qemu_safe_work_cond);
> +    qemu_cond_init(&qemu_exclusive_cond);
>      qemu_cond_init(&qemu_io_proceeded_cond);
>      qemu_mutex_init(&qemu_global_mutex);
>
> @@ -926,6 +928,20 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)
>  {
>  }
>
> +/* called with qemu_global_mutex held */
> +static inline void tcg_cpu_exec_start(CPUState *cpu)
> +{
> +    tcg_pending_threads++;
> +}
> +
> +/* called with qemu_global_mutex held */
> +static inline void tcg_cpu_exec_end(CPUState *cpu)
> +{
> +    if (--tcg_pending_threads) {
> +        qemu_cond_broadcast(&qemu_exclusive_cond);

Is the broadcast a bit excessive here. Shouldn't we just wake up one
thread with a qemu_cond_signal?

> +    }
> +}
> +
>  static void qemu_wait_io_event_common(CPUState *cpu)
>  {
>      if (cpu->stop) {
> @@ -950,6 +966,8 @@ static void qemu_tcg_wait_io_event(CPUState *cpu)
>      CPU_FOREACH(cpu) {
>          qemu_wait_io_event_common(cpu);
>      }
> +
> +    wait_safe_cpu_work();
>  }
>
>  static void qemu_kvm_wait_io_event(CPUState *cpu)
> @@ -1485,7 +1503,9 @@ static void tcg_exec_all(void)
>                            (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
>
>          if (cpu_can_run(cpu)) {
> +            tcg_cpu_exec_start(cpu);
>              r = tcg_cpu_exec(cpu);
> +            tcg_cpu_exec_end(cpu);
>              if (r == EXCP_DEBUG) {
>                  cpu_handle_guest_debug(cpu);
>                  break;
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 8d5c7dbcf5a9..1f0272beb362 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -405,12 +405,22 @@ extern int singlestep;
>
>  /* cpu-exec.c, accessed with atomic_mb_read/atomic_mb_set */
>  extern CPUState *tcg_current_cpu;
> +extern int tcg_pending_threads;
>  extern bool exit_request;
>
>  /**
>   * qemu_work_cond - condition to wait for CPU work items completion
>   */
>  extern QemuCond qemu_work_cond;
> +/**
> + * qemu_safe_work_cond - condition to wait for safe CPU work items completion
> + */
> +extern QemuCond qemu_safe_work_cond;
> +/**
> + * qemu_exclusive_cond - condition to wait for all TCG threads to be out of
> + *                       guest code execution loop
> + */
> +extern QemuCond qemu_exclusive_cond;
>
>  /**
>   * qemu_get_cpu_work_mutex() - get the mutex which protects CPU work execution
> @@ -423,5 +433,9 @@ QemuMutex *qemu_get_cpu_work_mutex(void);
>   * @cpu: The CPU which work queue to process.
>   */
>  void process_queued_cpu_work(CPUState *cpu);
> +/**
> + * wait_safe_cpu_work() - wait until all safe CPU work items has processed
> + */
> +void wait_safe_cpu_work(void);
>
>  #endif
> diff --git a/include/qom/cpu.h b/include/qom/cpu.h
> index c19a673f9f68..ab67bf2ba19f 100644
> --- a/include/qom/cpu.h
> +++ b/include/qom/cpu.h
> @@ -238,6 +238,7 @@ struct qemu_work_item {
>      void *data;
>      int done;
>      bool free;
> +    bool safe;
>  };
>
>  /**
> @@ -632,6 +633,19 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
>  void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
>
>  /**
> + * async_safe_run_on_cpu:
> + * @cpu: The vCPU to run on.
> + * @func: The function to be executed.
> + * @data: Data to pass to the function.
> + *
> + * Schedules the function @func for execution on the vCPU @cpu asynchronously
> + * and in quiescent state. Quiescent state means: (1) all other vCPUs are
> + * halted and (2) #qemu_global_mutex (a.k.a. BQL) in system-mode or
> + * #exclusive_lock in user-mode emulation is held while @func is executing.
> + */
> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
> +
> +/**
>   * qemu_get_cpu:
>   * @index: The CPUState@cpu_index value of the CPU to obtain.
>   *
> diff --git a/linux-user/main.c b/linux-user/main.c
> index fce61d5a35fc..d0ff5f9976e5 100644
> --- a/linux-user/main.c
> +++ b/linux-user/main.c
> @@ -110,18 +110,17 @@ int cpu_get_pic_interrupt(CPUX86State *env)
>     which requires quite a lot of per host/target work.  */
>  static QemuMutex cpu_list_mutex;
>  static QemuMutex exclusive_lock;
> -static QemuCond exclusive_cond;
>  static QemuCond exclusive_resume;
>  static bool exclusive_pending;
> -static int tcg_pending_threads;
>
>  void qemu_init_cpu_loop(void)
>  {
>      qemu_mutex_init(&cpu_list_mutex);
>      qemu_mutex_init(&exclusive_lock);
> -    qemu_cond_init(&exclusive_cond);
>      qemu_cond_init(&exclusive_resume);
>      qemu_cond_init(&qemu_work_cond);
> +    qemu_cond_init(&qemu_safe_work_cond);
> +    qemu_cond_init(&qemu_exclusive_cond);
>  }
>
>  /* Make sure everything is in a consistent state for calling fork().  */
> @@ -148,9 +147,10 @@ void fork_end(int child)
>          exclusive_pending = false;
>          qemu_mutex_init(&exclusive_lock);
>          qemu_mutex_init(&cpu_list_mutex);
> -        qemu_cond_init(&exclusive_cond);
>          qemu_cond_init(&exclusive_resume);
>          qemu_cond_init(&qemu_work_cond);
> +        qemu_cond_init(&qemu_safe_work_cond);
> +        qemu_cond_init(&qemu_exclusive_cond);
>          qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
>          gdbserver_fork(thread_cpu);
>      } else {
> @@ -190,7 +190,7 @@ static inline void start_exclusive(void)
>          }
>      }
>      while (tcg_pending_threads) {
> -        qemu_cond_wait(&exclusive_cond, &exclusive_lock);
> +        qemu_cond_wait(&qemu_exclusive_cond, &exclusive_lock);
>      }
>  }
>
> @@ -219,10 +219,11 @@ static inline void cpu_exec_end(CPUState *cpu)
>      cpu->running = false;
>      tcg_pending_threads--;
>      if (!tcg_pending_threads) {
> -        qemu_cond_signal(&exclusive_cond);
> +        qemu_cond_broadcast(&qemu_exclusive_cond);
>      }
>      exclusive_idle();
>      process_queued_cpu_work(cpu);
> +    wait_safe_cpu_work();
>      qemu_mutex_unlock(&exclusive_lock);
>  }


--
Alex Bennée
diff mbox

Patch

diff --git a/bsd-user/main.c b/bsd-user/main.c
index f738dd64d691..5433bca0fca6 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -66,9 +66,10 @@  int cpu_get_pic_interrupt(CPUX86State *env)
 void qemu_init_cpu_loop(void)
 {
     /* We need to do this becuase process_queued_cpu_work() calls
-     * qemu_cond_broadcast() on it
+     * qemu_cond_broadcast() on them
      */
     qemu_cond_init(&qemu_work_cond);
+    qemu_cond_init(&qemu_safe_work_cond);
 }
 
 QemuMutex *qemu_get_cpu_work_mutex(void)
diff --git a/cpu-exec-common.c b/cpu-exec-common.c
index a233f0124559..6f278d6d3b70 100644
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -25,6 +25,7 @@ 
 
 bool exit_request;
 CPUState *tcg_current_cpu;
+int tcg_pending_threads;
 
 /* exit the current TB, but without causing any exception to be raised */
 void cpu_loop_exit_noexc(CPUState *cpu)
@@ -79,6 +80,17 @@  void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
 }
 
 QemuCond qemu_work_cond;
+QemuCond qemu_safe_work_cond;
+QemuCond qemu_exclusive_cond;
+
+static int safe_work_pending;
+
+void wait_safe_cpu_work(void)
+{
+    while (atomic_mb_read(&safe_work_pending) > 0) {
+        qemu_cond_wait(&qemu_safe_work_cond, qemu_get_cpu_work_mutex());
+    }
+}
 
 static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
 {
@@ -91,9 +103,18 @@  static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
     cpu->queued_work_last = wi;
     wi->next = NULL;
     wi->done = false;
+    if (wi->safe) {
+        atomic_inc(&safe_work_pending);
+    }
     qemu_mutex_unlock(&cpu->work_mutex);
 
-    qemu_cpu_kick(cpu);
+    if (!wi->safe) {
+        qemu_cpu_kick(cpu);
+    } else {
+        CPU_FOREACH(cpu) {
+            qemu_cpu_kick(cpu);
+        }
+    }
 }
 
 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
@@ -108,6 +129,7 @@  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
     wi.func = func;
     wi.data = data;
     wi.free = false;
+    wi.safe = false;
 
     queue_work_on_cpu(cpu, &wi);
     while (!atomic_mb_read(&wi.done)) {
@@ -131,6 +153,20 @@  void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
     wi->func = func;
     wi->data = data;
     wi->free = true;
+    wi->safe = false;
+
+    queue_work_on_cpu(cpu, wi);
+}
+
+void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
+{
+    struct qemu_work_item *wi;
+
+    wi = g_malloc0(sizeof(struct qemu_work_item));
+    wi->func = func;
+    wi->data = data;
+    wi->free = true;
+    wi->safe = true;
 
     queue_work_on_cpu(cpu, wi);
 }
@@ -150,9 +186,20 @@  void process_queued_cpu_work(CPUState *cpu)
         if (!cpu->queued_work_first) {
             cpu->queued_work_last = NULL;
         }
+        if (wi->safe) {
+            while (tcg_pending_threads) {
+                qemu_cond_wait(&qemu_exclusive_cond,
+                               qemu_get_cpu_work_mutex());
+            }
+        }
         qemu_mutex_unlock(&cpu->work_mutex);
         wi->func(cpu, wi->data);
         qemu_mutex_lock(&cpu->work_mutex);
+        if (wi->safe) {
+            if (!atomic_dec_fetch(&safe_work_pending)) {
+                qemu_cond_broadcast(&qemu_safe_work_cond);
+            }
+        }
         if (wi->free) {
             g_free(wi);
         } else {
diff --git a/cpus.c b/cpus.c
index 282d7e399902..b7122043f650 100644
--- a/cpus.c
+++ b/cpus.c
@@ -903,6 +903,8 @@  void qemu_init_cpu_loop(void)
     qemu_cond_init(&qemu_cpu_cond);
     qemu_cond_init(&qemu_pause_cond);
     qemu_cond_init(&qemu_work_cond);
+    qemu_cond_init(&qemu_safe_work_cond);
+    qemu_cond_init(&qemu_exclusive_cond);
     qemu_cond_init(&qemu_io_proceeded_cond);
     qemu_mutex_init(&qemu_global_mutex);
 
@@ -926,6 +928,20 @@  static void qemu_tcg_destroy_vcpu(CPUState *cpu)
 {
 }
 
+/* called with qemu_global_mutex held */
+static inline void tcg_cpu_exec_start(CPUState *cpu)
+{
+    tcg_pending_threads++;
+}
+
+/* called with qemu_global_mutex held */
+static inline void tcg_cpu_exec_end(CPUState *cpu)
+{
+    if (--tcg_pending_threads) {
+        qemu_cond_broadcast(&qemu_exclusive_cond);
+    }
+}
+
 static void qemu_wait_io_event_common(CPUState *cpu)
 {
     if (cpu->stop) {
@@ -950,6 +966,8 @@  static void qemu_tcg_wait_io_event(CPUState *cpu)
     CPU_FOREACH(cpu) {
         qemu_wait_io_event_common(cpu);
     }
+
+    wait_safe_cpu_work();
 }
 
 static void qemu_kvm_wait_io_event(CPUState *cpu)
@@ -1485,7 +1503,9 @@  static void tcg_exec_all(void)
                           (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
 
         if (cpu_can_run(cpu)) {
+            tcg_cpu_exec_start(cpu);
             r = tcg_cpu_exec(cpu);
+            tcg_cpu_exec_end(cpu);
             if (r == EXCP_DEBUG) {
                 cpu_handle_guest_debug(cpu);
                 break;
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 8d5c7dbcf5a9..1f0272beb362 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -405,12 +405,22 @@  extern int singlestep;
 
 /* cpu-exec.c, accessed with atomic_mb_read/atomic_mb_set */
 extern CPUState *tcg_current_cpu;
+extern int tcg_pending_threads;
 extern bool exit_request;
 
 /**
  * qemu_work_cond - condition to wait for CPU work items completion
  */
 extern QemuCond qemu_work_cond;
+/**
+ * qemu_safe_work_cond - condition to wait for safe CPU work items completion
+ */
+extern QemuCond qemu_safe_work_cond;
+/**
+ * qemu_exclusive_cond - condition to wait for all TCG threads to be out of
+ *                       guest code execution loop
+ */
+extern QemuCond qemu_exclusive_cond;
 
 /**
  * qemu_get_cpu_work_mutex() - get the mutex which protects CPU work execution
@@ -423,5 +433,9 @@  QemuMutex *qemu_get_cpu_work_mutex(void);
  * @cpu: The CPU which work queue to process.
  */
 void process_queued_cpu_work(CPUState *cpu);
+/**
+ * wait_safe_cpu_work() - wait until all safe CPU work items has processed
+ */
+void wait_safe_cpu_work(void);
 
 #endif
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index c19a673f9f68..ab67bf2ba19f 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -238,6 +238,7 @@  struct qemu_work_item {
     void *data;
     int done;
     bool free;
+    bool safe;
 };
 
 /**
@@ -632,6 +633,19 @@  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
 void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
 
 /**
+ * async_safe_run_on_cpu:
+ * @cpu: The vCPU to run on.
+ * @func: The function to be executed.
+ * @data: Data to pass to the function.
+ *
+ * Schedules the function @func for execution on the vCPU @cpu asynchronously
+ * and in quiescent state. Quiescent state means: (1) all other vCPUs are
+ * halted and (2) #qemu_global_mutex (a.k.a. BQL) in system-mode or
+ * #exclusive_lock in user-mode emulation is held while @func is executing.
+ */
+void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
+
+/**
  * qemu_get_cpu:
  * @index: The CPUState@cpu_index value of the CPU to obtain.
  *
diff --git a/linux-user/main.c b/linux-user/main.c
index fce61d5a35fc..d0ff5f9976e5 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -110,18 +110,17 @@  int cpu_get_pic_interrupt(CPUX86State *env)
    which requires quite a lot of per host/target work.  */
 static QemuMutex cpu_list_mutex;
 static QemuMutex exclusive_lock;
-static QemuCond exclusive_cond;
 static QemuCond exclusive_resume;
 static bool exclusive_pending;
-static int tcg_pending_threads;
 
 void qemu_init_cpu_loop(void)
 {
     qemu_mutex_init(&cpu_list_mutex);
     qemu_mutex_init(&exclusive_lock);
-    qemu_cond_init(&exclusive_cond);
     qemu_cond_init(&exclusive_resume);
     qemu_cond_init(&qemu_work_cond);
+    qemu_cond_init(&qemu_safe_work_cond);
+    qemu_cond_init(&qemu_exclusive_cond);
 }
 
 /* Make sure everything is in a consistent state for calling fork().  */
@@ -148,9 +147,10 @@  void fork_end(int child)
         exclusive_pending = false;
         qemu_mutex_init(&exclusive_lock);
         qemu_mutex_init(&cpu_list_mutex);
-        qemu_cond_init(&exclusive_cond);
         qemu_cond_init(&exclusive_resume);
         qemu_cond_init(&qemu_work_cond);
+        qemu_cond_init(&qemu_safe_work_cond);
+        qemu_cond_init(&qemu_exclusive_cond);
         qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
         gdbserver_fork(thread_cpu);
     } else {
@@ -190,7 +190,7 @@  static inline void start_exclusive(void)
         }
     }
     while (tcg_pending_threads) {
-        qemu_cond_wait(&exclusive_cond, &exclusive_lock);
+        qemu_cond_wait(&qemu_exclusive_cond, &exclusive_lock);
     }
 }
 
@@ -219,10 +219,11 @@  static inline void cpu_exec_end(CPUState *cpu)
     cpu->running = false;
     tcg_pending_threads--;
     if (!tcg_pending_threads) {
-        qemu_cond_signal(&exclusive_cond);
+        qemu_cond_broadcast(&qemu_exclusive_cond);
     }
     exclusive_idle();
     process_queued_cpu_work(cpu);
+    wait_safe_cpu_work();
     qemu_mutex_unlock(&exclusive_lock);
 }