diff mbox series

[3/3] coroutine: add x86 specific coroutine backend

Message ID 20190311123507.24867-4-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series coroutine: add x86 specific coroutine backend | expand

Commit Message

Paolo Bonzini March 11, 2019, 12:35 p.m. UTC
This backend is faster (100ns vs 150ns per switch on my laptop), but
especially it will be possible to add CET support to it in 4.1.  In
the meanwhile, it is nice to have it as an experimental alternative.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 configure                        |   8 ++
 scripts/qemugdb/coroutine.py     |   5 +-
 scripts/qemugdb/coroutine_x86.py |  21 +++
 util/coroutine-x86.c             | 213 +++++++++++++++++++++++++++++++
 4 files changed, 245 insertions(+), 2 deletions(-)
 create mode 100644 scripts/qemugdb/coroutine_x86.py
 create mode 100644 util/coroutine-x86.c

Comments

Kevin Wolf March 13, 2019, 2:14 p.m. UTC | #1
Am 11.03.2019 um 13:35 hat Paolo Bonzini geschrieben:
> This backend is faster (100ns vs 150ns per switch on my laptop), but
> especially it will be possible to add CET support to it in 4.1.  In
> the meanwhile, it is nice to have it as an experimental alternative.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Creating a qcow2 image fails for me with a General Protection Fault.

Looks like this is because of a 'movaps %xmm0,(%rsp)' with an unaligned
stack pointer (0x7fffec5f8b78). We need to start with rsp 8 bytes lower
to comply with the calling convention.

> --- /dev/null
> +++ b/util/coroutine-x86.c
> @@ -0,0 +1,213 @@
> +/*
> + * x86-specific coroutine initialization code
> + *
> + * Copyright (C) 2006  Anthony Liguori <anthony@codemonkey.ws>
> + * Copyright (C) 2011  Kevin Wolf <kwolf@redhat.com>
> + * Copyright (C) 2019  Paolo Bonzini <pbonzini@redhat.com>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.0 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
> +#ifdef _FORTIFY_SOURCE
> +#undef _FORTIFY_SOURCE
> +#endif

You don't use setjmp/longjmp, so is this really necessary?

> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "qemu/coroutine_int.h"
> +
> +#ifdef CONFIG_VALGRIND_H
> +#include <valgrind/valgrind.h>
> +#endif
> +
> +#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer)
> +#ifdef CONFIG_ASAN_IFACE_FIBER
> +#define CONFIG_ASAN 1
> +#include <sanitizer/asan_interface.h>
> +#endif
> +#endif
> +
> +typedef struct {
> +    Coroutine base;
> +    void *stack;
> +    size_t stack_size;
> +    void *sp;
> +
> +#ifdef CONFIG_VALGRIND_H
> +    unsigned int valgrind_stack_id;
> +#endif
> +} CoroutineX86;
> +
> +/**
> + * Per-thread coroutine bookkeeping
> + */
> +static __thread CoroutineX86 leader;
> +static __thread Coroutine *current;
> +
> +static void finish_switch_fiber(void *fake_stack_save)
> +{
> +#ifdef CONFIG_ASAN
> +    const void *bottom_old;
> +    size_t size_old;
> +
> +    __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old);
> +
> +    if (!leader.stack) {
> +        leader.stack = (void *)bottom_old;
> +        leader.stack_size = size_old;
> +    }
> +#endif
> +}
> +
> +static void start_switch_fiber(void **fake_stack_save,
> +                               const void *bottom, size_t size)
> +{
> +#ifdef CONFIG_ASAN
> +    __sanitizer_start_switch_fiber(fake_stack_save, bottom, size);
> +#endif
> +}

These two functions are duplicated without changes between ucontext and
x86, and they aren't really backend-specific. Should they be moved to a
place where both backends can share them, like util/qemu-coroutine.c?

> +/* On entry to a coroutine, rax is "value" and rsi is the coroutine itself.  */

rax is "action" (not "value"), and the coroutine is rdi (not rsi).

> +#define CO_SWITCH(from, to, action, jump) ({                                            \
> +    int ret = action;                                                                   \
> +    void *from_ = from;                                                                 \
> +    void *to_ = to;                                                                     \
> +    asm volatile(                                                                       \
> +        ".cfi_remember_state\n"                                                         \
> +        "pushq %%rbp\n"                     /* save scratch register on source stack */ \
> +        ".cfi_adjust_cfa_offset 8\n"                                                    \
> +        ".cfi_rel_offset %%rbp, 0\n"                                                    \
> +        "call 1f\n"                         /* switch continues at label 1 */           \
> +        ".cfi_adjust_cfa_offset 8\n"                                                    \
> +        "jmp 2f\n"                          /* switch back continues at label 2 */      \
> +        "1: movq (%%rsp), %%rbp\n"          /* save source IP for debugging */          \
> +        "movq %%rsp, %c[sp](%[FROM])\n"     /* save source SP */                        \
> +        "movq %c[sp](%[TO]), %%rsp\n"       /* load destination SP */                   \
> +        jump "\n"                           /* coroutine switch */                      \
> +        "2:"                                                                            \
> +        ".cfi_adjust_cfa_offset -8\n"                                                   \
> +        "popq %%rbp\n"                                                                  \
> +        ".cfi_adjust_cfa_offset -8\n"                                                   \
> +        ".cfi_restore_state\n"                                                          \
> +        : "+a" (ret), [FROM] "+b" (from_), [TO] "+D" (to_)                              \
> +        : [sp] "i" (offsetof(CoroutineX86, sp))                                         \
> +        : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",    \
> +          "memory");                                                                    \
> +    ret; \
> +})
> +
> +static void __attribute__((__used__)) coroutine_trampoline(void *arg)
> +{
> +    CoroutineX86 *self = arg;
> +    Coroutine *co = &self->base;
> +
> +    finish_switch_fiber(NULL);
> +
> +    while (true) {
> +        qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
> +        co->entry(co->entry_arg);

Okay, inverse order because you have a fake entry on creation below...

> +    }
> +}
> +
> +Coroutine *qemu_coroutine_new(void)
> +{
> +    CoroutineX86 *co;
> +    void *fake_stack_save = NULL;
> +
> +    co = g_malloc0(sizeof(*co));
> +    co->stack_size = COROUTINE_STACK_SIZE;
> +    co->stack = qemu_alloc_stack(&co->stack_size);
> +    co->sp = co->stack + co->stack_size;
> +
> +#ifdef CONFIG_VALGRIND_H
> +    co->valgrind_stack_id =
> +        VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size);
> +#endif
> +
> +    /* Immediately enter the coroutine once to pass it its address as the argument */
> +    co->base.caller = qemu_coroutine_self();
> +    start_switch_fiber(&fake_stack_save, co->stack, co->stack_size);
> +    CO_SWITCH(current, co, 0, "jmp coroutine_trampoline");
> +    finish_switch_fiber(fake_stack_save);
> +    co->base.caller = NULL;

...but why is this necessary? CO_SWITCH() always passes the coroutine in
rdi, not just here, so wouldn't the first real call do this, too?

Ah, I see, because of the 'jmp coroutine_trampoline'. But the comment is
really misleading. Actually, I think the code would become simpler if
you just put the address of coroutine_trampoline on the initial stack
and then have 'ret' unconditionally (see below for a quick attempt at
something to squash in).

> +    return &co->base;
> +}
> +
> +#ifdef CONFIG_VALGRIND_H
> +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__)
> +/* Work around an unused variable in the valgrind.h macro... */
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
> +#endif
> +static inline void valgrind_stack_deregister(CoroutineX86 *co)
> +{
> +    VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id);
> +}
> +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__)
> +#pragma GCC diagnostic pop
> +#endif
> +#endif

Another candidate for sharing instead of duplicating? (You could
trivially pass the valgrind_stack_id instead of the CoroutineX86
object.)

Kevin


diff --git a/util/coroutine-x86.c b/util/coroutine-x86.c
index b7649e7ae1..ff78c298c9 100644
--- a/util/coroutine-x86.c
+++ b/util/coroutine-x86.c
@@ -79,7 +79,7 @@ static void start_switch_fiber(void **fake_stack_save,
 }

 /* On entry to a coroutine, rax is "value" and rsi is the coroutine itself.  */
-#define CO_SWITCH(from, to, action, jump) ({                                            \
+#define CO_SWITCH(from, to, action) ({                                                  \
     int ret = action;                                                                   \
     void *from_ = from;                                                                 \
     void *to_ = to;                                                                     \
@@ -94,7 +94,7 @@ static void start_switch_fiber(void **fake_stack_save,
         "1: movq (%%rsp), %%rbp\n"          /* save source IP for debugging */          \
         "movq %%rsp, %c[sp](%[FROM])\n"     /* save source SP */                        \
         "movq %c[sp](%[TO]), %%rsp\n"       /* load destination SP */                   \
-        jump "\n"                           /* coroutine switch */                      \
+        "ret\n"                             /* coroutine switch */                      \
         "2:"                                                                            \
         ".cfi_adjust_cfa_offset -8\n"                                                   \
         "popq %%rbp\n"                                                                  \
@@ -115,15 +115,14 @@ static void __attribute__((__used__)) coroutine_trampoline(void *arg)
     finish_switch_fiber(NULL);

     while (true) {
-        qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
         co->entry(co->entry_arg);
+        qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
     }
 }

 Coroutine *qemu_coroutine_new(void)
 {
     CoroutineX86 *co;
-    void *fake_stack_save = NULL;

     co = g_malloc0(sizeof(*co));
     co->stack_size = COROUTINE_STACK_SIZE;
@@ -135,12 +134,10 @@ Coroutine *qemu_coroutine_new(void)
         VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size);
 #endif

-    /* Immediately enter the coroutine once to pass it its address as the argument */
-    co->base.caller = qemu_coroutine_self();
-    start_switch_fiber(&fake_stack_save, co->stack, co->stack_size);
-    CO_SWITCH(current, co, 0, "jmp coroutine_trampoline");
-    finish_switch_fiber(fake_stack_save);
-    co->base.caller = NULL;
+    /* Put entry point on the stack; 8 more bytes for the stack alignment
+     * required by the calling convention. */
+    co->sp -= 16;
+    *(uint64_t*) co->sp = (uint64_t) coroutine_trampoline;

     return &co->base;
 }
@@ -193,7 +190,7 @@ qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,

     start_switch_fiber(action == COROUTINE_TERMINATE ?
                        NULL : &fake_stack_save, to->stack, to->stack_size);
-    action = CO_SWITCH(from, to, action, "ret");
+    action = CO_SWITCH(from, to, action);
     finish_switch_fiber(fake_stack_save);

     return action;
Paolo Bonzini March 13, 2019, 5:12 p.m. UTC | #2
On 13/03/19 15:14, Kevin Wolf wrote:
>> +    /* Immediately enter the coroutine once to pass it its address as the argument */
>> +    co->base.caller = qemu_coroutine_self();
>> +    start_switch_fiber(&fake_stack_save, co->stack, co->stack_size);
>> +    CO_SWITCH(current, co, 0, "jmp coroutine_trampoline");
>> +    finish_switch_fiber(fake_stack_save);
>> +    co->base.caller = NULL;
> ...but why is this necessary? CO_SWITCH() always passes the coroutine in
> rdi, not just here, so wouldn't the first real call do this, too?
> 
> Ah, I see, because of the 'jmp coroutine_trampoline'. But the comment is
> really misleading. Actually, I think the code would become simpler if
> you just put the address of coroutine_trampoline on the initial stack
> and then have 'ret' unconditionally (see below for a quick attempt at
> something to squash in).

Actually, it becomes even simpler if I do "call coroutine_trampoline".
Then I don't have to fiddle with the stack pointer anymore in order to
correct the alignment: the ABI says that %sp must be aligned before the
call, and it already will be if I let "call" push the return address.

It's true that then I wouldn't really need the CO_SWITCH at all here,
but leaving it there might make it a bit simpler to port to other
architectures, by removing the target-dependent stack pointer manipulation.

I'll fix the comment though.

>> +    return &co->base;
>> +}
>> +
>> +#ifdef CONFIG_VALGRIND_H
>> +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__)
>> +/* Work around an unused variable in the valgrind.h macro... */
>> +#pragma GCC diagnostic push
>> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
>> +#endif
>> +static inline void valgrind_stack_deregister(CoroutineX86 *co)
>> +{
>> +    VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id);
>> +}
>> +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__)
>> +#pragma GCC diagnostic pop
>> +#endif
>> +#endif
> Another candidate for sharing instead of duplicating? (You could
> trivially pass the valgrind_stack_id instead of the CoroutineX86
> object.)

Yes, good idea.

Paolo
diff mbox series

Patch

diff --git a/configure b/configure
index 62a2a490f2..af65edc30a 100755
--- a/configure
+++ b/configure
@@ -5123,6 +5123,14 @@  else
       error_exit "only the 'windows' coroutine backend is valid for Windows"
     fi
     ;;
+  x86)
+    if test "$mingw32" = "yes"; then
+      error_exit "only the 'windows' coroutine backend is valid for Windows"
+    fi
+    if test "$cpu" != "x86_64"; then
+      error_exit "the 'x86' backend is only valid for x86_64 hosts"
+    fi
+    ;;
   *)
     error_exit "unknown coroutine backend $coroutine"
     ;;
diff --git a/scripts/qemugdb/coroutine.py b/scripts/qemugdb/coroutine.py
index db2753d949..f716db22bb 100644
--- a/scripts/qemugdb/coroutine.py
+++ b/scripts/qemugdb/coroutine.py
@@ -10,14 +10,15 @@ 
 # This work is licensed under the terms of the GNU GPL, version 2
 # or later.  See the COPYING file in the top-level directory.
 
-from . import coroutine_ucontext
+from . import coroutine_ucontext, coroutine_x86
 import gdb
 
 VOID_PTR = gdb.lookup_type('void').pointer()
 UINTPTR_T = gdb.lookup_type('uintptr_t')
 
 backends = {
-    'CoroutineUContext': coroutine_ucontext
+    'CoroutineUContext': coroutine_ucontext,
+    'CoroutineX86': coroutine_x86
 }
 
 def coroutine_backend():
diff --git a/scripts/qemugdb/coroutine_x86.py b/scripts/qemugdb/coroutine_x86.py
new file mode 100644
index 0000000000..05f830cdb8
--- /dev/null
+++ b/scripts/qemugdb/coroutine_x86.py
@@ -0,0 +1,21 @@ 
+#!/usr/bin/python
+
+# GDB debugging support
+#
+# Copyright 2019 Red Hat, Inc.
+#
+# Authors:
+#  Paolo Bonzini <pbonzini@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later.  See the COPYING file in the top-level directory.
+
+import gdb
+
+U64_PTR = gdb.lookup_type('uint64_t').pointer()
+
+def get_coroutine_regs(addr):
+    addr = addr.cast(gdb.lookup_type('CoroutineX86').pointer())
+    rsp = addr['sp'].cast(U64_PTR)
+    return {'rsp': rsp,
+            'rip': rsp.dereference()}
diff --git a/util/coroutine-x86.c b/util/coroutine-x86.c
new file mode 100644
index 0000000000..7f5e7d7696
--- /dev/null
+++ b/util/coroutine-x86.c
@@ -0,0 +1,213 @@ 
+/*
+ * x86-specific coroutine initialization code
+ *
+ * Copyright (C) 2006  Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011  Kevin Wolf <kwolf@redhat.com>
+ * Copyright (C) 2019  Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.0 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/coroutine_int.h"
+
+#ifdef CONFIG_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
+#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer)
+#ifdef CONFIG_ASAN_IFACE_FIBER
+#define CONFIG_ASAN 1
+#include <sanitizer/asan_interface.h>
+#endif
+#endif
+
+typedef struct {
+    Coroutine base;
+    void *stack;
+    size_t stack_size;
+    void *sp;
+
+#ifdef CONFIG_VALGRIND_H
+    unsigned int valgrind_stack_id;
+#endif
+} CoroutineX86;
+
+/**
+ * Per-thread coroutine bookkeeping
+ */
+static __thread CoroutineX86 leader;
+static __thread Coroutine *current;
+
+static void finish_switch_fiber(void *fake_stack_save)
+{
+#ifdef CONFIG_ASAN
+    const void *bottom_old;
+    size_t size_old;
+
+    __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old);
+
+    if (!leader.stack) {
+        leader.stack = (void *)bottom_old;
+        leader.stack_size = size_old;
+    }
+#endif
+}
+
+static void start_switch_fiber(void **fake_stack_save,
+                               const void *bottom, size_t size)
+{
+#ifdef CONFIG_ASAN
+    __sanitizer_start_switch_fiber(fake_stack_save, bottom, size);
+#endif
+}
+
+/* On entry to a coroutine, rax is "action" and rdi is the coroutine itself.  */
+#define CO_SWITCH(from, to, action, jump) ({                                            \
+    int ret = action;                                                                   \
+    void *from_ = from;                                                                 \
+    void *to_ = to;                                                                     \
+    asm volatile(                                                                       \
+        ".cfi_remember_state\n"                                                         \
+        "pushq %%rbp\n"                     /* save scratch register on source stack */ \
+        ".cfi_adjust_cfa_offset 8\n"                                                    \
+        ".cfi_rel_offset %%rbp, 0\n"                                                    \
+        "call 1f\n"                         /* switch continues at label 1 */           \
+        ".cfi_adjust_cfa_offset 8\n"                                                    \
+        "jmp 2f\n"                          /* switch back continues at label 2 */      \
+        "1: movq (%%rsp), %%rbp\n"          /* save source IP for debugging */          \
+        "movq %%rsp, %c[sp](%[FROM])\n"     /* save source SP */                        \
+        "movq %c[sp](%[TO]), %%rsp\n"       /* load destination SP */                   \
+        jump "\n"                           /* coroutine switch */                      \
+        "2:"                                                                            \
+        ".cfi_adjust_cfa_offset -8\n"                                                   \
+        "popq %%rbp\n"                                                                  \
+        ".cfi_adjust_cfa_offset -8\n"                                                   \
+        ".cfi_restore_state\n"                                                          \
+        : "+a" (ret), [FROM] "+b" (from_), [TO] "+D" (to_)                              \
+        : [sp] "i" (offsetof(CoroutineX86, sp))                                         \
+        : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",    \
+          "memory");                                                                    \
+    ret; \
+})
+
+static void __attribute__((__used__)) coroutine_trampoline(void *arg)
+{
+    CoroutineX86 *self = arg;
+    Coroutine *co = &self->base;
+
+    finish_switch_fiber(NULL);
+
+    while (true) {
+        qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+        co->entry(co->entry_arg);
+    }
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+    CoroutineX86 *co;
+    void *fake_stack_save = NULL;
+
+    co = g_malloc0(sizeof(*co));
+    co->stack_size = COROUTINE_STACK_SIZE;
+    co->stack = qemu_alloc_stack(&co->stack_size);
+    co->sp = co->stack + co->stack_size;
+
+#ifdef CONFIG_VALGRIND_H
+    co->valgrind_stack_id =
+        VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size);
+#endif
+
+    /* Immediately enter the coroutine once to pass it its address as the argument */
+    co->base.caller = qemu_coroutine_self();
+    start_switch_fiber(&fake_stack_save, co->stack, co->stack_size);
+    CO_SWITCH(current, co, 0, "jmp coroutine_trampoline");
+    finish_switch_fiber(fake_stack_save);
+    co->base.caller = NULL;
+
+    return &co->base;
+}
+
+#ifdef CONFIG_VALGRIND_H
+#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__)
+/* Work around an unused variable in the valgrind.h macro... */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+static inline void valgrind_stack_deregister(CoroutineX86 *co)
+{
+    VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id);
+}
+#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+    CoroutineX86 *co = DO_UPCAST(CoroutineX86, base, co_);
+
+#ifdef CONFIG_VALGRIND_H
+    valgrind_stack_deregister(co);
+#endif
+
+    qemu_free_stack(co->stack, co->stack_size);
+    g_free(co);
+}
+
+/*
+ * This function is marked noinline to prevent GCC from inlining it
+ * into coroutine_trampoline(). If we allow it to do that then it
+ * hoists the code to get the address of the TLS variable "current"
+ * out of the while() loop. This is an invalid transformation because
+ * qemu_coroutine_switch() may be called when running thread A but
+ * return in thread B, and so we might be in a different thread
+ * context each time round the loop.
+ */
+CoroutineAction __attribute__((noinline))
+qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+                      CoroutineAction action)
+{
+    CoroutineX86 *from = DO_UPCAST(CoroutineX86, base, from_);
+    CoroutineX86 *to = DO_UPCAST(CoroutineX86, base, to_);
+    void *fake_stack_save = NULL;
+
+    current = to_;
+
+    start_switch_fiber(action == COROUTINE_TERMINATE ?
+                       NULL : &fake_stack_save, to->stack, to->stack_size);
+    action = CO_SWITCH(from, to, action, "ret");
+    finish_switch_fiber(fake_stack_save);
+
+    return action;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+    if (!current) {
+        current = &leader.base;
+    }
+    return current;
+}
+
+bool qemu_in_coroutine(void)
+{
+    return current && current->caller;
+}