| Field      | Value                                                             |
|------------|-------------------------------------------------------------------|
| Message ID | 20190311123507.24867-4-pbonzini@redhat.com (mailing list archive) |
| State      | New, archived                                                     |
| Headers    | show                                                              |
| Series     | coroutine: add x86 specific coroutine backend (expand)            |
Am 11.03.2019 um 13:35 hat Paolo Bonzini geschrieben: > This backend is faster (100ns vs 150ns per switch on my laptop), but > especially it will be possible to add CET support to it in 4.1. In > the meanwhile, it is nice to have it as an experimental alternative. > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Creating a qcow2 image fails for me with a General Protection Fault. Looks like this is because of a 'movaps %xmm0,(%rsp)' with an unaligned stack pointer (0x7fffec5f8b78). We need to start with rsp 8 bytes lower to comply with the calling convention. > --- /dev/null > +++ b/util/coroutine-x86.c > @@ -0,0 +1,213 @@ > +/* > + * x86-specific coroutine initialization code > + * > + * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws> > + * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com> > + * Copyright (C) 2019 Paolo Bonzini <pbonzini@redhat.com> > + * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.0 of the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */ > +#ifdef _FORTIFY_SOURCE > +#undef _FORTIFY_SOURCE > +#endif You don't use setjmp/longjmp, so is this really necessary? 
> +#include "qemu/osdep.h" > +#include "qemu-common.h" > +#include "qemu/coroutine_int.h" > + > +#ifdef CONFIG_VALGRIND_H > +#include <valgrind/valgrind.h> > +#endif > + > +#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer) > +#ifdef CONFIG_ASAN_IFACE_FIBER > +#define CONFIG_ASAN 1 > +#include <sanitizer/asan_interface.h> > +#endif > +#endif > + > +typedef struct { > + Coroutine base; > + void *stack; > + size_t stack_size; > + void *sp; > + > +#ifdef CONFIG_VALGRIND_H > + unsigned int valgrind_stack_id; > +#endif > +} CoroutineX86; > + > +/** > + * Per-thread coroutine bookkeeping > + */ > +static __thread CoroutineX86 leader; > +static __thread Coroutine *current; > + > +static void finish_switch_fiber(void *fake_stack_save) > +{ > +#ifdef CONFIG_ASAN > + const void *bottom_old; > + size_t size_old; > + > + __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old); > + > + if (!leader.stack) { > + leader.stack = (void *)bottom_old; > + leader.stack_size = size_old; > + } > +#endif > +} > + > +static void start_switch_fiber(void **fake_stack_save, > + const void *bottom, size_t size) > +{ > +#ifdef CONFIG_ASAN > + __sanitizer_start_switch_fiber(fake_stack_save, bottom, size); > +#endif > +} These two functions are duplicated without changes between ucontext and x86, and they aren't really backend-specific. Should they be moved to a place where both backends can share them, like util/qemu-coroutine.c? > +/* On entry to a coroutine, rax is "value" and rsi is the coroutine itself. */ rax is "action" (not "value"), and the coroutine is rdi (not rsi). 
> +#define CO_SWITCH(from, to, action, jump) ({ \ > + int ret = action; \ > + void *from_ = from; \ > + void *to_ = to; \ > + asm volatile( \ > + ".cfi_remember_state\n" \ > + "pushq %%rbp\n" /* save scratch register on source stack */ \ > + ".cfi_adjust_cfa_offset 8\n" \ > + ".cfi_rel_offset %%rbp, 0\n" \ > + "call 1f\n" /* switch continues at label 1 */ \ > + ".cfi_adjust_cfa_offset 8\n" \ > + "jmp 2f\n" /* switch back continues at label 2 */ \ > + "1: movq (%%rsp), %%rbp\n" /* save source IP for debugging */ \ > + "movq %%rsp, %c[sp](%[FROM])\n" /* save source SP */ \ > + "movq %c[sp](%[TO]), %%rsp\n" /* load destination SP */ \ > + jump "\n" /* coroutine switch */ \ > + "2:" \ > + ".cfi_adjust_cfa_offset -8\n" \ > + "popq %%rbp\n" \ > + ".cfi_adjust_cfa_offset -8\n" \ > + ".cfi_restore_state\n" \ > + : "+a" (ret), [FROM] "+b" (from_), [TO] "+D" (to_) \ > + : [sp] "i" (offsetof(CoroutineX86, sp)) \ > + : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", \ > + "memory"); \ > + ret; \ > +}) > + > +static void __attribute__((__used__)) coroutine_trampoline(void *arg) > +{ > + CoroutineX86 *self = arg; > + Coroutine *co = &self->base; > + > + finish_switch_fiber(NULL); > + > + while (true) { > + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); > + co->entry(co->entry_arg); Okay, inverse order because you have a fake entry on creation below... 
> + } > +} > + > +Coroutine *qemu_coroutine_new(void) > +{ > + CoroutineX86 *co; > + void *fake_stack_save = NULL; > + > + co = g_malloc0(sizeof(*co)); > + co->stack_size = COROUTINE_STACK_SIZE; > + co->stack = qemu_alloc_stack(&co->stack_size); > + co->sp = co->stack + co->stack_size; > + > +#ifdef CONFIG_VALGRIND_H > + co->valgrind_stack_id = > + VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size); > +#endif > + > + /* Immediately enter the coroutine once to pass it its address as the argument */ > + co->base.caller = qemu_coroutine_self(); > + start_switch_fiber(&fake_stack_save, co->stack, co->stack_size); > + CO_SWITCH(current, co, 0, "jmp coroutine_trampoline"); > + finish_switch_fiber(fake_stack_save); > + co->base.caller = NULL; ...but why is this necessary? CO_SWITCH() always passes the coroutine in rdi, not just here, so wouldn't the first real call do this, too? Ah, I see, because of the 'jmp coroutine_trampoline'. But the comment is really misleading. Actually, I think the code would become simpler if you just put the address of coroutine_trampoline on the initial stack and then have 'ret' unconditionally (see below for a quick attempt at something to squash in). > + return &co->base; > +} > + > +#ifdef CONFIG_VALGRIND_H > +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__) > +/* Work around an unused variable in the valgrind.h macro... */ > +#pragma GCC diagnostic push > +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" > +#endif > +static inline void valgrind_stack_deregister(CoroutineX86 *co) > +{ > + VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id); > +} > +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__) > +#pragma GCC diagnostic pop > +#endif > +#endif Another candidate for sharing instead of duplicating? (You could trivially pass the valgrind_stack_id instead of the CoroutineX86 object.) 
Kevin diff --git a/util/coroutine-x86.c b/util/coroutine-x86.c index b7649e7ae1..ff78c298c9 100644 --- a/util/coroutine-x86.c +++ b/util/coroutine-x86.c @@ -79,7 +79,7 @@ static void start_switch_fiber(void **fake_stack_save, } /* On entry to a coroutine, rax is "value" and rsi is the coroutine itself. */ -#define CO_SWITCH(from, to, action, jump) ({ \ +#define CO_SWITCH(from, to, action) ({ \ int ret = action; \ void *from_ = from; \ void *to_ = to; \ @@ -94,7 +94,7 @@ static void start_switch_fiber(void **fake_stack_save, "1: movq (%%rsp), %%rbp\n" /* save source IP for debugging */ \ "movq %%rsp, %c[sp](%[FROM])\n" /* save source SP */ \ "movq %c[sp](%[TO]), %%rsp\n" /* load destination SP */ \ - jump "\n" /* coroutine switch */ \ + "ret\n" /* coroutine switch */ \ "2:" \ ".cfi_adjust_cfa_offset -8\n" \ "popq %%rbp\n" \ @@ -115,15 +115,14 @@ static void __attribute__((__used__)) coroutine_trampoline(void *arg) finish_switch_fiber(NULL); while (true) { - qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); } } Coroutine *qemu_coroutine_new(void) { CoroutineX86 *co; - void *fake_stack_save = NULL; co = g_malloc0(sizeof(*co)); co->stack_size = COROUTINE_STACK_SIZE; @@ -135,12 +134,10 @@ Coroutine *qemu_coroutine_new(void) VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size); #endif - /* Immediately enter the coroutine once to pass it its address as the argument */ - co->base.caller = qemu_coroutine_self(); - start_switch_fiber(&fake_stack_save, co->stack, co->stack_size); - CO_SWITCH(current, co, 0, "jmp coroutine_trampoline"); - finish_switch_fiber(fake_stack_save); - co->base.caller = NULL; + /* Put entry point on the stack; 8 more bytes for the stack alignment + * required by the calling convention. 
*/ + co->sp -= 16; + *(uint64_t*) co->sp = (uint64_t) coroutine_trampoline; return &co->base; } @@ -193,7 +190,7 @@ qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, start_switch_fiber(action == COROUTINE_TERMINATE ? NULL : &fake_stack_save, to->stack, to->stack_size); - action = CO_SWITCH(from, to, action, "ret"); + action = CO_SWITCH(from, to, action); finish_switch_fiber(fake_stack_save); return action;
On 13/03/19 15:14, Kevin Wolf wrote: >> + /* Immediately enter the coroutine once to pass it its address as the argument */ >> + co->base.caller = qemu_coroutine_self(); >> + start_switch_fiber(&fake_stack_save, co->stack, co->stack_size); >> + CO_SWITCH(current, co, 0, "jmp coroutine_trampoline"); >> + finish_switch_fiber(fake_stack_save); >> + co->base.caller = NULL; > ...but why is this necessary? CO_SWITCH() always passes the coroutine in > rdi, not just here, so wouldn't the first real call do this, too? > > Ah, I see, because of the 'jmp coroutine_trampoline'. But the comment is > really misleading. Actually, I think the code would become simpler if > you just put the address of coroutine_trampoline on the initial stack > and then have 'ret' unconditionally (see below for a quick attempt at > something to squash in). Actually, it becomes even simpler if I do "call coroutine_trampoline". Then I don't have to fiddle with the stack pointer anymore in order to correct the alignment: the ABI says that %sp must be aligned before the call, and it already will be if I let "call" push the return address. It's true that then I wouldn't really need the CO_SWITCH at all here, but leaving it there might make it a bit simpler to port to other architectures, by removing the target-dependent stack pointer manipulation. I'll fix the comment though. >> + return &co->base; >> +} >> + >> +#ifdef CONFIG_VALGRIND_H >> +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__) >> +/* Work around an unused variable in the valgrind.h macro... */ >> +#pragma GCC diagnostic push >> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" >> +#endif >> +static inline void valgrind_stack_deregister(CoroutineX86 *co) >> +{ >> + VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id); >> +} >> +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__) >> +#pragma GCC diagnostic pop >> +#endif >> +#endif > Another candidate for sharing instead of duplicating? 
(You could > trivially pass the valgrind_stack_id instead of the CoroutineX86 > object.) Yes, good idea. Paolo
diff --git a/configure b/configure index 62a2a490f2..af65edc30a 100755 --- a/configure +++ b/configure @@ -5123,6 +5123,14 @@ else error_exit "only the 'windows' coroutine backend is valid for Windows" fi ;; + x86) + if test "$mingw32" = "yes"; then + error_exit "only the 'windows' coroutine backend is valid for Windows" + fi + if test "$cpu" != "x86_64"; then + error_exit "the 'x86' backend is only valid for x86_64 hosts" + fi + ;; *) error_exit "unknown coroutine backend $coroutine" ;; diff --git a/scripts/qemugdb/coroutine.py b/scripts/qemugdb/coroutine.py index db2753d949..f716db22bb 100644 --- a/scripts/qemugdb/coroutine.py +++ b/scripts/qemugdb/coroutine.py @@ -10,14 +10,15 @@ # This work is licensed under the terms of the GNU GPL, version 2 # or later. See the COPYING file in the top-level directory. -from . import coroutine_ucontext +from . import coroutine_ucontext, coroutine_x86 import gdb VOID_PTR = gdb.lookup_type('void').pointer() UINTPTR_T = gdb.lookup_type('uintptr_t') backends = { - 'CoroutineUContext': coroutine_ucontext + 'CoroutineUContext': coroutine_ucontext, + 'CoroutineX86': coroutine_x86 } def coroutine_backend(): diff --git a/scripts/qemugdb/coroutine_x86.py b/scripts/qemugdb/coroutine_x86.py new file mode 100644 index 0000000000..05f830cdb8 --- /dev/null +++ b/scripts/qemugdb/coroutine_x86.py @@ -0,0 +1,21 @@ +#!/usr/bin/python + +# GDB debugging support +# +# Copyright 2019 Red Hat, Inc. +# +# Authors: +# Paolo Bonzini <pbonzini@redhat.com> +# +# This work is licensed under the terms of the GNU GPL, version 2 or +# later. See the COPYING file in the top-level directory. 
+ +import gdb + +U64_PTR = gdb.lookup_type('uint64_t').pointer() + +def get_coroutine_regs(addr): + addr = addr.cast(gdb.lookup_type('CoroutineX86').pointer()) + rsp = addr['sp'].cast(U64_PTR) + return {'rsp': rsp, + 'rip': rsp.dereference()} diff --git a/util/coroutine-x86.c b/util/coroutine-x86.c new file mode 100644 index 0000000000..7f5e7d7696 --- /dev/null +++ b/util/coroutine-x86.c @@ -0,0 +1,213 @@ +/* + * x86-specific coroutine initialization code + * + * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws> + * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com> + * Copyright (C) 2019 Paolo Bonzini <pbonzini@redhat.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.0 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? 
*/ +#ifdef _FORTIFY_SOURCE +#undef _FORTIFY_SOURCE +#endif +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/coroutine_int.h" + +#ifdef CONFIG_VALGRIND_H +#include <valgrind/valgrind.h> +#endif + +#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer) +#ifdef CONFIG_ASAN_IFACE_FIBER +#define CONFIG_ASAN 1 +#include <sanitizer/asan_interface.h> +#endif +#endif + +typedef struct { + Coroutine base; + void *stack; + size_t stack_size; + void *sp; + +#ifdef CONFIG_VALGRIND_H + unsigned int valgrind_stack_id; +#endif +} CoroutineX86; + +/** + * Per-thread coroutine bookkeeping + */ +static __thread CoroutineX86 leader; +static __thread Coroutine *current; + +static void finish_switch_fiber(void *fake_stack_save) +{ +#ifdef CONFIG_ASAN + const void *bottom_old; + size_t size_old; + + __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old); + + if (!leader.stack) { + leader.stack = (void *)bottom_old; + leader.stack_size = size_old; + } +#endif +} + +static void start_switch_fiber(void **fake_stack_save, + const void *bottom, size_t size) +{ +#ifdef CONFIG_ASAN + __sanitizer_start_switch_fiber(fake_stack_save, bottom, size); +#endif +} + +/* On entry to a coroutine, rax is "value" and rsi is the coroutine itself. 
*/ +#define CO_SWITCH(from, to, action, jump) ({ \ + int ret = action; \ + void *from_ = from; \ + void *to_ = to; \ + asm volatile( \ + ".cfi_remember_state\n" \ + "pushq %%rbp\n" /* save scratch register on source stack */ \ + ".cfi_adjust_cfa_offset 8\n" \ + ".cfi_rel_offset %%rbp, 0\n" \ + "call 1f\n" /* switch continues at label 1 */ \ + ".cfi_adjust_cfa_offset 8\n" \ + "jmp 2f\n" /* switch back continues at label 2 */ \ + "1: movq (%%rsp), %%rbp\n" /* save source IP for debugging */ \ + "movq %%rsp, %c[sp](%[FROM])\n" /* save source SP */ \ + "movq %c[sp](%[TO]), %%rsp\n" /* load destination SP */ \ + jump "\n" /* coroutine switch */ \ + "2:" \ + ".cfi_adjust_cfa_offset -8\n" \ + "popq %%rbp\n" \ + ".cfi_adjust_cfa_offset -8\n" \ + ".cfi_restore_state\n" \ + : "+a" (ret), [FROM] "+b" (from_), [TO] "+D" (to_) \ + : [sp] "i" (offsetof(CoroutineX86, sp)) \ + : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", \ + "memory"); \ + ret; \ +}) + +static void __attribute__((__used__)) coroutine_trampoline(void *arg) +{ + CoroutineX86 *self = arg; + Coroutine *co = &self->base; + + finish_switch_fiber(NULL); + + while (true) { + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + co->entry(co->entry_arg); + } +} + +Coroutine *qemu_coroutine_new(void) +{ + CoroutineX86 *co; + void *fake_stack_save = NULL; + + co = g_malloc0(sizeof(*co)); + co->stack_size = COROUTINE_STACK_SIZE; + co->stack = qemu_alloc_stack(&co->stack_size); + co->sp = co->stack + co->stack_size; + +#ifdef CONFIG_VALGRIND_H + co->valgrind_stack_id = + VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size); +#endif + + /* Immediately enter the coroutine once to pass it its address as the argument */ + co->base.caller = qemu_coroutine_self(); + start_switch_fiber(&fake_stack_save, co->stack, co->stack_size); + CO_SWITCH(current, co, 0, "jmp coroutine_trampoline"); + finish_switch_fiber(fake_stack_save); + co->base.caller = NULL; + + return &co->base; +} + 
+#ifdef CONFIG_VALGRIND_H +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__) +/* Work around an unused variable in the valgrind.h macro... */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif +static inline void valgrind_stack_deregister(CoroutineX86 *co) +{ + VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id); +} +#if defined(CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif +#endif + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineX86 *co = DO_UPCAST(CoroutineX86, base, co_); + +#ifdef CONFIG_VALGRIND_H + valgrind_stack_deregister(co); +#endif + + qemu_free_stack(co->stack, co->stack_size); + g_free(co); +} + +/* + * This function is marked noinline to prevent GCC from inlining it + * into coroutine_trampoline(). If we allow it to do that then it + * hoists the code to get the address of the TLS variable "current" + * out of the while() loop. This is an invalid transformation because + * qemu_coroutine_switch() may be called when running thread A but + * return in thread B, and so we might be in a different thread + * context each time round the loop. + */ +CoroutineAction __attribute__((noinline)) +qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineX86 *from = DO_UPCAST(CoroutineX86, base, from_); + CoroutineX86 *to = DO_UPCAST(CoroutineX86, base, to_); + void *fake_stack_save = NULL; + + current = to_; + + start_switch_fiber(action == COROUTINE_TERMINATE ? + NULL : &fake_stack_save, to->stack, to->stack_size); + action = CO_SWITCH(from, to, action, "ret"); + finish_switch_fiber(fake_stack_save); + + return action; +} + +Coroutine *qemu_coroutine_self(void) +{ + if (!current) { + current = &leader.base; + } + return current; +} + +bool qemu_in_coroutine(void) +{ + return current && current->caller; +}
This backend is faster (100ns vs 150ns per switch on my laptop), but especially it will be possible to add CET support to it in 4.1. In the meanwhile, it is nice to have it as an experimental alternative. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- configure | 8 ++ scripts/qemugdb/coroutine.py | 5 +- scripts/qemugdb/coroutine_x86.py | 21 +++ util/coroutine-x86.c | 213 +++++++++++++++++++++++++++++++ 4 files changed, 245 insertions(+), 2 deletions(-) create mode 100644 scripts/qemugdb/coroutine_x86.py create mode 100644 util/coroutine-x86.c