Message ID | 1461283583-2833-1-git-send-email-cota@braap.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Emilio G. Cota <cota@braap.org> writes: > This is a first attempt at making tb_flush not have to stop all CPUs. > There are issues as pointed out below, but this could be a good start. > > Context: > https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html > https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html > > Known issues: > - Basically compile-tested only, since I've only run this with > single-threaded TCG; I also tried running it with linux-user, > but in order to trigger tb_flush I had to make code_gen_buffer > so small that the CPU calling tb_flush would immediately fill > the 2nd buffer, triggering the assert. If you have a working > multi-threaded workload that would be good to test this, please > let me know. With my latest mttcg unit tests: ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \ -device virtio-serial-device -device virtconsole,chardev=ctd \ -chardev testdev,id=ctd -display none -serial stdio \ -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \ -append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on > - Windows; not even compile-tested! > > Signed-off-by: Emilio G. 
Cota <cota@braap.org> > --- > translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 117 insertions(+), 5 deletions(-) > > diff --git a/translate-all.c b/translate-all.c > index bba9b62..4c14b4d 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1) > #endif > > #ifdef USE_STATIC_CODE_GEN_BUFFER > -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE] > +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE] > __attribute__((aligned(CODE_GEN_ALIGN))); > +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE] > + __attribute__((aligned(CODE_GEN_ALIGN))); > +static int static_buf_mask = 1; > +static void *static_buf1; > +static void *static_buf2; > > # ifdef _WIN32 > static inline void do_protect(void *addr, long size, int prot) > @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size) > } > # endif /* WIN32 */ > > -static inline void *alloc_code_gen_buffer(void) > +static void *alloc_static_code_gen_buffer(void *buf) > { > - void *buf = static_code_gen_buffer; > size_t full_size, size; > > /* The size of the buffer, rounded down to end on a page boundary. */ > - full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer)) > + full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1)) > & qemu_real_host_page_mask) - (uintptr_t)buf; > > /* Reserve a guard page. 
*/ > @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void) > > return buf; > } > + > +static inline void *alloc_code_gen_buffer(void) > +{ > + static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1); > + static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2); > + > + assert(static_buf_mask == 1); > + return static_buf1; > +} > #elif defined(_WIN32) > static inline void *alloc_code_gen_buffer(void) > { > @@ -829,8 +842,100 @@ static void page_flush_tb(void) > } > } > > +#ifdef USE_STATIC_CODE_GEN_BUFFER > + > +struct code_gen_desc { > + struct rcu_head rcu; > + int clear_bit; > +}; > + > +static void code_gen_buffer_clear(struct rcu_head *rcu) > +{ > + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); > + > + tb_lock(); > + static_buf_mask &= ~desc->clear_bit; > + tb_unlock(); > + g_free(desc); > +} > + > +static void *code_gen_buffer_replace(void) > +{ > + struct code_gen_desc *desc = g_malloc0(sizeof(*desc)); > + > + /* > + * If both bits are set, we're having two concurrent flushes. This > + * can easily happen if the buffers are heavily undersized. 
> + */ > + assert(static_buf_mask == 1 || static_buf_mask == 2); > + > + desc->clear_bit = static_buf_mask; > + call_rcu1(&desc->rcu, code_gen_buffer_clear); > + > + if (static_buf_mask == 1) { > + static_buf_mask |= 2; > + return static_buf2; > + } > + static_buf_mask |= 1; > + return static_buf1; > +} > + > +#elif defined(_WIN32) > + > +struct code_gen_desc { > + struct rcu_head rcu; > + void *buf; > +}; > + > +static void code_gen_buffer_vfree(struct rcu_head *rcu) > +{ > + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); > + > + VirtualFree(desc->buf, 0, MEM_RELEASE); > + g_free(desc); > +} > + > +static void *code_gen_buffer_replace(void) > +{ > + struct code_gen_desc *desc; > + > + desc = g_malloc0(sizeof(*desc)); > + desc->buf = tcg_ctx.code_gen_buffer; > + call_rcu1(&desc->rcu, code_gen_buffer_vfree); > + > + return alloc_code_gen_buffer(); > +} > + > +#else /* UNIX, dynamically-allocated code buffer */ > + > +struct code_gen_desc { > + struct rcu_head rcu; > + void *buf; > + size_t size; > +}; > + > +static void code_gen_buffer_unmap(struct rcu_head *rcu) > +{ > + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); > + > + munmap(desc->buf, desc->size + qemu_real_host_page_size); > + g_free(desc); > +} > + > +static void *code_gen_buffer_replace(void) > +{ > + struct code_gen_desc *desc; > + > + desc = g_malloc0(sizeof(*desc)); > + desc->buf = tcg_ctx.code_gen_buffer; > + desc->size = tcg_ctx.code_gen_buffer_size; > + call_rcu1(&desc->rcu, code_gen_buffer_unmap); > + > + return alloc_code_gen_buffer(); > +} > +#endif /* USE_STATIC_CODE_GEN_BUFFER */ > + > /* flush all the translation blocks */ > -/* XXX: tb_flush is currently not thread safe */ > void tb_flush(CPUState *cpu) > { > #if defined(DEBUG_FLUSH) > @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu) > qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE); > page_flush_tb(); > > + tcg_ctx.code_gen_buffer = code_gen_buffer_replace(); > 
tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; > + tcg_prologue_init(&tcg_ctx); > /* XXX: flush processor icache at this point if cache flush is > expensive */ > tcg_ctx.tb_ctx.tb_flush_count++; > + > + /* exit all CPUs so that the old buffer is quickly cleared. */ > + CPU_FOREACH(cpu) { > + cpu_exit(cpu); > + } > } > > #ifdef DEBUG_TB_CHECK -- Alex Bennée
Alex Bennée <alex.bennee@linaro.org> writes: > Emilio G. Cota <cota@braap.org> writes: > >> This is a first attempt at making tb_flush not have to stop all CPUs. >> There are issues as pointed out below, but this could be a good start. >> >> Context: >> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html >> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html >> >> Known issues: >> - Basically compile-tested only, since I've only run this with >> single-threaded TCG; I also tried running it with linux-user, >> but in order to trigger tb_flush I had to make code_gen_buffer >> so small that the CPU calling tb_flush would immediately fill >> the 2nd buffer, triggering the assert. If you have a working >> multi-threaded workload that would be good to test this, please >> let me know. > > With my latest mttcg unit tests: > > ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \ > -device virtio-serial-device -device virtconsole,chardev=ctd \ > -chardev testdev,id=ctd -display none -serial stdio \ > -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \ > -append "tight smc irq mod=1 rounds=100000" -name > arm,debug-threads=on Ahh, I just realised you wanted a linux-user workload. > > >> - Windows; not even compile-tested! >> >> Signed-off-by: Emilio G. 
Cota <cota@braap.org> >> --- >> translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- >> 1 file changed, 117 insertions(+), 5 deletions(-) >> >> diff --git a/translate-all.c b/translate-all.c >> index bba9b62..4c14b4d 100644 >> --- a/translate-all.c >> +++ b/translate-all.c >> @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1) >> #endif >> >> #ifdef USE_STATIC_CODE_GEN_BUFFER >> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE] >> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE] >> __attribute__((aligned(CODE_GEN_ALIGN))); >> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE] >> + __attribute__((aligned(CODE_GEN_ALIGN))); >> +static int static_buf_mask = 1; >> +static void *static_buf1; >> +static void *static_buf2; >> >> # ifdef _WIN32 >> static inline void do_protect(void *addr, long size, int prot) >> @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size) >> } >> # endif /* WIN32 */ >> >> -static inline void *alloc_code_gen_buffer(void) >> +static void *alloc_static_code_gen_buffer(void *buf) >> { >> - void *buf = static_code_gen_buffer; >> size_t full_size, size; >> >> /* The size of the buffer, rounded down to end on a page boundary. */ >> - full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer)) >> + full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1)) >> & qemu_real_host_page_mask) - (uintptr_t)buf; >> >> /* Reserve a guard page. 
*/ >> @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void) >> >> return buf; >> } >> + >> +static inline void *alloc_code_gen_buffer(void) >> +{ >> + static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1); >> + static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2); >> + >> + assert(static_buf_mask == 1); >> + return static_buf1; >> +} >> #elif defined(_WIN32) >> static inline void *alloc_code_gen_buffer(void) >> { >> @@ -829,8 +842,100 @@ static void page_flush_tb(void) >> } >> } >> >> +#ifdef USE_STATIC_CODE_GEN_BUFFER >> + >> +struct code_gen_desc { >> + struct rcu_head rcu; >> + int clear_bit; >> +}; >> + >> +static void code_gen_buffer_clear(struct rcu_head *rcu) >> +{ >> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); >> + >> + tb_lock(); >> + static_buf_mask &= ~desc->clear_bit; >> + tb_unlock(); >> + g_free(desc); >> +} >> + >> +static void *code_gen_buffer_replace(void) >> +{ >> + struct code_gen_desc *desc = g_malloc0(sizeof(*desc)); >> + >> + /* >> + * If both bits are set, we're having two concurrent flushes. This >> + * can easily happen if the buffers are heavily undersized. 
>> + */ >> + assert(static_buf_mask == 1 || static_buf_mask == 2); >> + >> + desc->clear_bit = static_buf_mask; >> + call_rcu1(&desc->rcu, code_gen_buffer_clear); >> + >> + if (static_buf_mask == 1) { >> + static_buf_mask |= 2; >> + return static_buf2; >> + } >> + static_buf_mask |= 1; >> + return static_buf1; >> +} >> + >> +#elif defined(_WIN32) >> + >> +struct code_gen_desc { >> + struct rcu_head rcu; >> + void *buf; >> +}; >> + >> +static void code_gen_buffer_vfree(struct rcu_head *rcu) >> +{ >> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); >> + >> + VirtualFree(desc->buf, 0, MEM_RELEASE); >> + g_free(desc); >> +} >> + >> +static void *code_gen_buffer_replace(void) >> +{ >> + struct code_gen_desc *desc; >> + >> + desc = g_malloc0(sizeof(*desc)); >> + desc->buf = tcg_ctx.code_gen_buffer; >> + call_rcu1(&desc->rcu, code_gen_buffer_vfree); >> + >> + return alloc_code_gen_buffer(); >> +} >> + >> +#else /* UNIX, dynamically-allocated code buffer */ >> + >> +struct code_gen_desc { >> + struct rcu_head rcu; >> + void *buf; >> + size_t size; >> +}; >> + >> +static void code_gen_buffer_unmap(struct rcu_head *rcu) >> +{ >> + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); >> + >> + munmap(desc->buf, desc->size + qemu_real_host_page_size); >> + g_free(desc); >> +} >> + >> +static void *code_gen_buffer_replace(void) >> +{ >> + struct code_gen_desc *desc; >> + >> + desc = g_malloc0(sizeof(*desc)); >> + desc->buf = tcg_ctx.code_gen_buffer; >> + desc->size = tcg_ctx.code_gen_buffer_size; >> + call_rcu1(&desc->rcu, code_gen_buffer_unmap); >> + >> + return alloc_code_gen_buffer(); >> +} >> +#endif /* USE_STATIC_CODE_GEN_BUFFER */ >> + >> /* flush all the translation blocks */ >> -/* XXX: tb_flush is currently not thread safe */ >> void tb_flush(CPUState *cpu) >> { >> #if defined(DEBUG_FLUSH) >> @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu) >> qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE); >> 
page_flush_tb(); >> >> + tcg_ctx.code_gen_buffer = code_gen_buffer_replace(); >> tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; >> + tcg_prologue_init(&tcg_ctx); >> /* XXX: flush processor icache at this point if cache flush is >> expensive */ >> tcg_ctx.tb_ctx.tb_flush_count++; >> + >> + /* exit all CPUs so that the old buffer is quickly cleared. */ >> + CPU_FOREACH(cpu) { >> + cpu_exit(cpu); >> + } >> } >> >> #ifdef DEBUG_TB_CHECK -- Alex Bennée
On 04/21/2016 05:06 PM, Emilio G. Cota wrote: > #ifdef USE_STATIC_CODE_GEN_BUFFER > -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE] > +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE] > __attribute__((aligned(CODE_GEN_ALIGN))); > +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE] > + __attribute__((aligned(CODE_GEN_ALIGN))); > +static int static_buf_mask = 1; > +static void *static_buf1; > +static void *static_buf2; I don't like this at all. (1) This is (by default) 32MB we're adding to the RSS of the simulator. Surely we can do better than this. (2) On some hosts we require that every point in the code gen buffer be within a maximum displacement of the tcg prologue. That means you can't simply allocate two separate buffers. You have to take a single buffer, of known good size and alignment, and split it in half. r~
On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote: > Emilio G. Cota <cota@braap.org> writes: (snip) > > Known issues: > > - Basically compile-tested only, since I've only run this with > > single-threaded TCG; I also tried running it with linux-user, > > but in order to trigger tb_flush I had to make code_gen_buffer > > so small that the CPU calling tb_flush would immediately fill > > the 2nd buffer, triggering the assert. If you have a working > > multi-threaded workload that would be good to test this, please > > let me know. > > With my latest mttcg unit tests: > > ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \ > -device virtio-serial-device -device virtconsole,chardev=ctd \ > -chardev testdev,id=ctd -display none -serial stdio \ > -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \ > -append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on This is useful. Never mind the need for testing linux-user, I can test both code paths (i.e. dynamic allocation and static buf) with qemu-system by simply defining USE_STATIC_CODE_GEN_BUFFER. After applying a modified version of this patch (that I'll send in a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first "translate-all: introduces tb_flush_safe"), I can easily trigger this error when setting a low enough TB size, e.g. 
-tb-size 32: CPU1: online and setting up with pattern 0xa0b78cbf CPU2: online and setting up with pattern 0x22287c45 CPU3: online and setting up with pattern 0x6262c5c5 CPU0: online and setting up with pattern 0xa65e7ad6 qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126 qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124 qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125 qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124 qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125 PASS: smc: irq: 0 errors, IRQs not checked Unhandled exception 3 (pabt) Exception frame registers: pc : [<e59f2028>] lr : [<40010700>] psr: a0000153 sp : 400ac5c0 ip : 400ab4e8 fp : 40032ca8 r10: 00000000 r9 : 00000000 r8 : 00000000 r7 : 00000000 r6 : 00000000 r5 : 00000000 r4 : 00000000 r3 : 00000000 r2 : 00000000 r1 : e59f2028 r0 : 00000000 Flags: NzCv IRQs on FIQs off Mode SVC_32 Control: 00c5107d Table: 40060000 DAC: 00000000 IFAR: e59f2028 IFSR: 00000205 Any input on where to look would be appreciated. Thanks, Emilio
Emilio G. Cota <cota@braap.org> writes: > On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote: >> Emilio G. Cota <cota@braap.org> writes: > (snip) >> > Known issues: >> > - Basically compile-tested only, since I've only run this with >> > single-threaded TCG; I also tried running it with linux-user, >> > but in order to trigger tb_flush I had to make code_gen_buffer >> > so small that the CPU calling tb_flush would immediately fill >> > the 2nd buffer, triggering the assert. If you have a working >> > multi-threaded workload that would be good to test this, please >> > let me know. >> >> With my latest mttcg unit tests: >> >> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \ >> -device virtio-serial-device -device virtconsole,chardev=ctd \ >> -chardev testdev,id=ctd -display none -serial stdio \ >> -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \ >> -append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on > > This is useful. Never mind the need for testing linux-user, I can test > both code paths (i.e. dynamic allocation and static buf) with qemu-system > by simply defining USE_STATIC_CODE_GEN_BUFFER. > > After applying a modified version of this patch (that I'll send in > a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first > "translate-all: introduces tb_flush_safe"), I can easily trigger > this error when setting a low enough TB size, e.g. 
-tb-size 32: > > CPU1: online and setting up with pattern 0xa0b78cbf > CPU2: online and setting up with pattern 0x22287c45 > CPU3: online and setting up with pattern 0x6262c5c5 > CPU0: online and setting up with pattern 0xa65e7ad6 > qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126 > qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124 > qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125 > qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124 > qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125 > PASS: smc: irq: 0 errors, IRQs not checked > Unhandled exception 3 (pabt) > Exception frame registers: > pc : [<e59f2028>] lr : [<40010700>] psr: a0000153 > sp : 400ac5c0 ip : 400ab4e8 fp : 40032ca8 > r10: 00000000 r9 : 00000000 r8 : 00000000 > r7 : 00000000 r6 : 00000000 r5 : 00000000 r4 : 00000000 > r3 : 00000000 r2 : 00000000 r1 : e59f2028 r0 : 00000000 > Flags: NzCv IRQs on FIQs off Mode SVC_32 > Control: 00c5107d Table: 40060000 DAC: 00000000 > IFAR: e59f2028 IFSR: 00000205 > > Any input on where to look would be appreciated. Thanks, I'll have a look and see if I can replicate. > > Emilio -- Alex Bennée
diff --git a/translate-all.c b/translate-all.c index bba9b62..4c14b4d 100644 --- a/translate-all.c +++ b/translate-all.c @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1) #endif #ifdef USE_STATIC_CODE_GEN_BUFFER -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE] +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE] __attribute__((aligned(CODE_GEN_ALIGN))); +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE] + __attribute__((aligned(CODE_GEN_ALIGN))); +static int static_buf_mask = 1; +static void *static_buf1; +static void *static_buf2; # ifdef _WIN32 static inline void do_protect(void *addr, long size, int prot) @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size) } # endif /* WIN32 */ -static inline void *alloc_code_gen_buffer(void) +static void *alloc_static_code_gen_buffer(void *buf) { - void *buf = static_code_gen_buffer; size_t full_size, size; /* The size of the buffer, rounded down to end on a page boundary. */ - full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer)) + full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1)) & qemu_real_host_page_mask) - (uintptr_t)buf; /* Reserve a guard page. 
*/ @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void) return buf; } + +static inline void *alloc_code_gen_buffer(void) +{ + static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1); + static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2); + + assert(static_buf_mask == 1); + return static_buf1; +} #elif defined(_WIN32) static inline void *alloc_code_gen_buffer(void) { @@ -829,8 +842,100 @@ static void page_flush_tb(void) } } +#ifdef USE_STATIC_CODE_GEN_BUFFER + +struct code_gen_desc { + struct rcu_head rcu; + int clear_bit; +}; + +static void code_gen_buffer_clear(struct rcu_head *rcu) +{ + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); + + tb_lock(); + static_buf_mask &= ~desc->clear_bit; + tb_unlock(); + g_free(desc); +} + +static void *code_gen_buffer_replace(void) +{ + struct code_gen_desc *desc = g_malloc0(sizeof(*desc)); + + /* + * If both bits are set, we're having two concurrent flushes. This + * can easily happen if the buffers are heavily undersized. 
+ */ + assert(static_buf_mask == 1 || static_buf_mask == 2); + + desc->clear_bit = static_buf_mask; + call_rcu1(&desc->rcu, code_gen_buffer_clear); + + if (static_buf_mask == 1) { + static_buf_mask |= 2; + return static_buf2; + } + static_buf_mask |= 1; + return static_buf1; +} + +#elif defined(_WIN32) + +struct code_gen_desc { + struct rcu_head rcu; + void *buf; +}; + +static void code_gen_buffer_vfree(struct rcu_head *rcu) +{ + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); + + VirtualFree(desc->buf, 0, MEM_RELEASE); + g_free(desc); +} + +static void *code_gen_buffer_replace(void) +{ + struct code_gen_desc *desc; + + desc = g_malloc0(sizeof(*desc)); + desc->buf = tcg_ctx.code_gen_buffer; + call_rcu1(&desc->rcu, code_gen_buffer_vfree); + + return alloc_code_gen_buffer(); +} + +#else /* UNIX, dynamically-allocated code buffer */ + +struct code_gen_desc { + struct rcu_head rcu; + void *buf; + size_t size; +}; + +static void code_gen_buffer_unmap(struct rcu_head *rcu) +{ + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); + + munmap(desc->buf, desc->size + qemu_real_host_page_size); + g_free(desc); +} + +static void *code_gen_buffer_replace(void) +{ + struct code_gen_desc *desc; + + desc = g_malloc0(sizeof(*desc)); + desc->buf = tcg_ctx.code_gen_buffer; + desc->size = tcg_ctx.code_gen_buffer_size; + call_rcu1(&desc->rcu, code_gen_buffer_unmap); + + return alloc_code_gen_buffer(); +} +#endif /* USE_STATIC_CODE_GEN_BUFFER */ + /* flush all the translation blocks */ -/* XXX: tb_flush is currently not thread safe */ void tb_flush(CPUState *cpu) { #if defined(DEBUG_FLUSH) @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu) qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE); page_flush_tb(); + tcg_ctx.code_gen_buffer = code_gen_buffer_replace(); tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; + tcg_prologue_init(&tcg_ctx); /* XXX: flush processor icache at this point if cache flush is expensive */ 
tcg_ctx.tb_ctx.tb_flush_count++; + + /* exit all CPUs so that the old buffer is quickly cleared. */ + CPU_FOREACH(cpu) { + cpu_exit(cpu); + } } #ifdef DEBUG_TB_CHECK
This is a first attempt at making tb_flush not have to stop all CPUs.
There are issues as pointed out below, but this could be a good start.

Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

Known issues:
- Basically compile-tested only, since I've only run this with
  single-threaded TCG; I also tried running it with linux-user,
  but in order to trigger tb_flush I had to make code_gen_buffer
  so small that the CPU calling tb_flush would immediately fill
  the 2nd buffer, triggering the assert. If you have a working
  multi-threaded workload that would be good to test this, please
  let me know.
- Windows; not even compile-tested!

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 5 deletions(-)