diff mbox

[RFC] translate-all: protect code_gen_buffer with RCU

Message ID 1461283583-2833-1-git-send-email-cota@braap.org (mailing list archive)
State New, archived
Headers show

Commit Message

Emilio Cota April 22, 2016, 12:06 a.m. UTC
This is a first attempt at making tb_flush not have to stop all CPUs.
There are issues as pointed out below, but this could be a good start.

Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html

Known issues:
- Basically compile-tested only, since I've only run this with
  single-threaded TCG; I also tried running it with linux-user,
  but in order to trigger tb_flush I had to make code_gen_buffer
  so small that the CPU calling tb_flush would immediately fill
  the 2nd buffer, triggering the assert. If you have a working
  multi-threaded workload that would be a good test for this, please
  let me know.
- Windows; not even compile-tested!

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 5 deletions(-)

Comments

Alex Bennée April 22, 2016, 2:41 p.m. UTC | #1
Emilio G. Cota <cota@braap.org> writes:

> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> Known issues:
> - Basically compile-tested only, since I've only run this with
>   single-threaded TCG; I also tried running it with linux-user,
>   but in order to trigger tb_flush I had to make code_gen_buffer
>   so small that the CPU calling tb_flush would immediately fill
>   the 2nd buffer, triggering the assert. If you have a working
>   multi-threaded workload that would be good to test this, please
>   let me know.

With my latest mttcg unit tests:

./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
  -device virtio-serial-device -device virtconsole,chardev=ctd \
  -chardev testdev,id=ctd -display none -serial stdio \
  -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
  -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on


> - Windows; not even compile-tested!
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
>  translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 117 insertions(+), 5 deletions(-)
>
> diff --git a/translate-all.c b/translate-all.c
> index bba9b62..4c14b4d 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
>  #endif
>
>  #ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>      __attribute__((aligned(CODE_GEN_ALIGN)));
> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +    __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;
>
>  # ifdef _WIN32
>  static inline void do_protect(void *addr, long size, int prot)
> @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
>  }
>  # endif /* WIN32 */
>
> -static inline void *alloc_code_gen_buffer(void)
> +static void *alloc_static_code_gen_buffer(void *buf)
>  {
> -    void *buf = static_code_gen_buffer;
>      size_t full_size, size;
>
>      /* The size of the buffer, rounded down to end on a page boundary.  */
> -    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> +    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
>                   & qemu_real_host_page_mask) - (uintptr_t)buf;
>
>      /* Reserve a guard page.  */
> @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
>
>      return buf;
>  }
> +
> +static inline void *alloc_code_gen_buffer(void)
> +{
> +    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
> +    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
> +
> +    assert(static_buf_mask == 1);
> +    return static_buf1;
> +}
>  #elif defined(_WIN32)
>  static inline void *alloc_code_gen_buffer(void)
>  {
> @@ -829,8 +842,100 @@ static void page_flush_tb(void)
>      }
>  }
>
> +#ifdef USE_STATIC_CODE_GEN_BUFFER
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    int clear_bit;
> +};
> +
> +static void code_gen_buffer_clear(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    tb_lock();
> +    static_buf_mask &= ~desc->clear_bit;
> +    tb_unlock();
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
> +
> +    /*
> +     * If both bits are set, we're having two concurrent flushes. This
> +     * can easily happen if the buffers are heavily undersized.
> +     */
> +    assert(static_buf_mask == 1 || static_buf_mask == 2);
> +
> +    desc->clear_bit = static_buf_mask;
> +    call_rcu1(&desc->rcu, code_gen_buffer_clear);
> +
> +    if (static_buf_mask == 1) {
> +        static_buf_mask |= 2;
> +        return static_buf2;
> +    }
> +    static_buf_mask |= 1;
> +    return static_buf1;
> +}
> +
> +#elif defined(_WIN32)
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    void *buf;
> +};
> +
> +static void code_gen_buffer_vfree(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    VirtualFree(desc->buf, 0, MEM_RELEASE);
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc;
> +
> +    desc = g_malloc0(sizeof(*desc));
> +    desc->buf = tcg_ctx.code_gen_buffer;
> +    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
> +
> +    return alloc_code_gen_buffer();
> +}
> +
> +#else /* UNIX, dynamically-allocated code buffer */
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    void *buf;
> +    size_t size;
> +};
> +
> +static void code_gen_buffer_unmap(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    munmap(desc->buf, desc->size + qemu_real_host_page_size);
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc;
> +
> +    desc = g_malloc0(sizeof(*desc));
> +    desc->buf = tcg_ctx.code_gen_buffer;
> +    desc->size = tcg_ctx.code_gen_buffer_size;
> +    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
> +
> +    return alloc_code_gen_buffer();
> +}
> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
> +
>  /* flush all the translation blocks */
> -/* XXX: tb_flush is currently not thread safe */
>  void tb_flush(CPUState *cpu)
>  {
>  #if defined(DEBUG_FLUSH)
> @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
>      qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
>      page_flush_tb();
>
> +    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
>      tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
> +    tcg_prologue_init(&tcg_ctx);
>      /* XXX: flush processor icache at this point if cache flush is
>         expensive */
>      tcg_ctx.tb_ctx.tb_flush_count++;
> +
> +    /* exit all CPUs so that the old buffer is quickly cleared. */
> +    CPU_FOREACH(cpu) {
> +        cpu_exit(cpu);
> +    }
>  }
>
>  #ifdef DEBUG_TB_CHECK


--
Alex Bennée
Alex Bennée April 22, 2016, 2:47 p.m. UTC | #2
Alex Bennée <alex.bennee@linaro.org> writes:

> Emilio G. Cota <cota@braap.org> writes:
>
>> This is a first attempt at making tb_flush not have to stop all CPUs.
>> There are issues as pointed out below, but this could be a good start.
>>
>> Context:
>>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>>
>> Known issues:
>> - Basically compile-tested only, since I've only run this with
>>   single-threaded TCG; I also tried running it with linux-user,
>>   but in order to trigger tb_flush I had to make code_gen_buffer
>>   so small that the CPU calling tb_flush would immediately fill
>>   the 2nd buffer, triggering the assert. If you have a working
>>   multi-threaded workload that would be good to test this, please
>>   let me know.
>
> With my latest mttcg unit tests:
>
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>   -chardev testdev,id=ctd -display none -serial stdio \
>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>   -append "tight smc irq mod=1 rounds=100000"  -name
>   arm,debug-threads=on

Ahh, I just realised you wanted a linux-user workload.

>
>
>> - Windows; not even compile-tested!
>>
>> Signed-off-by: Emilio G. Cota <cota@braap.org>
>> ---
>>  translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 117 insertions(+), 5 deletions(-)
>>
>> diff --git a/translate-all.c b/translate-all.c
>> index bba9b62..4c14b4d 100644
>> --- a/translate-all.c
>> +++ b/translate-all.c
>> @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
>>  #endif
>>
>>  #ifdef USE_STATIC_CODE_GEN_BUFFER
>> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
>> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>>      __attribute__((aligned(CODE_GEN_ALIGN)));
>> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
>> +    __attribute__((aligned(CODE_GEN_ALIGN)));
>> +static int static_buf_mask = 1;
>> +static void *static_buf1;
>> +static void *static_buf2;
>>
>>  # ifdef _WIN32
>>  static inline void do_protect(void *addr, long size, int prot)
>> @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
>>  }
>>  # endif /* WIN32 */
>>
>> -static inline void *alloc_code_gen_buffer(void)
>> +static void *alloc_static_code_gen_buffer(void *buf)
>>  {
>> -    void *buf = static_code_gen_buffer;
>>      size_t full_size, size;
>>
>>      /* The size of the buffer, rounded down to end on a page boundary.  */
>> -    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
>> +    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
>>                   & qemu_real_host_page_mask) - (uintptr_t)buf;
>>
>>      /* Reserve a guard page.  */
>> @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
>>
>>      return buf;
>>  }
>> +
>> +static inline void *alloc_code_gen_buffer(void)
>> +{
>> +    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
>> +    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
>> +
>> +    assert(static_buf_mask == 1);
>> +    return static_buf1;
>> +}
>>  #elif defined(_WIN32)
>>  static inline void *alloc_code_gen_buffer(void)
>>  {
>> @@ -829,8 +842,100 @@ static void page_flush_tb(void)
>>      }
>>  }
>>
>> +#ifdef USE_STATIC_CODE_GEN_BUFFER
>> +
>> +struct code_gen_desc {
>> +    struct rcu_head rcu;
>> +    int clear_bit;
>> +};
>> +
>> +static void code_gen_buffer_clear(struct rcu_head *rcu)
>> +{
>> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> +    tb_lock();
>> +    static_buf_mask &= ~desc->clear_bit;
>> +    tb_unlock();
>> +    g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> +    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
>> +
>> +    /*
>> +     * If both bits are set, we're having two concurrent flushes. This
>> +     * can easily happen if the buffers are heavily undersized.
>> +     */
>> +    assert(static_buf_mask == 1 || static_buf_mask == 2);
>> +
>> +    desc->clear_bit = static_buf_mask;
>> +    call_rcu1(&desc->rcu, code_gen_buffer_clear);
>> +
>> +    if (static_buf_mask == 1) {
>> +        static_buf_mask |= 2;
>> +        return static_buf2;
>> +    }
>> +    static_buf_mask |= 1;
>> +    return static_buf1;
>> +}
>> +
>> +#elif defined(_WIN32)
>> +
>> +struct code_gen_desc {
>> +    struct rcu_head rcu;
>> +    void *buf;
>> +};
>> +
>> +static void code_gen_buffer_vfree(struct rcu_head *rcu)
>> +{
>> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> +    VirtualFree(desc->buf, 0, MEM_RELEASE);
>> +    g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> +    struct code_gen_desc *desc;
>> +
>> +    desc = g_malloc0(sizeof(*desc));
>> +    desc->buf = tcg_ctx.code_gen_buffer;
>> +    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
>> +
>> +    return alloc_code_gen_buffer();
>> +}
>> +
>> +#else /* UNIX, dynamically-allocated code buffer */
>> +
>> +struct code_gen_desc {
>> +    struct rcu_head rcu;
>> +    void *buf;
>> +    size_t size;
>> +};
>> +
>> +static void code_gen_buffer_unmap(struct rcu_head *rcu)
>> +{
>> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> +    munmap(desc->buf, desc->size + qemu_real_host_page_size);
>> +    g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> +    struct code_gen_desc *desc;
>> +
>> +    desc = g_malloc0(sizeof(*desc));
>> +    desc->buf = tcg_ctx.code_gen_buffer;
>> +    desc->size = tcg_ctx.code_gen_buffer_size;
>> +    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
>> +
>> +    return alloc_code_gen_buffer();
>> +}
>> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
>> +
>>  /* flush all the translation blocks */
>> -/* XXX: tb_flush is currently not thread safe */
>>  void tb_flush(CPUState *cpu)
>>  {
>>  #if defined(DEBUG_FLUSH)
>> @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
>>      qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
>>      page_flush_tb();
>>
>> +    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
>>      tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
>> +    tcg_prologue_init(&tcg_ctx);
>>      /* XXX: flush processor icache at this point if cache flush is
>>         expensive */
>>      tcg_ctx.tb_ctx.tb_flush_count++;
>> +
>> +    /* exit all CPUs so that the old buffer is quickly cleared. */
>> +    CPU_FOREACH(cpu) {
>> +        cpu_exit(cpu);
>> +    }
>>  }
>>
>>  #ifdef DEBUG_TB_CHECK


--
Alex Bennée
Richard Henderson April 22, 2016, 6:25 p.m. UTC | #3
On 04/21/2016 05:06 PM, Emilio G. Cota wrote:
>  #ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>      __attribute__((aligned(CODE_GEN_ALIGN)));
> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +    __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;

I don't like this at all.

(1) This is (by default) 32MB we're adding to the RSS of the
    simulator.  Surely we can do better than this.

(2) On some hosts we require a maximum displacement between
    any point in the code gen buffer and the tcg prologue.
    That means you can't simply allocate two separate buffers.

    You have to take a single buffer, of known good size and
    alignment, and split it in half.


r~
Emilio Cota April 24, 2016, 3:20 a.m. UTC | #4
On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
(snip)
> > Known issues:
> > - Basically compile-tested only, since I've only run this with
> >   single-threaded TCG; I also tried running it with linux-user,
> >   but in order to trigger tb_flush I had to make code_gen_buffer
> >   so small that the CPU calling tb_flush would immediately fill
> >   the 2nd buffer, triggering the assert. If you have a working
> >   multi-threaded workload that would be good to test this, please
> >   let me know.
> 
> With my latest mttcg unit tests:
> 
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>   -chardev testdev,id=ctd -display none -serial stdio \
>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>   -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on

This is useful. Never mind the need for testing linux-user; I can test
both code paths (i.e. dynamic allocation and static buf) with qemu-system
by simply defining USE_STATIC_CODE_GEN_BUFFER.

After applying a modified version of this patch (that I'll send in
a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first
"translate-all: introduces tb_flush_safe"), I can easily trigger
this error when setting a low enough TB size, e.g. -tb-size 32:

 CPU1: online and setting up with pattern 0xa0b78cbf
 CPU2: online and setting up with pattern 0x22287c45
 CPU3: online and setting up with pattern 0x6262c5c5
 CPU0: online and setting up with pattern 0xa65e7ad6
 qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126
 qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124
 qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125
 qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124
 qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125
 PASS: smc: irq: 0 errors, IRQs not checked
 Unhandled exception 3 (pabt)
 Exception frame registers:
 pc : [<e59f2028>]    lr : [<40010700>]    psr: a0000153
 sp : 400ac5c0  ip : 400ab4e8  fp : 40032ca8
 r10: 00000000  r9 : 00000000  r8 : 00000000
 r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
 r3 : 00000000  r2 : 00000000  r1 : e59f2028  r0 : 00000000
 Flags: NzCv  IRQs on  FIQs off  Mode SVC_32
 Control: 00c5107d  Table: 40060000  DAC: 00000000
 IFAR: e59f2028    IFSR: 00000205

Any input on where to look would be appreciated. Thanks,

		Emilio
Alex Bennée April 25, 2016, 8:35 a.m. UTC | #5
Emilio G. Cota <cota@braap.org> writes:

> On Fri, Apr 22, 2016 at 15:41:13 +0100, Alex Bennée wrote:
>> Emilio G. Cota <cota@braap.org> writes:
> (snip)
>> > Known issues:
>> > - Basically compile-tested only, since I've only run this with
>> >   single-threaded TCG; I also tried running it with linux-user,
>> >   but in order to trigger tb_flush I had to make code_gen_buffer
>> >   so small that the CPU calling tb_flush would immediately fill
>> >   the 2nd buffer, triggering the assert. If you have a working
>> >   multi-threaded workload that would be good to test this, please
>> >   let me know.
>>
>> With my latest mttcg unit tests:
>>
>> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>>   -chardev testdev,id=ctd -display none -serial stdio \
>>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>>   -append "tight smc irq mod=1 rounds=100000"  -name arm,debug-threads=on
>
> This is useful. Never mind the need for testing linux-user, I can test
> both code paths (i.e. dynamic allocation and static buf) with qemu-system
> by simply defining USE_STATIC_CODE_GEN_BUFFER.
>
> After applying a modified version of this patch (that I'll send in
> a jiffy) to your enable-mttcg-for-armv7-v1 branch (reverting first
> "translate-all: introduces tb_flush_safe"), I can easily trigger
> this error when setting a low enough TB size, e.g. -tb-size 32:
>
>  CPU1: online and setting up with pattern 0xa0b78cbf
>  CPU2: online and setting up with pattern 0x22287c45
>  CPU3: online and setting up with pattern 0x6262c5c5
>  CPU0: online and setting up with pattern 0xa65e7ad6
>  qemu: flush code_size=10622184 nb_tbs=83886 avg_tb_size=126
>  qemu: flush code_size=10469016 nb_tbs=83886 avg_tb_size=124
>  qemu: flush code_size=10492920 nb_tbs=83886 avg_tb_size=125
>  qemu: flush code_size=10477464 nb_tbs=83886 avg_tb_size=124
>  qemu: flush code_size=10495800 nb_tbs=83886 avg_tb_size=125
>  PASS: smc: irq: 0 errors, IRQs not checked
>  Unhandled exception 3 (pabt)
>  Exception frame registers:
>  pc : [<e59f2028>]    lr : [<40010700>]    psr: a0000153
>  sp : 400ac5c0  ip : 400ab4e8  fp : 40032ca8
>  r10: 00000000  r9 : 00000000  r8 : 00000000
>  r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
>  r3 : 00000000  r2 : 00000000  r1 : e59f2028  r0 : 00000000
>  Flags: NzCv  IRQs on  FIQs off  Mode SVC_32
>  Control: 00c5107d  Table: 40060000  DAC: 00000000
>  IFAR: e59f2028    IFSR: 00000205
>
> Any input on where to look would be appreciated. Thanks,

I'll have a look and see if I can replicate.

>
> 		Emilio


--
Alex Bennée
diff mbox

Patch

diff --git a/translate-all.c b/translate-all.c
index bba9b62..4c14b4d 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -536,8 +536,13 @@  static inline void *split_cross_256mb(void *buf1, size_t size1)
 #endif
 
 #ifdef USE_STATIC_CODE_GEN_BUFFER
-static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
+static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
     __attribute__((aligned(CODE_GEN_ALIGN)));
+static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
+    __attribute__((aligned(CODE_GEN_ALIGN)));
+static int static_buf_mask = 1;
+static void *static_buf1;
+static void *static_buf2;
 
 # ifdef _WIN32
 static inline void do_protect(void *addr, long size, int prot)
@@ -580,13 +585,12 @@  static inline void map_none(void *addr, long size)
 }
 # endif /* WIN32 */
 
-static inline void *alloc_code_gen_buffer(void)
+static void *alloc_static_code_gen_buffer(void *buf)
 {
-    void *buf = static_code_gen_buffer;
     size_t full_size, size;
 
     /* The size of the buffer, rounded down to end on a page boundary.  */
-    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
                  & qemu_real_host_page_mask) - (uintptr_t)buf;
 
     /* Reserve a guard page.  */
@@ -612,6 +616,15 @@  static inline void *alloc_code_gen_buffer(void)
 
     return buf;
 }
+
+static inline void *alloc_code_gen_buffer(void)
+{
+    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
+    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
+
+    assert(static_buf_mask == 1);
+    return static_buf1;
+}
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
 {
@@ -829,8 +842,100 @@  static void page_flush_tb(void)
     }
 }
 
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void code_gen_buffer_clear(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    static_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(static_buf_mask == 1 || static_buf_mask == 2);
+
+    desc->clear_bit = static_buf_mask;
+    call_rcu1(&desc->rcu, code_gen_buffer_clear);
+
+    if (static_buf_mask == 1) {
+        static_buf_mask |= 2;
+        return static_buf2;
+    }
+    static_buf_mask |= 1;
+    return static_buf1;
+}
+
+#elif defined(_WIN32)
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+};
+
+static void code_gen_buffer_vfree(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    VirtualFree(desc->buf, 0, MEM_RELEASE);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
+
+    return alloc_code_gen_buffer();
+}
+
+#else /* UNIX, dynamically-allocated code buffer */
+
+struct code_gen_desc {
+    struct rcu_head rcu;
+    void *buf;
+    size_t size;
+};
+
+static void code_gen_buffer_unmap(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    munmap(desc->buf, desc->size + qemu_real_host_page_size);
+    g_free(desc);
+}
+
+static void *code_gen_buffer_replace(void)
+{
+    struct code_gen_desc *desc;
+
+    desc = g_malloc0(sizeof(*desc));
+    desc->buf = tcg_ctx.code_gen_buffer;
+    desc->size = tcg_ctx.code_gen_buffer_size;
+    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
+
+    return alloc_code_gen_buffer();
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER */
+
 /* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
 #if defined(DEBUG_FLUSH)
@@ -853,10 +958,17 @@  void tb_flush(CPUState *cpu)
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
+    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
     tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    tcg_prologue_init(&tcg_ctx);
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared. */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK