
[RFC,v3,19/19] cpu-exec: remove tb_lock from the hot-path

Message ID 1464986428-6739-20-git-send-email-alex.bennee@linaro.org (mailing list archive)
State New, archived

Commit Message

Alex Bennée June 3, 2016, 8:40 p.m. UTC
Lock contention in the hot path of moving between existing patched
TranslationBlocks is the main drag on MTTCG performance. This patch
pushes the tb_lock() usage down to the two places that really need it:

  - code generation (tb_gen_code)
  - jump patching (tb_add_jump)

The rest of the code doesn't really need to hold a lock as it is either
using per-CPU structures or designed to be used in concurrent read
situations (qht_lookup).
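
The slow path thus follows a double-checked pattern: a lock-free qht
lookup first, and only on a miss do we take mmap_lock/tb_lock, look up
again under the lock, and translate. In sketch form (simplified from
the hunks below):

    tb = tb_find_physical(cpu, pc, cs_base, flags);   /* lock-free lookup */
    if (!tb) {
        mmap_lock();                                   /* needed by tb_gen_code */
        tb_lock();
        /* re-check: another vCPU may have translated it meanwhile */
        tb = tb_find_physical(cpu, pc, cs_base, flags);
        if (!tb) {
            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
        }
        tb_unlock();
        mmap_unlock();
    }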

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
v3
  - fix merge conflicts with Sergey's patch
---
 cpu-exec.c | 59 ++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 30 insertions(+), 29 deletions(-)

Comments

Sergey Fedorov June 29, 2016, 2:35 p.m. UTC | #1
On 03/06/16 23:40, Alex Bennée wrote:
> Lock contention in the hot path of moving between existing patched
> TranslationBlocks is the main drag on MTTCG performance. This patch
> pushes the tb_lock() usage down to the two places that really need it:
>
>   - code generation (tb_gen_code)
>   - jump patching (tb_add_jump)
>
> The rest of the code doesn't really need to hold a lock as it is either
> using per-CPU structures or designed to be used in concurrent read
> situations (qht_lookup).
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>
> ---
> v3
>   - fix merge conflicts with Sergey's patch
> ---
>  cpu-exec.c | 59 ++++++++++++++++++++++++++++++-----------------------------
>  1 file changed, 30 insertions(+), 29 deletions(-)
>
> diff --git a/cpu-exec.c b/cpu-exec.c
> index b017643..4af0b52 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -298,41 +298,38 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>      smp_rmb();
>      tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        goto found;
> -    }
> +    if (!tb) {
>  
> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> -     * taken outside tb_lock.  Since we're momentarily dropping
> -     * tb_lock, there's a chance that our desired tb has been
> -     * translated.
> -     */
> -    tb_unlock();
> -    mmap_lock();
> -    tb_lock();
> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        mmap_unlock();
> -        goto found;
> -    }
> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> +         * taken outside tb_lock.
> +         */
> +        mmap_lock();
> +        tb_lock();
>  
> -    /* if no translated code available, then translate it now */
> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        /* There's a chance that our desired tb has been translated while
> +         * taking the locks so we check again inside the lock.
> +         */
> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
> +        if (!tb) {
> +            /* if no translated code available, then translate it now */
> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        }
>  
> -    mmap_unlock();
> +        tb_unlock();
> +        mmap_unlock();
> +    }
>  
> -found:
> -    /* we add the TB in the virtual pc hash table */
> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>      cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;

Hmm, seems like I forgot to convert this into atomic_set() in the
previous patch...
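
Something along these lines, I suppose (just a sketch, mirroring the
atomic_read() already used on the lookup side):

    /* publish the new TB so lock-free readers of tb_jmp_cache
     * see a consistent pointer */
    atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);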

>      return tb;
>  }
>  
>  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
> -                                             TranslationBlock **last_tb,
> +                                             TranslationBlock **ltbp,

I'm not sure if it is more readable...

>                                               int tb_exit)
>  {
>      CPUArchState *env = (CPUArchState *)cpu->env_ptr;
> -    TranslationBlock *tb;
> +    TranslationBlock *tb, *last_tb;
>      target_ulong cs_base, pc;
>      uint32_t flags;
>  
> @@ -340,7 +337,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>         always be the same before a given translated block
>         is executed. */
>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> -    tb_lock();
>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>                   tb->flags != flags)) {
> @@ -350,7 +346,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>          /* Ensure that no TB jump will be modified as the
>           * translation buffer has been flushed.
>           */
> -        *last_tb = NULL;
> +        *ltbp = NULL;
>          cpu->tb_flushed = false;
>      }
>  #ifndef CONFIG_USER_ONLY
> @@ -359,14 +355,19 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>       * spanning two pages because the mapping for the second page can change.
>       */
>      if (tb->page_addr[1] != -1) {
> -        *last_tb = NULL;
> +        *ltbp = NULL;
>      }
>  #endif
> +
>      /* See if we can patch the calling TB. */
> -    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
> -        tb_add_jump(*last_tb, tb_exit, tb);
> +    last_tb = *ltbp;
> +    if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN) &&
> +        last_tb &&
> +        !last_tb->jmp_list_next[tb_exit]) {

If we're going to check this outside of tb_lock we have to do this with
atomic_{read,set}(). However, I think it is rare case to race on
tb_add_jump() so probably it is okay to do the check under tb_lock.
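
For the lock-free variant I mean something like this (a sketch only):

    /* read the jump slot with atomic_read() so the unlocked check
     * doesn't race with the store done when the jump is patched */
    if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN) &&
        last_tb &&
        !atomic_read(&last_tb->jmp_list_next[tb_exit])) {
        tb_lock();
        tb_add_jump(last_tb, tb_exit, tb);
        tb_unlock();
    }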

> +        tb_lock();
> +        tb_add_jump(last_tb, tb_exit, tb);
> +        tb_unlock();
>      }
> -    tb_unlock();
>      return tb;
>  }
>  

Kind regards,
Sergey
Alex Bennée June 29, 2016, 2:47 p.m. UTC | #2
Sergey Fedorov <serge.fdrv@gmail.com> writes:

> On 03/06/16 23:40, Alex Bennée wrote:
>> Lock contention in the hot path of moving between existing patched
>> TranslationBlocks is the main drag on MTTCG performance. This patch
>> pushes the tb_lock() usage down to the two places that really need it:
>>
>>   - code generation (tb_gen_code)
>>   - jump patching (tb_add_jump)
>>
>> The rest of the code doesn't really need to hold a lock as it is either
>> using per-CPU structures or designed to be used in concurrent read
>> situations (qht_lookup).
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>
>> ---
>> v3
>>   - fix merge conflicts with Sergey's patch
>> ---
>>  cpu-exec.c | 59 ++++++++++++++++++++++++++++++-----------------------------
>>  1 file changed, 30 insertions(+), 29 deletions(-)
>>
>> diff --git a/cpu-exec.c b/cpu-exec.c
>> index b017643..4af0b52 100644
>> --- a/cpu-exec.c
>> +++ b/cpu-exec.c
>> @@ -298,41 +298,38 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>>      smp_rmb();
>>      tb = tb_find_physical(cpu, pc, cs_base, flags);
>> -    if (tb) {
>> -        goto found;
>> -    }
>> +    if (!tb) {
>>
>> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>> -     * taken outside tb_lock.  Since we're momentarily dropping
>> -     * tb_lock, there's a chance that our desired tb has been
>> -     * translated.
>> -     */
>> -    tb_unlock();
>> -    mmap_lock();
>> -    tb_lock();
>> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
>> -    if (tb) {
>> -        mmap_unlock();
>> -        goto found;
>> -    }
>> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>> +         * taken outside tb_lock.
>> +         */
>> +        mmap_lock();
>> +        tb_lock();
>>
>> -    /* if no translated code available, then translate it now */
>> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>> +        /* There's a chance that our desired tb has been translated while
>> +         * taking the locks so we check again inside the lock.
>> +         */
>> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
>> +        if (!tb) {
>> +            /* if no translated code available, then translate it now */
>> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>> +        }
>>
>> -    mmap_unlock();
>> +        tb_unlock();
>> +        mmap_unlock();
>> +    }
>>
>> -found:
>> -    /* we add the TB in the virtual pc hash table */
>> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>>      cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;
>
> Hmm, seems like I forgot to convert this into atomic_set() in the
> previous patch...

OK, can you fix that in your quick fixes series?

>
>>      return tb;
>>  }
>>
>>  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>> -                                             TranslationBlock **last_tb,
>> +                                             TranslationBlock **ltbp,
>
> I'm not sure if it is more readable...

I'll revert. I was trying to keep line lengths short :-/

>
>>                                               int tb_exit)
>>  {
>>      CPUArchState *env = (CPUArchState *)cpu->env_ptr;
>> -    TranslationBlock *tb;
>> +    TranslationBlock *tb, *last_tb;
>>      target_ulong cs_base, pc;
>>      uint32_t flags;
>>
>> @@ -340,7 +337,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>         always be the same before a given translated block
>>         is executed. */
>>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>> -    tb_lock();
>>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>>                   tb->flags != flags)) {
>> @@ -350,7 +346,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>          /* Ensure that no TB jump will be modified as the
>>           * translation buffer has been flushed.
>>           */
>> -        *last_tb = NULL;
>> +        *ltbp = NULL;
>>          cpu->tb_flushed = false;
>>      }
>>  #ifndef CONFIG_USER_ONLY
>> @@ -359,14 +355,19 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>       * spanning two pages because the mapping for the second page can change.
>>       */
>>      if (tb->page_addr[1] != -1) {
>> -        *last_tb = NULL;
>> +        *ltbp = NULL;
>>      }
>>  #endif
>> +
>>      /* See if we can patch the calling TB. */
>> -    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>> -        tb_add_jump(*last_tb, tb_exit, tb);
>> +    last_tb = *ltbp;
>> +    if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN) &&
>> +        last_tb &&
>> +        !last_tb->jmp_list_next[tb_exit]) {
>
> If we're going to check this outside of tb_lock we have to do this with
> atomic_{read,set}(). However, I think it is rare case to race on
> tb_add_jump() so probably it is okay to do the check under tb_lock.

It's checking for NULL, and it gets re-checked in tb_add_jump while under
the lock, so I think the race on setting it should be OK.
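
Roughly what I have in mind, assuming tb_add_jump bails out early when
the slot is already set (a sketch of the recheck-under-lock idea, not
the actual helper):

    static inline void tb_add_jump(TranslationBlock *tb, int n,
                                   TranslationBlock *tb_next)
    {
        if (tb->jmp_list_next[n]) {
            /* someone chained it while we raced for tb_lock: nothing to do */
            return;
        }
        /* ... otherwise patch the jump and link into the jmp list ... */
    }

So the worst case is a harmless extra lock/unlock cycle.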

>
>> +        tb_lock();
>> +        tb_add_jump(last_tb, tb_exit, tb);
>> +        tb_unlock();
>>      }
>> -    tb_unlock();
>>      return tb;
>>  }
>>
>
> Kind regards,
> Sergey


--
Alex Bennée
Sergey Fedorov June 29, 2016, 2:52 p.m. UTC | #3
On 29/06/16 17:47, Alex Bennée wrote:
> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>
>> On 03/06/16 23:40, Alex Bennée wrote:
>>> Lock contention in the hot path of moving between existing patched
>>> TranslationBlocks is the main drag on MTTCG performance. This patch
>>> pushes the tb_lock() usage down to the two places that really need it:
>>>
>>>   - code generation (tb_gen_code)
>>>   - jump patching (tb_add_jump)
>>>
>>> The rest of the code doesn't really need to hold a lock as it is either
>>> using per-CPU structures or designed to be used in concurrent read
>>> situations (qht_lookup).
>>>
>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>
>>> ---
>>> v3
>>>   - fix merge conflicts with Sergey's patch
>>> ---
>>>  cpu-exec.c | 59 ++++++++++++++++++++++++++++++-----------------------------
>>>  1 file changed, 30 insertions(+), 29 deletions(-)
>>>
>>> diff --git a/cpu-exec.c b/cpu-exec.c
>>> index b017643..4af0b52 100644
>>> --- a/cpu-exec.c
>>> +++ b/cpu-exec.c
>>> @@ -298,41 +298,38 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>>>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>>>      smp_rmb();
>>>      tb = tb_find_physical(cpu, pc, cs_base, flags);
>>> -    if (tb) {
>>> -        goto found;
>>> -    }
>>> +    if (!tb) {
>>>
>>> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>>> -     * taken outside tb_lock.  Since we're momentarily dropping
>>> -     * tb_lock, there's a chance that our desired tb has been
>>> -     * translated.
>>> -     */
>>> -    tb_unlock();
>>> -    mmap_lock();
>>> -    tb_lock();
>>> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
>>> -    if (tb) {
>>> -        mmap_unlock();
>>> -        goto found;
>>> -    }
>>> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>>> +         * taken outside tb_lock.
>>> +         */
>>> +        mmap_lock();
>>> +        tb_lock();
>>>
>>> -    /* if no translated code available, then translate it now */
>>> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>>> +        /* There's a chance that our desired tb has been translated while
>>> +         * taking the locks so we check again inside the lock.
>>> +         */
>>> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
>>> +        if (!tb) {
>>> +            /* if no translated code available, then translate it now */
>>> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>>> +        }
>>>
>>> -    mmap_unlock();
>>> +        tb_unlock();
>>> +        mmap_unlock();
>>> +    }
>>>
>>> -found:
>>> -    /* we add the TB in the virtual pc hash table */
>>> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>>>      cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;
>> Hmm, seems like I forgot to convert this into atomic_set() in the
>> previous patch...
> OK, can you fix that in your quick fixes series?

Sure. I think that patch and this one are both ready to go into mainline.

>
>>>      return tb;
>>>  }
>>>
>>>  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>> -                                             TranslationBlock **last_tb,
>>> +                                             TranslationBlock **ltbp,
>> I'm not sure if it is more readable...
> I'll revert. I was trying to keep line lengths short :-/
>
>>>                                               int tb_exit)
>>>  {
>>>      CPUArchState *env = (CPUArchState *)cpu->env_ptr;
>>> -    TranslationBlock *tb;
>>> +    TranslationBlock *tb, *last_tb;
>>>      target_ulong cs_base, pc;
>>>      uint32_t flags;
>>>
>>> @@ -340,7 +337,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>         always be the same before a given translated block
>>>         is executed. */
>>>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>> -    tb_lock();
>>>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>>>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>>>                   tb->flags != flags)) {
>>> @@ -350,7 +346,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>          /* Ensure that no TB jump will be modified as the
>>>           * translation buffer has been flushed.
>>>           */
>>> -        *last_tb = NULL;
>>> +        *ltbp = NULL;
>>>          cpu->tb_flushed = false;
>>>      }
>>>  #ifndef CONFIG_USER_ONLY
>>> @@ -359,14 +355,19 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>       * spanning two pages because the mapping for the second page can change.
>>>       */
>>>      if (tb->page_addr[1] != -1) {
>>> -        *last_tb = NULL;
>>> +        *ltbp = NULL;
>>>      }
>>>  #endif
>>> +
>>>      /* See if we can patch the calling TB. */
>>> -    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>>> -        tb_add_jump(*last_tb, tb_exit, tb);
>>> +    last_tb = *ltbp;
>>> +    if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN) &&
>>> +        last_tb &&
>>> +        !last_tb->jmp_list_next[tb_exit]) {
>> If we're going to check this outside of tb_lock we have to do this with
>> atomic_{read,set}(). However, I think it is rare case to race on
>> tb_add_jump() so probably it is okay to do the check under tb_lock.
> It's checking for NULL, and it gets re-checked in tb_add_jump while under
> the lock, so I think the race on setting it should be OK.

I think we could just skip this check and be fine. What do you think
regarding this?

Thanks,
Sergey

>
>>> +        tb_lock();
>>> +        tb_add_jump(last_tb, tb_exit, tb);
>>> +        tb_unlock();
>>>      }
>>> -    tb_unlock();
>>>      return tb;
>>>  }
>>>
>> Kind regards,
>> Sergey
>
> --
> Alex Bennée
Alex Bennée June 29, 2016, 4:08 p.m. UTC | #4
Sergey Fedorov <serge.fdrv@gmail.com> writes:

> On 29/06/16 17:47, Alex Bennée wrote:
>> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>>
>>> On 03/06/16 23:40, Alex Bennée wrote:
>>>> Lock contention in the hot path of moving between existing patched
>>>> TranslationBlocks is the main drag on MTTCG performance. This patch
>>>> pushes the tb_lock() usage down to the two places that really need it:
>>>>
>>>>   - code generation (tb_gen_code)
>>>>   - jump patching (tb_add_jump)
>>>>
>>>> The rest of the code doesn't really need to hold a lock as it is either
>>>> using per-CPU structures or designed to be used in concurrent read
>>>> situations (qht_lookup).
>>>>
>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>>
>>>> ---
>>>> v3
>>>>   - fix merge conflicts with Sergey's patch
>>>> ---
>>>>  cpu-exec.c | 59 ++++++++++++++++++++++++++++++-----------------------------
>>>>  1 file changed, 30 insertions(+), 29 deletions(-)
>>>>
>>>> diff --git a/cpu-exec.c b/cpu-exec.c
>>>> index b017643..4af0b52 100644
>>>> --- a/cpu-exec.c
>>>> +++ b/cpu-exec.c
>>>> @@ -298,41 +298,38 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>>>>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>>>>      smp_rmb();
>>>>      tb = tb_find_physical(cpu, pc, cs_base, flags);
>>>> -    if (tb) {
>>>> -        goto found;
>>>> -    }
>>>> +    if (!tb) {
>>>>
>>>> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>>>> -     * taken outside tb_lock.  Since we're momentarily dropping
>>>> -     * tb_lock, there's a chance that our desired tb has been
>>>> -     * translated.
>>>> -     */
>>>> -    tb_unlock();
>>>> -    mmap_lock();
>>>> -    tb_lock();
>>>> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
>>>> -    if (tb) {
>>>> -        mmap_unlock();
>>>> -        goto found;
>>>> -    }
>>>> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>>>> +         * taken outside tb_lock.
>>>> +         */
>>>> +        mmap_lock();
>>>> +        tb_lock();
>>>>
>>>> -    /* if no translated code available, then translate it now */
>>>> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>>>> +        /* There's a chance that our desired tb has been translated while
>>>> +         * taking the locks so we check again inside the lock.
>>>> +         */
>>>> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
>>>> +        if (!tb) {
>>>> +            /* if no translated code available, then translate it now */
>>>> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>>>> +        }
>>>>
>>>> -    mmap_unlock();
>>>> +        tb_unlock();
>>>> +        mmap_unlock();
>>>> +    }
>>>>
>>>> -found:
>>>> -    /* we add the TB in the virtual pc hash table */
>>>> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>>>>      cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;
>>> Hmm, seems like I forgot to convert this into atomic_set() in the
>>> previous patch...
>> OK, can you fix that in your quick fixes series?
>
> Sure. I think that patch and this are both ready-to-go into mainline.
>
>>
>>>>      return tb;
>>>>  }
>>>>
>>>>  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>> -                                             TranslationBlock **last_tb,
>>>> +                                             TranslationBlock **ltbp,
>>> I'm not sure if it is more readable...
>> I'll revert. I was trying to keep line lengths short :-/
>>
>>>>                                               int tb_exit)
>>>>  {
>>>>      CPUArchState *env = (CPUArchState *)cpu->env_ptr;
>>>> -    TranslationBlock *tb;
>>>> +    TranslationBlock *tb, *last_tb;
>>>>      target_ulong cs_base, pc;
>>>>      uint32_t flags;
>>>>
>>>> @@ -340,7 +337,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>         always be the same before a given translated block
>>>>         is executed. */
>>>>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>>> -    tb_lock();
>>>>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>>>>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>>>>                   tb->flags != flags)) {
>>>> @@ -350,7 +346,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>          /* Ensure that no TB jump will be modified as the
>>>>           * translation buffer has been flushed.
>>>>           */
>>>> -        *last_tb = NULL;
>>>> +        *ltbp = NULL;
>>>>          cpu->tb_flushed = false;
>>>>      }
>>>>  #ifndef CONFIG_USER_ONLY
>>>> @@ -359,14 +355,19 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>       * spanning two pages because the mapping for the second page can change.
>>>>       */
>>>>      if (tb->page_addr[1] != -1) {
>>>> -        *last_tb = NULL;
>>>> +        *ltbp = NULL;
>>>>      }
>>>>  #endif
>>>> +
>>>>      /* See if we can patch the calling TB. */
>>>> -    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>>>> -        tb_add_jump(*last_tb, tb_exit, tb);
>>>> +    last_tb = *ltbp;
>>>> +    if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN) &&
>>>> +        last_tb &&
>>>> +        !last_tb->jmp_list_next[tb_exit]) {
>>> If we're going to check this outside of tb_lock we have to do this with
>>> atomic_{read,set}(). However, I think it is rare case to race on
>>> tb_add_jump() so probably it is okay to do the check under tb_lock.
>> It's checking for NULL, and it gets re-checked in tb_add_jump while under
>> the lock, so I think the race on setting it should be OK.
>
> I think we could just skip this check and be fine. What do you think
> regarding this?

I think that means we'll take the lock more frequently than we want to.
I'll have to check.

>
> Thanks,
> Sergey
>
>>
>>>> +        tb_lock();
>>>> +        tb_add_jump(last_tb, tb_exit, tb);
>>>> +        tb_unlock();
>>>>      }
>>>> -    tb_unlock();
>>>>      return tb;
>>>>  }
>>>>
>>> Kind regards,
>>> Sergey
>>
>> --
>> Alex Bennée


--
Alex Bennée
Sergey Fedorov June 30, 2016, 9:24 a.m. UTC | #5
On 29/06/16 19:08, Alex Bennée wrote:
> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>
>> On 29/06/16 17:47, Alex Bennée wrote:
>>> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>>>
>>>> On 03/06/16 23:40, Alex Bennée wrote:
(snip)
>>>>>
>>>>>  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>> -                                             TranslationBlock **last_tb,
>>>>> +                                             TranslationBlock **ltbp,
>>>> I'm not sure if it is more readable...
>>> I'll revert. I was trying to keep line lengths short :-/
>>>
>>>>>                                               int tb_exit)
>>>>>  {
>>>>>      CPUArchState *env = (CPUArchState *)cpu->env_ptr;
>>>>> -    TranslationBlock *tb;
>>>>> +    TranslationBlock *tb, *last_tb;
>>>>>      target_ulong cs_base, pc;
>>>>>      uint32_t flags;
>>>>>
>>>>> @@ -340,7 +337,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>>         always be the same before a given translated block
>>>>>         is executed. */
>>>>>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>>>> -    tb_lock();
>>>>>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>>>>>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>>>>>                   tb->flags != flags)) {
>>>>> @@ -350,7 +346,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>>          /* Ensure that no TB jump will be modified as the
>>>>>           * translation buffer has been flushed.
>>>>>           */
>>>>> -        *last_tb = NULL;
>>>>> +        *ltbp = NULL;
>>>>>          cpu->tb_flushed = false;
>>>>>      }
>>>>>  #ifndef CONFIG_USER_ONLY
>>>>> @@ -359,14 +355,19 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>>       * spanning two pages because the mapping for the second page can change.
>>>>>       */
>>>>>      if (tb->page_addr[1] != -1) {
>>>>> -        *last_tb = NULL;
>>>>> +        *ltbp = NULL;
>>>>>      }
>>>>>  #endif
>>>>> +
>>>>>      /* See if we can patch the calling TB. */
>>>>> -    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>>>>> -        tb_add_jump(*last_tb, tb_exit, tb);
>>>>> +    last_tb = *ltbp;
>>>>> +    if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN) &&
>>>>> +        last_tb &&
>>>>> +        !last_tb->jmp_list_next[tb_exit]) {
>>>> If we're going to check this outside of tb_lock we have to do this with
>>>> atomic_{read,set}(). However, I think it is rare case to race on
>>>> tb_add_jump() so probably it is okay to do the check under tb_lock.
>>> It's checking for NULL, and it gets re-checked in tb_add_jump while under
>>> the lock, so I think the race on setting it should be OK.
>> I think we could just skip this check and be fine. What do you think
>> regarding this?
> I think that means we'll take the lock more frequently than we want to.
> I'll have to check.

If the two blocks have already been chained then we don't get here at
all; otherwise the chance that some other thread has just chained them
is small, I think.

Regards,
Sergey

>>>>> +        tb_lock();
>>>>> +        tb_add_jump(last_tb, tb_exit, tb);
>>>>> +        tb_unlock();
>>>>>      }
>>>>> -    tb_unlock();
>>>>>      return tb;
>>>>>  }
>>>>>
>>>>

Patch

diff --git a/cpu-exec.c b/cpu-exec.c
index b017643..4af0b52 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -298,41 +298,38 @@  static TranslationBlock *tb_find_slow(CPUState *cpu,
      * Pairs with smp_wmb() in tb_phys_invalidate(). */
     smp_rmb();
     tb = tb_find_physical(cpu, pc, cs_base, flags);
-    if (tb) {
-        goto found;
-    }
+    if (!tb) {
 
-    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-     * taken outside tb_lock.  Since we're momentarily dropping
-     * tb_lock, there's a chance that our desired tb has been
-     * translated.
-     */
-    tb_unlock();
-    mmap_lock();
-    tb_lock();
-    tb = tb_find_physical(cpu, pc, cs_base, flags);
-    if (tb) {
-        mmap_unlock();
-        goto found;
-    }
+        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
+         * taken outside tb_lock.
+         */
+        mmap_lock();
+        tb_lock();
 
-    /* if no translated code available, then translate it now */
-    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        /* There's a chance that our desired tb has been translated while
+         * taking the locks so we check again inside the lock.
+         */
+        tb = tb_find_physical(cpu, pc, cs_base, flags);
+        if (!tb) {
+            /* if no translated code available, then translate it now */
+            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        }
 
-    mmap_unlock();
+        tb_unlock();
+        mmap_unlock();
+    }
 
-found:
-    /* we add the TB in the virtual pc hash table */
+    /* We add the TB in the virtual pc hash table for the fast lookup */
     cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;
     return tb;
 }
 
 static inline TranslationBlock *tb_find_fast(CPUState *cpu,
-                                             TranslationBlock **last_tb,
+                                             TranslationBlock **ltbp,
                                              int tb_exit)
 {
     CPUArchState *env = (CPUArchState *)cpu->env_ptr;
-    TranslationBlock *tb;
+    TranslationBlock *tb, *last_tb;
     target_ulong cs_base, pc;
     uint32_t flags;
 
@@ -340,7 +337,6 @@  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
        always be the same before a given translated block
        is executed. */
     cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
-    tb_lock();
     tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                  tb->flags != flags)) {
@@ -350,7 +346,7 @@  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
         /* Ensure that no TB jump will be modified as the
          * translation buffer has been flushed.
          */
-        *last_tb = NULL;
+        *ltbp = NULL;
         cpu->tb_flushed = false;
     }
 #ifndef CONFIG_USER_ONLY
@@ -359,14 +355,19 @@  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
      * spanning two pages because the mapping for the second page can change.
      */
     if (tb->page_addr[1] != -1) {
-        *last_tb = NULL;
+        *ltbp = NULL;
     }
 #endif
+
     /* See if we can patch the calling TB. */
-    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-        tb_add_jump(*last_tb, tb_exit, tb);
+    last_tb = *ltbp;
+    if (!qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN) &&
+        last_tb &&
+        !last_tb->jmp_list_next[tb_exit]) {
+        tb_lock();
+        tb_add_jump(last_tb, tb_exit, tb);
+        tb_unlock();
     }
-    tb_unlock();
     return tb;
 }