
[v2,3/6] tcg: cpu-exec: remove tb_lock from the hot-path

Message ID 1467735496-16256-4-git-send-email-alex.bennee@linaro.org (mailing list archive)
State New, archived

Commit Message

Alex Bennée July 5, 2016, 4:18 p.m. UTC
Lock contention in the hot path of moving between existing patched
TranslationBlocks is the main drag in multithreaded performance. This
patch pushes the tb_lock() usage down to the two places that really need
it:

  - code generation (tb_gen_code)
  - jump patching (tb_add_jump)

The rest of the code doesn't really need to hold a lock as it is either
using per-CPU structures, atomically updated or designed to be used in
concurrent read situations (qht_lookup).

To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
locks become NOPs anyway until the MTTCG work is completed.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>

---
v3 (base-patches)
  - fix merge conflicts with Sergey's patch
v1 (hot path, split from base-patches series)
  - revert name tweaking
  - drop test jmp_list_next outside lock
  - mention lock NOPs in comments
v2 (hot path)
  - Add r-b tags
---
 cpu-exec.c | 48 +++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 27 deletions(-)
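
The locking structure the patch introduces amounts to classic double-checked locking: probe the translation hash lock-free, and only on a miss take the locks and probe again before translating. A minimal standalone C11 sketch of the pattern follows — the toy table and generate() are illustrative stand-ins for qht_lookup()/tb_find_physical() and tb_gen_code(), not QEMU code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

typedef struct Block { unsigned key; } Block;

/* Toy code cache; atomic entries make the lock-free probe well
 * defined, as qht does for the real TB hash table. */
static _Atomic(Block *) table[1024];
static pthread_mutex_t gen_lock = PTHREAD_MUTEX_INITIALIZER;

static Block *generate(unsigned key)        /* caller holds gen_lock */
{
    Block *b = malloc(sizeof(*b));
    b->key = key;
    atomic_store(&table[key % 1024], b);
    return b;
}

Block *find_slow(unsigned key)
{
    Block *b = atomic_load(&table[key % 1024]);  /* lock-free probe */
    if (!b) {
        pthread_mutex_lock(&gen_lock);
        /* The block may have been generated while we waited for
         * the lock, so probe again before paying for generation. */
        b = atomic_load(&table[key % 1024]);
        if (!b) {
            b = generate(key);
        }
        pthread_mutex_unlock(&gen_lock);
    }
    return b;
}

The hot path thus pays nothing for the lock when the block already exists, which is exactly the contention the commit message is attacking.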

Comments

Sergey Fedorov July 7, 2016, 2:18 p.m. UTC | #1
On 05/07/16 19:18, Alex Bennée wrote:
> Lock contention in the hot path of moving between existing patched
> TranslationBlocks is the main drag in multithreaded performance. This
> patch pushes the tb_lock() usage down to the two places that really need
> it:
>
>   - code generation (tb_gen_code)
>   - jump patching (tb_add_jump)
>
> The rest of the code doesn't really need to hold a lock as it is either
> using per-CPU structures, atomically updated or designed to be used in
> concurrent read situations (qht_lookup).
>
> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
> locks become NOPs anyway until the MTTCG work is completed.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Reviewed-by: Richard Henderson <rth@twiddle.net>

Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>

>
> ---
> v3 (base-patches)
>   - fix merge conflicts with Sergey's patch
> v1 (hot path, split from base-patches series)
>   - revert name tweaking
>   - drop test jmp_list_next outside lock
>   - mention lock NOPs in comments
> v2 (hot path)
>   - Add r-b tags
> ---
>  cpu-exec.c | 48 +++++++++++++++++++++---------------------------
>  1 file changed, 21 insertions(+), 27 deletions(-)
>
> diff --git a/cpu-exec.c b/cpu-exec.c
> index 10ce1cb..dd0bd50 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>      smp_rmb();
>      tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        goto found;
> -    }
> +    if (!tb) {
>  
> -#ifdef CONFIG_USER_ONLY
> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> -     * taken outside tb_lock.  Since we're momentarily dropping
> -     * tb_lock, there's a chance that our desired tb has been
> -     * translated.
> -     */
> -    tb_unlock();
> -    mmap_lock();
> -    tb_lock();
> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        mmap_unlock();
> -        goto found;
> -    }
> -#endif
> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> +         * taken outside tb_lock. As system emulation is currently
> +         * single threaded the locks are NOPs.
> +         */
> +        mmap_lock();
> +        tb_lock();
>  
> -    /* if no translated code available, then translate it now */
> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        /* There's a chance that our desired tb has been translated while
> +         * taking the locks so we check again inside the lock.
> +         */
> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
> +        if (!tb) {
> +            /* if no translated code available, then translate it now */
> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        }
>  
> -#ifdef CONFIG_USER_ONLY
> -    mmap_unlock();
> -#endif
> +        tb_unlock();
> +        mmap_unlock();
> +    }
>  
> -found:
> -    /* we add the TB in the virtual pc hash table */
> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>      atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
>      return tb;
>  }
> @@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>         always be the same before a given translated block
>         is executed. */
>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> -    tb_lock();
>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>                   tb->flags != flags)) {
> @@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>  #endif
>      /* See if we can patch the calling TB. */
>      if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
> +        tb_lock();
>          tb_add_jump(*last_tb, tb_exit, tb);
> +        tb_unlock();
>      }
> -    tb_unlock();
>      return tb;
>  }
>
Sergey Fedorov July 8, 2016, 3:50 p.m. UTC | #2
On 07/07/16 17:18, Sergey Fedorov wrote:
> On 05/07/16 19:18, Alex Bennée wrote:
>> Lock contention in the hot path of moving between existing patched
>> TranslationBlocks is the main drag in multithreaded performance. This
>> patch pushes the tb_lock() usage down to the two places that really need
>> it:
>>
>>   - code generation (tb_gen_code)
>>   - jump patching (tb_add_jump)
>>
>> The rest of the code doesn't really need to hold a lock as it is either
>> using per-CPU structures, atomically updated or designed to be used in
>> concurrent read situations (qht_lookup).
>>
>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>> locks become NOPs anyway until the MTTCG work is completed.
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Reviewed-by: Richard Henderson <rth@twiddle.net>
> Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>

Revoked: CPUState::tb_flushed must also be protected by tb_lock.

>
>> ---
>> v3 (base-patches)
>>   - fix merge conflicts with Sergey's patch
>> v1 (hot path, split from base-patches series)
>>   - revert name tweaking
>>   - drop test jmp_list_next outside lock
>>   - mention lock NOPs in comments
>> v2 (hot path)
>>   - Add r-b tags
>> ---
>>  cpu-exec.c | 48 +++++++++++++++++++++---------------------------
>>  1 file changed, 21 insertions(+), 27 deletions(-)
>>
>> diff --git a/cpu-exec.c b/cpu-exec.c
>> index 10ce1cb..dd0bd50 100644
>> --- a/cpu-exec.c
>> +++ b/cpu-exec.c
>> @@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>>      smp_rmb();
>>      tb = tb_find_physical(cpu, pc, cs_base, flags);
>> -    if (tb) {
>> -        goto found;
>> -    }
>> +    if (!tb) {
>>  
>> -#ifdef CONFIG_USER_ONLY
>> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>> -     * taken outside tb_lock.  Since we're momentarily dropping
>> -     * tb_lock, there's a chance that our desired tb has been
>> -     * translated.
>> -     */
>> -    tb_unlock();
>> -    mmap_lock();
>> -    tb_lock();
>> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
>> -    if (tb) {
>> -        mmap_unlock();
>> -        goto found;
>> -    }
>> -#endif
>> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>> +         * taken outside tb_lock. As system emulation is currently
>> +         * single threaded the locks are NOPs.
>> +         */
>> +        mmap_lock();
>> +        tb_lock();
>>  
>> -    /* if no translated code available, then translate it now */
>> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>> +        /* There's a chance that our desired tb has been translated while
>> +         * taking the locks so we check again inside the lock.
>> +         */
>> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
>> +        if (!tb) {
>> +            /* if no translated code available, then translate it now */
>> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>> +        }
>>  
>> -#ifdef CONFIG_USER_ONLY
>> -    mmap_unlock();
>> -#endif
>> +        tb_unlock();
>> +        mmap_unlock();
>> +    }
>>  
>> -found:
>> -    /* we add the TB in the virtual pc hash table */
>> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>>      atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
>>      return tb;
>>  }
>> @@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>         always be the same before a given translated block
>>         is executed. */
>>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>> -    tb_lock();
>>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>>                   tb->flags != flags)) {
>> @@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>  #endif
>>      /* See if we can patch the calling TB. */
>>      if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>> +        tb_lock();
>>          tb_add_jump(*last_tb, tb_exit, tb);
>> +        tb_unlock();
>>      }
>> -    tb_unlock();
>>      return tb;
>>  }
>>
Sergey Fedorov July 8, 2016, 5:34 p.m. UTC | #3
On 07/07/16 17:18, Sergey Fedorov wrote:
> On 05/07/16 19:18, Alex Bennée wrote:
>> Lock contention in the hot path of moving between existing patched
>> TranslationBlocks is the main drag in multithreaded performance. This
>> patch pushes the tb_lock() usage down to the two places that really need
>> it:
>>
>>   - code generation (tb_gen_code)
>>   - jump patching (tb_add_jump)
>>
>> The rest of the code doesn't really need to hold a lock as it is either
>> using per-CPU structures, atomically updated or designed to be used in
>> concurrent read situations (qht_lookup).
>>
>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>> locks become NOPs anyway until the MTTCG work is completed.
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Reviewed-by: Richard Henderson <rth@twiddle.net>
> Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>

Revoked:
(1) Access to 'cpu->tb_flushed' must be safe (atomic_xchg() can help)
(2) A read from 'cpu->tb_jmp_cache' in tb_find_fast() can see a torn
write by memset() from tb_flush()

So I'm afraid this series is dependent on a safe tb_flush() implementation.
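
To illustrate what atomic_xchg() buys for point (1), here is a minimal standalone C11 sketch — the CPU struct and function names are made up for illustration, this is not QEMU code:

#include <stdatomic.h>
#include <stdbool.h>

/* Paraphrased: each vCPU carries a flag that tb_flush() sets and
 * the execution loop consumes. */
typedef struct CPU {
    atomic_bool tb_flushed;
} CPU;

/* Flush side: mark the CPU so it drops stale chaining state. */
static void mark_flushed(CPU *cpu)
{
    atomic_store(&cpu->tb_flushed, true);
}

/* Execution loop: atomically test-and-clear the flag. */
static bool check_and_clear_flushed(CPU *cpu)
{
    return atomic_exchange(&cpu->tb_flushed, false);
}

With a plain read-then-write, a flush landing between the read and the write would be lost; the exchange makes test-and-clear a single indivisible step.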

Kind regards,
Sergey

>
>> ---
>> v3 (base-patches)
>>   - fix merge conflicts with Sergey's patch
>> v1 (hot path, split from base-patches series)
>>   - revert name tweaking
>>   - drop test jmp_list_next outside lock
>>   - mention lock NOPs in comments
>> v2 (hot path)
>>   - Add r-b tags
>> ---
>>  cpu-exec.c | 48 +++++++++++++++++++++---------------------------
>>  1 file changed, 21 insertions(+), 27 deletions(-)
>>
>> diff --git a/cpu-exec.c b/cpu-exec.c
>> index 10ce1cb..dd0bd50 100644
>> --- a/cpu-exec.c
>> +++ b/cpu-exec.c
>> @@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>>      smp_rmb();
>>      tb = tb_find_physical(cpu, pc, cs_base, flags);
>> -    if (tb) {
>> -        goto found;
>> -    }
>> +    if (!tb) {
>>  
>> -#ifdef CONFIG_USER_ONLY
>> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>> -     * taken outside tb_lock.  Since we're momentarily dropping
>> -     * tb_lock, there's a chance that our desired tb has been
>> -     * translated.
>> -     */
>> -    tb_unlock();
>> -    mmap_lock();
>> -    tb_lock();
>> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
>> -    if (tb) {
>> -        mmap_unlock();
>> -        goto found;
>> -    }
>> -#endif
>> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>> +         * taken outside tb_lock. As system emulation is currently
>> +         * single threaded the locks are NOPs.
>> +         */
>> +        mmap_lock();
>> +        tb_lock();
>>  
>> -    /* if no translated code available, then translate it now */
>> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>> +        /* There's a chance that our desired tb has been translated while
>> +         * taking the locks so we check again inside the lock.
>> +         */
>> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
>> +        if (!tb) {
>> +            /* if no translated code available, then translate it now */
>> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>> +        }
>>  
>> -#ifdef CONFIG_USER_ONLY
>> -    mmap_unlock();
>> -#endif
>> +        tb_unlock();
>> +        mmap_unlock();
>> +    }
>>  
>> -found:
>> -    /* we add the TB in the virtual pc hash table */
>> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>>      atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
>>      return tb;
>>  }
>> @@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>         always be the same before a given translated block
>>         is executed. */
>>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>> -    tb_lock();
>>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>>                   tb->flags != flags)) {
>> @@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>  #endif
>>      /* See if we can patch the calling TB. */
>>      if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>> +        tb_lock();
>>          tb_add_jump(*last_tb, tb_exit, tb);
>> +        tb_unlock();
>>      }
>> -    tb_unlock();
>>      return tb;
>>  }
>>
Alex Bennée July 8, 2016, 6:03 p.m. UTC | #4
Sergey Fedorov <serge.fdrv@gmail.com> writes:

> On 07/07/16 17:18, Sergey Fedorov wrote:
>> On 05/07/16 19:18, Alex Bennée wrote:
>>> Lock contention in the hot path of moving between existing patched
>>> TranslationBlocks is the main drag in multithreaded performance. This
>>> patch pushes the tb_lock() usage down to the two places that really need
>>> it:
>>>
>>>   - code generation (tb_gen_code)
>>>   - jump patching (tb_add_jump)
>>>
>>> The rest of the code doesn't really need to hold a lock as it is either
>>> using per-CPU structures, atomically updated or designed to be used in
>>> concurrent read situations (qht_lookup).
>>>
>>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>>> locks become NOPs anyway until the MTTCG work is completed.
>>>
>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>> Reviewed-by: Richard Henderson <rth@twiddle.net>
>> Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>
>
> Revoked:
> (1) Access to 'cpu->tb_flushed' must be safe (atomic_xchg() can help)
> (2) A read from 'cpu->tb_jmp_cache' in tb_find_fast() can see a torn
> write by memset() from tb_flush()

But that is already the case, isn't it? While system emulation mode
is still single-threaded, isn't it still safe?

>
> So I'm afraid this series is dependent on a safe tb_flush()
> implementation.

I'd argue that if it doesn't worsen the situation for either mode it's not
dependent.


>
> Kind regards,
> Sergey
>
>>
>>> ---
>>> v3 (base-patches)
>>>   - fix merge conflicts with Sergey's patch
>>> v1 (hot path, split from base-patches series)
>>>   - revert name tweaking
>>>   - drop test jmp_list_next outside lock
>>>   - mention lock NOPs in comments
>>> v2 (hot path)
>>>   - Add r-b tags
>>> ---
>>>  cpu-exec.c | 48 +++++++++++++++++++++---------------------------
>>>  1 file changed, 21 insertions(+), 27 deletions(-)
>>>
>>> diff --git a/cpu-exec.c b/cpu-exec.c
>>> index 10ce1cb..dd0bd50 100644
>>> --- a/cpu-exec.c
>>> +++ b/cpu-exec.c
>>> @@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>>>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>>>      smp_rmb();
>>>      tb = tb_find_physical(cpu, pc, cs_base, flags);
>>> -    if (tb) {
>>> -        goto found;
>>> -    }
>>> +    if (!tb) {
>>>  
>>> -#ifdef CONFIG_USER_ONLY
>>> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>>> -     * taken outside tb_lock.  Since we're momentarily dropping
>>> -     * tb_lock, there's a chance that our desired tb has been
>>> -     * translated.
>>> -     */
>>> -    tb_unlock();
>>> -    mmap_lock();
>>> -    tb_lock();
>>> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
>>> -    if (tb) {
>>> -        mmap_unlock();
>>> -        goto found;
>>> -    }
>>> -#endif
>>> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>>> +         * taken outside tb_lock. As system emulation is currently
>>> +         * single threaded the locks are NOPs.
>>> +         */
>>> +        mmap_lock();
>>> +        tb_lock();
>>>  
>>> -    /* if no translated code available, then translate it now */
>>> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>>> +        /* There's a chance that our desired tb has been translated while
>>> +         * taking the locks so we check again inside the lock.
>>> +         */
>>> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
>>> +        if (!tb) {
>>> +            /* if no translated code available, then translate it now */
>>> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>>> +        }
>>>  
>>> -#ifdef CONFIG_USER_ONLY
>>> -    mmap_unlock();
>>> -#endif
>>> +        tb_unlock();
>>> +        mmap_unlock();
>>> +    }
>>>  
>>> -found:
>>> -    /* we add the TB in the virtual pc hash table */
>>> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>>>      atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
>>>      return tb;
>>>  }
>>> @@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>         always be the same before a given translated block
>>>         is executed. */
>>>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>> -    tb_lock();
>>>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>>>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>>>                   tb->flags != flags)) {
>>> @@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>  #endif
>>>      /* See if we can patch the calling TB. */
>>>      if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>>> +        tb_lock();
>>>          tb_add_jump(*last_tb, tb_exit, tb);
>>> +        tb_unlock();
>>>      }
>>> -    tb_unlock();
>>>      return tb;
>>>  }
>>>
Sergey Fedorov July 8, 2016, 6:20 p.m. UTC | #5
On 08/07/16 21:03, Alex Bennée wrote:
> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>
>> On 07/07/16 17:18, Sergey Fedorov wrote:
>>> On 05/07/16 19:18, Alex Bennée wrote:
>>>> Lock contention in the hot path of moving between existing patched
>>>> TranslationBlocks is the main drag in multithreaded performance. This
>>>> patch pushes the tb_lock() usage down to the two places that really need
>>>> it:
>>>>
>>>>   - code generation (tb_gen_code)
>>>>   - jump patching (tb_add_jump)
>>>>
>>>> The rest of the code doesn't really need to hold a lock as it is either
>>>> using per-CPU structures, atomically updated or designed to be used in
>>>> concurrent read situations (qht_lookup).
>>>>
>>>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>>>> locks become NOPs anyway until the MTTCG work is completed.
>>>>
>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>> Reviewed-by: Richard Henderson <rth@twiddle.net>
>>> Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>
>> Revoked:
>> (1) Access to 'cpu->tb_flushed' must be safe (atomic_xchg() can help)
>> (2) A read from 'cpu->tb_jmp_cache' in tb_find_fast() can see a torn
>> write by memset() from tb_flush()
> But that is already the case, isn't it? While system emulation mode
> is still single-threaded, isn't it still safe?

Before this patch, tb_flush(), the 'cpu->tb_jmp_cache' lookup, and
'cpu->tb_flushed' are all protected by tb_lock. The only problem for
multi-threaded user mode I know of is that we can flush the translation
buffer and start a new translation while some other thread is still
executing old translated code from it. After this patch we have two
more possible races.
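
For reference on race (2): memset() gives no per-entry atomicity guarantee, so a concurrent atomic_read() of 'cpu->tb_jmp_cache' may observe a half-written pointer. A sketch of the usual remedy — clearing entry by entry with atomic stores; illustrative C11, not the actual QEMU fix:

#include <stdatomic.h>
#include <stddef.h>

#define TB_JMP_CACHE_SIZE 4096   /* assumed size, for illustration */

typedef struct TranslationBlock TranslationBlock;

typedef struct CPU {
    _Atomic(TranslationBlock *) tb_jmp_cache[TB_JMP_CACHE_SIZE];
} CPU;

/* Per-entry atomic stores keep every concurrent atomic read well
 * defined, unlike a single memset() over the whole array. */
static void clear_jmp_cache(CPU *cpu)
{
    for (size_t i = 0; i < TB_JMP_CACHE_SIZE; i++) {
        atomic_store(&cpu->tb_jmp_cache[i], NULL);
    }
}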

>
>> So I'm afraid this series is dependent on a safe tb_flush()
>> implementation.
> I'd argue that if it doesn't worsen the situation for either mode it's not
> dependent.

It does for user mode; see the explanation above.

Kind regards,
Sergey

>
>> Kind regards,
>> Sergey
>>
>>>> ---
>>>> v3 (base-patches)
>>>>   - fix merge conflicts with Sergey's patch
>>>> v1 (hot path, split from base-patches series)
>>>>   - revert name tweaking
>>>>   - drop test jmp_list_next outside lock
>>>>   - mention lock NOPs in comments
>>>> v2 (hot path)
>>>>   - Add r-b tags
>>>> ---
>>>>  cpu-exec.c | 48 +++++++++++++++++++++---------------------------
>>>>  1 file changed, 21 insertions(+), 27 deletions(-)
>>>>
>>>> diff --git a/cpu-exec.c b/cpu-exec.c
>>>> index 10ce1cb..dd0bd50 100644
>>>> --- a/cpu-exec.c
>>>> +++ b/cpu-exec.c
>>>> @@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>>>>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>>>>      smp_rmb();
>>>>      tb = tb_find_physical(cpu, pc, cs_base, flags);
>>>> -    if (tb) {
>>>> -        goto found;
>>>> -    }
>>>> +    if (!tb) {
>>>>  
>>>> -#ifdef CONFIG_USER_ONLY
>>>> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>>>> -     * taken outside tb_lock.  Since we're momentarily dropping
>>>> -     * tb_lock, there's a chance that our desired tb has been
>>>> -     * translated.
>>>> -     */
>>>> -    tb_unlock();
>>>> -    mmap_lock();
>>>> -    tb_lock();
>>>> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
>>>> -    if (tb) {
>>>> -        mmap_unlock();
>>>> -        goto found;
>>>> -    }
>>>> -#endif
>>>> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
>>>> +         * taken outside tb_lock. As system emulation is currently
>>>> +         * single threaded the locks are NOPs.
>>>> +         */
>>>> +        mmap_lock();
>>>> +        tb_lock();
>>>>  
>>>> -    /* if no translated code available, then translate it now */
>>>> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>>>> +        /* There's a chance that our desired tb has been translated while
>>>> +         * taking the locks so we check again inside the lock.
>>>> +         */
>>>> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
>>>> +        if (!tb) {
>>>> +            /* if no translated code available, then translate it now */
>>>> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
>>>> +        }
>>>>  
>>>> -#ifdef CONFIG_USER_ONLY
>>>> -    mmap_unlock();
>>>> -#endif
>>>> +        tb_unlock();
>>>> +        mmap_unlock();
>>>> +    }
>>>>  
>>>> -found:
>>>> -    /* we add the TB in the virtual pc hash table */
>>>> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>>>>      atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
>>>>      return tb;
>>>>  }
>>>> @@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>         always be the same before a given translated block
>>>>         is executed. */
>>>>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>>> -    tb_lock();
>>>>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>>>>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>>>>                   tb->flags != flags)) {
>>>> @@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>>>>  #endif
>>>>      /* See if we can patch the calling TB. */
>>>>      if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>>>> +        tb_lock();
>>>>          tb_add_jump(*last_tb, tb_exit, tb);
>>>> +        tb_unlock();
>>>>      }
>>>> -    tb_unlock();
>>>>      return tb;
>>>>  }
>>>>  
>
Sergey Fedorov July 8, 2016, 8:09 p.m. UTC | #6
On 05/07/16 19:18, Alex Bennée wrote:
> Lock contention in the hot path of moving between existing patched
> TranslationBlocks is the main drag in multithreaded performance. This
> patch pushes the tb_lock() usage down to the two places that really need
> it:
>
>   - code generation (tb_gen_code)
>   - jump patching (tb_add_jump)
>
> The rest of the code doesn't really need to hold a lock as it is either
> using per-CPU structures, atomically updated or designed to be used in
> concurrent read situations (qht_lookup).
>
> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
> locks become NOPs anyway until the MTTCG work is completed.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Reviewed-by: Richard Henderson <rth@twiddle.net>
>
> ---
> v3 (base-patches)
>   - fix merge conflicts with Sergey's patch
> v1 (hot path, split from base-patches series)
>   - revert name tweaking
>   - drop test jmp_list_next outside lock
>   - mention lock NOPs in comments
> v2 (hot path)
>   - Add r-b tags
> ---
>  cpu-exec.c | 48 +++++++++++++++++++++---------------------------
>  1 file changed, 21 insertions(+), 27 deletions(-)
>
> diff --git a/cpu-exec.c b/cpu-exec.c
> index 10ce1cb..dd0bd50 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>      smp_rmb();
>      tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        goto found;
> -    }
> +    if (!tb) {
>  
> -#ifdef CONFIG_USER_ONLY
> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> -     * taken outside tb_lock.  Since we're momentarily dropping
> -     * tb_lock, there's a chance that our desired tb has been
> -     * translated.
> -     */
> -    tb_unlock();
> -    mmap_lock();
> -    tb_lock();
> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        mmap_unlock();
> -        goto found;
> -    }
> -#endif
> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> +         * taken outside tb_lock. As system emulation is currently
> +         * single threaded the locks are NOPs.
> +         */
> +        mmap_lock();
> +        tb_lock();
>  
> -    /* if no translated code available, then translate it now */
> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        /* There's a chance that our desired tb has been translated while
> +         * taking the locks so we check again inside the lock.
> +         */
> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
> +        if (!tb) {
> +            /* if no translated code available, then translate it now */
> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        }
>  
> -#ifdef CONFIG_USER_ONLY
> -    mmap_unlock();
> -#endif
> +        tb_unlock();
> +        mmap_unlock();
> +    }
>  
> -found:
> -    /* we add the TB in the virtual pc hash table */
> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>      atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);

There can be a race with tb_phys_invalidate() if we do this out of
tb_lock. Suppose:
(1) Thread 1: The first tb_find_physical() out of the lock is successful
(2) Thread 2: Remove the TB from the global hash table with qht_remove()
(3) Thread 2: Reset the 'tb_jmp_cache' entry
(4) Thread 1: Set the 'tb_jmp_cache' entry back (this line)

So it is only safe to do 'tb_jmp_cache' lookup out of lock, not to set it.
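
For illustration only — not necessarily the fix that should go in — one way to honour that rule is to rearrange the tail of tb_find_slow() so the cache store always happens under tb_lock; the QEMU names below are real, the restructuring is a sketch:

    tb = tb_find_physical(cpu, pc, cs_base, flags);
    if (tb) {
        /* Lock-free hit: the TB can be used, but publishing it to
         * tb_jmp_cache from here could interleave with steps (2)-(3)
         * above, so skip the cache update. */
        return tb;
    }

    mmap_lock();
    tb_lock();
    tb = tb_find_physical(cpu, pc, cs_base, flags);
    if (!tb) {
        tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
    }
    /* tb_phys_invalidate() also runs under tb_lock, so this store
     * cannot interleave with its qht_remove() + cache reset. */
    atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
    tb_unlock();
    mmap_unlock();
    return tb;

The trade-off is that a lock-free hit no longer refreshes the per-CPU cache, so the next execution of the same pc goes through the qht lookup again.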

Kind regards,
Sergey

>      return tb;
>  }
> @@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>         always be the same before a given translated block
>         is executed. */
>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> -    tb_lock();
>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>                   tb->flags != flags)) {
> @@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>  #endif
>      /* See if we can patch the calling TB. */
>      if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
> +        tb_lock();
>          tb_add_jump(*last_tb, tb_exit, tb);
> +        tb_unlock();
>      }
> -    tb_unlock();
>      return tb;
>  }
>

Patch

diff --git a/cpu-exec.c b/cpu-exec.c
index 10ce1cb..dd0bd50 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -291,35 +291,29 @@  static TranslationBlock *tb_find_slow(CPUState *cpu,
      * Pairs with smp_wmb() in tb_phys_invalidate(). */
     smp_rmb();
     tb = tb_find_physical(cpu, pc, cs_base, flags);
-    if (tb) {
-        goto found;
-    }
+    if (!tb) {
 
-#ifdef CONFIG_USER_ONLY
-    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-     * taken outside tb_lock.  Since we're momentarily dropping
-     * tb_lock, there's a chance that our desired tb has been
-     * translated.
-     */
-    tb_unlock();
-    mmap_lock();
-    tb_lock();
-    tb = tb_find_physical(cpu, pc, cs_base, flags);
-    if (tb) {
-        mmap_unlock();
-        goto found;
-    }
-#endif
+        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
+         * taken outside tb_lock. As system emulation is currently
+         * single threaded the locks are NOPs.
+         */
+        mmap_lock();
+        tb_lock();
 
-    /* if no translated code available, then translate it now */
-    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        /* There's a chance that our desired tb has been translated while
+         * taking the locks so we check again inside the lock.
+         */
+        tb = tb_find_physical(cpu, pc, cs_base, flags);
+        if (!tb) {
+            /* if no translated code available, then translate it now */
+            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        }
 
-#ifdef CONFIG_USER_ONLY
-    mmap_unlock();
-#endif
+        tb_unlock();
+        mmap_unlock();
+    }
 
-found:
-    /* we add the TB in the virtual pc hash table */
+    /* We add the TB in the virtual pc hash table for the fast lookup */
     atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
     return tb;
 }
@@ -337,7 +331,6 @@  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
        always be the same before a given translated block
        is executed. */
     cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
-    tb_lock();
     tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                  tb->flags != flags)) {
@@ -361,9 +354,10 @@  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 #endif
     /* See if we can patch the calling TB. */
     if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
+        tb_lock();
         tb_add_jump(*last_tb, tb_exit, tb);
+        tb_unlock();
     }
-    tb_unlock();
     return tb;
 }