| Message ID | 1467735496-16256-4-git-send-email-alex.bennee@linaro.org (mailing list archive) |
|---|---|
| State | New, archived |
On 05/07/16 19:18, Alex Bennée wrote:
> Lock contention in the hot path of moving between existing patched
> TranslationBlocks is the main drag in multithreaded performance. This
> patch pushes the tb_lock() usage down to the two places that really need
> it:
>
>   - code generation (tb_gen_code)
>   - jump patching (tb_add_jump)
>
> The rest of the code doesn't really need to hold a lock as it is either
> using per-CPU structures, atomically updated or designed to be used in
> concurrent read situations (qht_lookup).
>
> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
> locks become NOPs anyway until the MTTCG work is completed.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Reviewed-by: Richard Henderson <rth@twiddle.net>

Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>

>
> ---
> v3 (base-patches)
>   - fix merge conflicts with Sergey's patch
> v1 (hot path, split from base-patches series)
>   - revert name tweaking
>   - drop test jmp_list_next outside lock
>   - mention lock NOPs in comments
> v2 (hot path)
>   - Add r-b tags
> ---
>  cpu-exec.c | 48 +++++++++++++++++++++---------------------------
>  1 file changed, 21 insertions(+), 27 deletions(-)
>
> diff --git a/cpu-exec.c b/cpu-exec.c
> index 10ce1cb..dd0bd50 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>      smp_rmb();
>      tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        goto found;
> -    }
> +    if (!tb) {
>
> -#ifdef CONFIG_USER_ONLY
> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> -     * taken outside tb_lock. Since we're momentarily dropping
> -     * tb_lock, there's a chance that our desired tb has been
> -     * translated.
> -     */
> -    tb_unlock();
> -    mmap_lock();
> -    tb_lock();
> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        mmap_unlock();
> -        goto found;
> -    }
> -#endif
> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> +         * taken outside tb_lock. As system emulation is currently
> +         * single threaded the locks are NOPs.
> +         */
> +        mmap_lock();
> +        tb_lock();
>
> -    /* if no translated code available, then translate it now */
> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        /* There's a chance that our desired tb has been translated while
> +         * taking the locks so we check again inside the lock.
> +         */
> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
> +        if (!tb) {
> +            /* if no translated code available, then translate it now */
> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        }
>
> -#ifdef CONFIG_USER_ONLY
> -    mmap_unlock();
> -#endif
> +        tb_unlock();
> +        mmap_unlock();
> +    }
>
> -found:
> -    /* we add the TB in the virtual pc hash table */
> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>      atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
>      return tb;
>  }
> @@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>         always be the same before a given translated block
>         is executed. */
>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> -    tb_lock();
>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>                   tb->flags != flags)) {
> @@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>  #endif
>      /* See if we can patch the calling TB. */
>      if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
> +        tb_lock();
>          tb_add_jump(*last_tb, tb_exit, tb);
> +        tb_unlock();
>      }
> -    tb_unlock();
>      return tb;
>  }
>
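For readers less familiar with the pattern the patch relies on, here is a minimal standalone sketch of the check/lock/re-check flow used in tb_find_slow() above. It uses plain pthreads, C11 atomics and a toy cache; the names (cache_lookup, generate_entry, gen_lock) are illustrative stand-ins for QEMU's qht lookup, tb_gen_code() and tb_lock(), not QEMU APIs.

/* Double-checked generation: the lookup is lock-free, the lock is only
 * taken when a new entry must be generated, and the lookup is repeated
 * under the lock in case another thread generated it in the meantime. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CACHE_SIZE 256

struct entry {
    uint32_t key;                       /* mirrors the tb->pc/flags check */
};

static _Atomic(struct entry *) cache[CACHE_SIZE];
static pthread_mutex_t gen_lock = PTHREAD_MUTEX_INITIALIZER;

/* Lock-free lookup: entries are only published with a release store,
 * so a hit seen here is fully initialised. */
static struct entry *cache_lookup(uint32_t key)
{
    struct entry *e = atomic_load_explicit(&cache[key % CACHE_SIZE],
                                           memory_order_acquire);
    return (e && e->key == key) ? e : NULL;
}

/* Slow path, only called with gen_lock held (cf. tb_gen_code). */
static struct entry *generate_entry(uint32_t key)
{
    struct entry *e = malloc(sizeof(*e));
    e->key = key;
    atomic_store_explicit(&cache[key % CACHE_SIZE], e, memory_order_release);
    return e;
}

static struct entry *find_entry(uint32_t key)
{
    struct entry *e = cache_lookup(key);        /* 1. try without the lock    */
    if (!e) {
        pthread_mutex_lock(&gen_lock);          /* 2. take the lock           */
        e = cache_lookup(key);                  /* 3. re-check under the lock */
        if (!e) {
            e = generate_entry(key);            /* 4. generate if still absent */
        }
        pthread_mutex_unlock(&gen_lock);
    }
    return e;
}

int main(void)
{
    printf("miss then generate: %p\n", (void *)find_entry(42));
    printf("hit, no lock taken: %p\n", (void *)find_entry(42));
    return 0;
}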
On 07/07/16 17:18, Sergey Fedorov wrote:
> On 05/07/16 19:18, Alex Bennée wrote:
>> Lock contention in the hot path of moving between existing patched
>> TranslationBlocks is the main drag in multithreaded performance. This
>> patch pushes the tb_lock() usage down to the two places that really need
>> it:
>>
>>   - code generation (tb_gen_code)
>>   - jump patching (tb_add_jump)
>>
>> The rest of the code doesn't really need to hold a lock as it is either
>> using per-CPU structures, atomically updated or designed to be used in
>> concurrent read situations (qht_lookup).
>>
>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>> locks become NOPs anyway until the MTTCG work is completed.
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Reviewed-by: Richard Henderson <rth@twiddle.net>
> Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>

Revoked: CPUState::tb_flushed must also be protected by tb_lock.
On 07/07/16 17:18, Sergey Fedorov wrote:
> On 05/07/16 19:18, Alex Bennée wrote:
>> Lock contention in the hot path of moving between existing patched
>> TranslationBlocks is the main drag in multithreaded performance. This
>> patch pushes the tb_lock() usage down to the two places that really need
>> it:
>>
>>   - code generation (tb_gen_code)
>>   - jump patching (tb_add_jump)
>>
>> The rest of the code doesn't really need to hold a lock as it is either
>> using per-CPU structures, atomically updated or designed to be used in
>> concurrent read situations (qht_lookup).
>>
>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>> locks become NOPs anyway until the MTTCG work is completed.
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Reviewed-by: Richard Henderson <rth@twiddle.net>
> Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>

Revoked:

 (1) Access to 'cpu->tb_flushed' must be safe (atomic_xchg() can help)
 (2) A read from 'cpu->tb_jmp_cache' in tb_find_fast() can see a torn
     write by memset() from tb_flush()

So I'm afraid this series is dependent on a safe tb_flush()
implementation.

Kind regards,
Sergey
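For point (1), here is a minimal sketch of the read-and-clear consumption Sergey suggests, using C11 atomics in place of QEMU's atomic_xchg(); the helper names (signal_flush, consume_flush) are made up for the illustration and are not part of the series.

/* Consuming a flush notification with an atomic exchange: a concurrent
 * tb_flush() setting the flag can never be lost between a plain read and
 * a separate clear, because read and clear are a single atomic step. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool tb_flushed;

/* Called by the flushing side after the translation buffer is reset. */
static void signal_flush(void)
{
    atomic_store(&tb_flushed, true);
}

/* Called on the execution hot path: returns true exactly once per flush. */
static bool consume_flush(void)
{
    return atomic_exchange(&tb_flushed, false);
}

int main(void)
{
    signal_flush();
    printf("first check:  %d\n", consume_flush());   /* prints 1 */
    printf("second check: %d\n", consume_flush());   /* prints 0 */
    return 0;
}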
Sergey Fedorov <serge.fdrv@gmail.com> writes:

> On 07/07/16 17:18, Sergey Fedorov wrote:
>> On 05/07/16 19:18, Alex Bennée wrote:
>>> Lock contention in the hot path of moving between existing patched
>>> TranslationBlocks is the main drag in multithreaded performance. This
>>> patch pushes the tb_lock() usage down to the two places that really need
>>> it:
>>>
>>>   - code generation (tb_gen_code)
>>>   - jump patching (tb_add_jump)
>>>
>>> The rest of the code doesn't really need to hold a lock as it is either
>>> using per-CPU structures, atomically updated or designed to be used in
>>> concurrent read situations (qht_lookup).
>>>
>>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>>> locks become NOPs anyway until the MTTCG work is completed.
>>>
>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>> Reviewed-by: Richard Henderson <rth@twiddle.net>
>> Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>
>
> Revoked:
>
>  (1) Access to 'cpu->tb_flushed' must be safe (atomic_xchg() can help)
>  (2) A read from 'cpu->tb_jmp_cache' in tb_find_fast() can see a torn
>      write by memset() from tb_flush()

But that is still the case already, isn't it? While system emulation
mode is still single-threaded, is it still safe?

>
> So I'm afraid this series is dependent on a safe tb_flush()
> implementation.

I'd argue that if it doesn't worsen the situation for either mode, it's
not dependent.

>
> Kind regards,
> Sergey
On 08/07/16 21:03, Alex Bennée wrote:
> Sergey Fedorov <serge.fdrv@gmail.com> writes:
>
>> On 07/07/16 17:18, Sergey Fedorov wrote:
>>> On 05/07/16 19:18, Alex Bennée wrote:
>>>> Lock contention in the hot path of moving between existing patched
>>>> TranslationBlocks is the main drag in multithreaded performance. This
>>>> patch pushes the tb_lock() usage down to the two places that really need
>>>> it:
>>>>
>>>>   - code generation (tb_gen_code)
>>>>   - jump patching (tb_add_jump)
>>>>
>>>> The rest of the code doesn't really need to hold a lock as it is either
>>>> using per-CPU structures, atomically updated or designed to be used in
>>>> concurrent read situations (qht_lookup).
>>>>
>>>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>>>> locks become NOPs anyway until the MTTCG work is completed.
>>>>
>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>> Reviewed-by: Richard Henderson <rth@twiddle.net>
>>> Reviewed-by: Sergey Fedorov <sergey.fedorov@linaro.org>
>> Revoked:
>>
>>  (1) Access to 'cpu->tb_flushed' must be safe (atomic_xchg() can help)
>>  (2) A read from 'cpu->tb_jmp_cache' in tb_find_fast() can see a torn
>>      write by memset() from tb_flush()
> But that is still the case already, isn't it? While system emulation
> mode is still single-threaded, is it still safe?

Before this patch, tb_flush(), the 'cpu->tb_jmp_cache' lookup, and
'cpu->tb_flushed' are all protected by tb_lock. The only problem for
multi-threaded user mode I know of is that we can flush the translation
buffer and start a new translation while some other thread is still
executing old translated code from it. After this patch we have two
more possible races on top of that.

>
>> So I'm afraid this series is dependent on a safe tb_flush()
>> implementation.
> I'd argue that if it doesn't worsen the situation for either mode, it's
> not dependent.

It does for user mode, see the explanation above.

Kind regards,
Sergey

>
>> Kind regards,
>> Sergey
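On the torn-write point, one possible direction, sketched here with C11 atomics as an assumption rather than as the fix that was actually merged, is to clear tb_jmp_cache entry by entry with atomic stores instead of a single memset(), so an unlocked reader observes either the old pointer or NULL. The types and sizes below are stand-ins, not QEMU definitions.

/* Per-entry atomic clearing of a jump cache: a concurrent reader doing a
 * single atomic load of one slot can never see a torn pointer value. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define TB_JMP_CACHE_SIZE 4096           /* size chosen for the sketch only */

struct tb_stub;                          /* stand-in for TranslationBlock   */

static _Atomic(struct tb_stub *) tb_jmp_cache[TB_JMP_CACHE_SIZE];

/* Reader side (cf. tb_find_fast): one atomic load of one pointer. */
static struct tb_stub *jmp_cache_lookup(unsigned hash)
{
    return atomic_load_explicit(&tb_jmp_cache[hash & (TB_JMP_CACHE_SIZE - 1)],
                                memory_order_relaxed);
}

/* Flush side: clear entry by entry instead of memset() over the array. */
static void jmp_cache_clear(void)
{
    for (size_t i = 0; i < TB_JMP_CACHE_SIZE; i++) {
        atomic_store_explicit(&tb_jmp_cache[i], NULL, memory_order_relaxed);
    }
}

int main(void)
{
    jmp_cache_clear();
    printf("entry 0 after clear: %p\n", (void *)jmp_cache_lookup(0));
    return 0;
}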
On 05/07/16 19:18, Alex Bennée wrote:
> Lock contention in the hot path of moving between existing patched
> TranslationBlocks is the main drag in multithreaded performance. This
> patch pushes the tb_lock() usage down to the two places that really need
> it:
>
>   - code generation (tb_gen_code)
>   - jump patching (tb_add_jump)
>
> The rest of the code doesn't really need to hold a lock as it is either
> using per-CPU structures, atomically updated or designed to be used in
> concurrent read situations (qht_lookup).
>
> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
> locks become NOPs anyway until the MTTCG work is completed.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Reviewed-by: Richard Henderson <rth@twiddle.net>
>
> ---
> v3 (base-patches)
>   - fix merge conflicts with Sergey's patch
> v1 (hot path, split from base-patches series)
>   - revert name tweaking
>   - drop test jmp_list_next outside lock
>   - mention lock NOPs in comments
> v2 (hot path)
>   - Add r-b tags
> ---
>  cpu-exec.c | 48 +++++++++++++++++++++---------------------------
>  1 file changed, 21 insertions(+), 27 deletions(-)
>
> diff --git a/cpu-exec.c b/cpu-exec.c
> index 10ce1cb..dd0bd50 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
>       * Pairs with smp_wmb() in tb_phys_invalidate(). */
>      smp_rmb();
>      tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        goto found;
> -    }
> +    if (!tb) {
>
> -#ifdef CONFIG_USER_ONLY
> -    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> -     * taken outside tb_lock. Since we're momentarily dropping
> -     * tb_lock, there's a chance that our desired tb has been
> -     * translated.
> -     */
> -    tb_unlock();
> -    mmap_lock();
> -    tb_lock();
> -    tb = tb_find_physical(cpu, pc, cs_base, flags);
> -    if (tb) {
> -        mmap_unlock();
> -        goto found;
> -    }
> -#endif
> +        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> +         * taken outside tb_lock. As system emulation is currently
> +         * single threaded the locks are NOPs.
> +         */
> +        mmap_lock();
> +        tb_lock();
>
> -    /* if no translated code available, then translate it now */
> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        /* There's a chance that our desired tb has been translated while
> +         * taking the locks so we check again inside the lock.
> +         */
> +        tb = tb_find_physical(cpu, pc, cs_base, flags);
> +        if (!tb) {
> +            /* if no translated code available, then translate it now */
> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        }
>
> -#ifdef CONFIG_USER_ONLY
> -    mmap_unlock();
> -#endif
> +        tb_unlock();
> +        mmap_unlock();
> +    }
>
> -found:
> -    /* we add the TB in the virtual pc hash table */
> +    /* We add the TB in the virtual pc hash table for the fast lookup */
>      atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);

There can be a race with tb_phys_invalidate() if we do this out of
tb_lock. Suppose:

 (1) Thread 1: The first tb_find_physical() out of the lock is successful
 (2) Thread 2: Remove the TB from the global hash table with qht_remove()
 (3) Thread 2: Reset the 'tb_jmp_cache' entry
 (4) Thread 1: Set the 'tb_jmp_cache' entry back (this line)

So it is only safe to do the 'tb_jmp_cache' lookup out of the lock, not
to set it.

Kind regards,
Sergey

>      return tb;
>  }
> @@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>         always be the same before a given translated block
>         is executed. */
>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> -    tb_lock();
>      tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>                   tb->flags != flags)) {
> @@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
>  #endif
>      /* See if we can patch the calling TB. */
>      if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
> +        tb_lock();
>          tb_add_jump(*last_tb, tb_exit, tb);
> +        tb_unlock();
>      }
> -    tb_unlock();
>      return tb;
>  }
>
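To make the four-step interleaving above concrete, here is a deterministic, single-threaded re-enactment using toy variables in place of the qht entry and the tb_jmp_cache slot; it is purely illustrative and uses none of QEMU's actual types or helpers.

/* Re-enactment of the race: the steps run in the order Sergey lists, with
 * comments marking which thread would perform each one.  Toy globals stand
 * in for the physical-hash entry and the per-CPU tb_jmp_cache slot. */
#include <stdbool.h>
#include <stdio.h>

struct toy_tb { bool valid; };

static struct toy_tb  some_tb = { .valid = true };
static struct toy_tb *phys_hash_entry = &some_tb;  /* stands in for the qht   */
static struct toy_tb *jmp_cache_entry = &some_tb;  /* stands in for the cache */

int main(void)
{
    /* (1) Thread 1: lookup out of the lock succeeds */
    struct toy_tb *found = phys_hash_entry;

    /* (2) Thread 2: tb_phys_invalidate() removes the TB from the hash */
    phys_hash_entry = NULL;
    some_tb.valid = false;

    /* (3) Thread 2: ...and resets the jmp cache entry */
    jmp_cache_entry = NULL;

    /* (4) Thread 1: publishes the now-stale TB back into the jmp cache */
    jmp_cache_entry = found;

    printf("jmp cache now points at %s TB\n",
           jmp_cache_entry->valid ? "a valid" : "an invalidated");
    return 0;
}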
diff --git a/cpu-exec.c b/cpu-exec.c
index 10ce1cb..dd0bd50 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -291,35 +291,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
      * Pairs with smp_wmb() in tb_phys_invalidate(). */
     smp_rmb();
     tb = tb_find_physical(cpu, pc, cs_base, flags);
-    if (tb) {
-        goto found;
-    }
+    if (!tb) {
 
-#ifdef CONFIG_USER_ONLY
-    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-     * taken outside tb_lock. Since we're momentarily dropping
-     * tb_lock, there's a chance that our desired tb has been
-     * translated.
-     */
-    tb_unlock();
-    mmap_lock();
-    tb_lock();
-    tb = tb_find_physical(cpu, pc, cs_base, flags);
-    if (tb) {
-        mmap_unlock();
-        goto found;
-    }
-#endif
+        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
+         * taken outside tb_lock. As system emulation is currently
+         * single threaded the locks are NOPs.
+         */
+        mmap_lock();
+        tb_lock();
 
-    /* if no translated code available, then translate it now */
-    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        /* There's a chance that our desired tb has been translated while
+         * taking the locks so we check again inside the lock.
+         */
+        tb = tb_find_physical(cpu, pc, cs_base, flags);
+        if (!tb) {
+            /* if no translated code available, then translate it now */
+            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        }
 
-#ifdef CONFIG_USER_ONLY
-    mmap_unlock();
-#endif
+        tb_unlock();
+        mmap_unlock();
+    }
 
-found:
-    /* we add the TB in the virtual pc hash table */
+    /* We add the TB in the virtual pc hash table for the fast lookup */
     atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
     return tb;
 }
@@ -337,7 +331,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
        always be the same before a given translated block
        is executed. */
     cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
-    tb_lock();
     tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                  tb->flags != flags)) {
@@ -361,9 +354,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 #endif
     /* See if we can patch the calling TB. */
     if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
+        tb_lock();
         tb_add_jump(*last_tb, tb_exit, tb);
+        tb_unlock();
     }
-    tb_unlock();
     return tb;
 }