diff mbox series

target/riscv: Implement dynamic establishment of custom decoder

Message ID 20240306093308.35188-1-eric.huang@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series target/riscv: Implement dynamic establishment of custom decoder | expand

Commit Message

Huang Tao March 6, 2024, 9:33 a.m. UTC
In this patch, we modify the decoder to be a freely composable data
structure instead of a hardcoded one. It can be dynamically builded up
according to the extensions.
This approach has several benefits:
1. Provides support for heterogeneous cpu architectures. As we add decoder in
   CPUArchState, each cpu can have their own decoder, and the decoders can be
   different due to cpu's features.
2. Improve the decoding efficiency. We run the guard_func to see if the decoder
   can be added to the dynamic_decoder when building up the decoder. Therefore,
   there is no need to run the guard_func when decoding each instruction. It can
   improve the decoding efficiency
3. For vendor or dynamic cpus, it allows them to customize their own decoder
   functions to improve decoding efficiency, especially when vendor-defined
   instruction sets increase. Because of dynamic building up, it can skip the other
   decoder guard functions when decoding.

Signed-off-by: Huang Tao <eric.huang@linux.alibaba.com>
Suggested-by: Christoph Muellner <christoph.muellner@vrull.eu>
Co-authored-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
---
 target/riscv/cpu.c         | 20 ++++++++++++++++++++
 target/riscv/cpu.h         |  2 ++
 target/riscv/cpu_decoder.h | 34 ++++++++++++++++++++++++++++++++++
 target/riscv/translate.c   | 28 ++++++++++++----------------
 4 files changed, 68 insertions(+), 16 deletions(-)
 create mode 100644 target/riscv/cpu_decoder.h

Comments

Daniel Henrique Barboza March 7, 2024, 7:55 p.m. UTC | #1
(--- adding Richard ---)


On 3/6/24 06:33, Huang Tao wrote:
> In this patch, we modify the decoder to be a freely composable data
> structure instead of a hardcoded one. It can be dynamically builded up
> according to the extensions.
> This approach has several benefits:
> 1. Provides support for heterogeneous cpu architectures. As we add decoder in
>     CPUArchState, each cpu can have their own decoder, and the decoders can be
>     different due to cpu's features.
> 2. Improve the decoding efficiency. We run the guard_func to see if the decoder
>     can be added to the dynamic_decoder when building up the decoder. Therefore,
>     there is no need to run the guard_func when decoding each instruction. It can
>     improve the decoding efficiency
> 3. For vendor or dynamic cpus, it allows them to customize their own decoder
>     functions to improve decoding efficiency, especially when vendor-defined
>     instruction sets increase. Because of dynamic building up, it can skip the other
>     decoder guard functions when decoding.
> 
> Signed-off-by: Huang Tao <eric.huang@linux.alibaba.com>
> Suggested-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Co-authored-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
> ---
>   target/riscv/cpu.c         | 20 ++++++++++++++++++++
>   target/riscv/cpu.h         |  2 ++
>   target/riscv/cpu_decoder.h | 34 ++++++++++++++++++++++++++++++++++
>   target/riscv/translate.c   | 28 ++++++++++++----------------
>   4 files changed, 68 insertions(+), 16 deletions(-)
>   create mode 100644 target/riscv/cpu_decoder.h
> 
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 5ff0192c52..cadcd51b4f 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -38,6 +38,7 @@
>   #include "kvm/kvm_riscv.h"
>   #include "tcg/tcg-cpu.h"
>   #include "tcg/tcg.h"
> +#include "cpu_decoder.h"
>   
>   /* RISC-V CPU definitions */
>   static const char riscv_single_letter_exts[] = "IEMAFDQCBPVH";
> @@ -1102,6 +1103,23 @@ static void riscv_cpu_satp_mode_finalize(RISCVCPU *cpu, Error **errp)
>   }
>   #endif
>   
> +static void riscv_cpu_finalize_dynamic_decoder(RISCVCPU *cpu)
> +{
> +    CPURISCVState *env = &cpu->env;
> +    decode_fn *dynamic_decoder;
> +    dynamic_decoder = g_new0(decode_fn, decoder_table_size);
> +    int j = 0;
> +    for (size_t i = 0; i < decoder_table_size; ++i) {
> +        if (decoder_table[i].guard_func &&
> +            decoder_table[i].guard_func(&cpu->cfg)) {
> +            dynamic_decoder[j] = decoder_table[i].decode_fn;
> +            j++;
> +        }
> +    }
> +
> +    env->decoder = dynamic_decoder;
> +}
> +
>   void riscv_cpu_finalize_features(RISCVCPU *cpu, Error **errp)
>   {
>       Error *local_err = NULL;
> @@ -1127,6 +1145,8 @@ void riscv_cpu_finalize_features(RISCVCPU *cpu, Error **errp)
>               return;
>           }
>       }
> +
> +    riscv_cpu_finalize_dynamic_decoder(cpu);
>   }
>   
>   static void riscv_cpu_realize(DeviceState *dev, Error **errp)
> diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> index 5d291a7092..42bfed065c 100644
> --- a/target/riscv/cpu.h
> +++ b/target/riscv/cpu.h
> @@ -30,6 +30,7 @@
>   #include "qemu/int128.h"
>   #include "cpu_bits.h"
>   #include "cpu_cfg.h"
> +#include "cpu_decoder.h"
>   #include "qapi/qapi-types-common.h"
>   #include "cpu-qom.h"
>   
> @@ -433,6 +434,7 @@ struct CPUArchState {
>       uint64_t kvm_timer_state;
>       uint64_t kvm_timer_frequency;
>   #endif /* CONFIG_KVM */
> +    const decode_fn *decoder;
>   };
>   
>   /*
> diff --git a/target/riscv/cpu_decoder.h b/target/riscv/cpu_decoder.h
> new file mode 100644
> index 0000000000..549414ce4c
> --- /dev/null
> +++ b/target/riscv/cpu_decoder.h
> @@ -0,0 +1,34 @@
> +/*
> + * QEMU RISC-V CPU Decoder
> + *
> + * Copyright (c) 2023-2024 Alibaba Group
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2 or later, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef RISCV_CPU_DECODER_H
> +#define RISCV_CPU_DECODER_H
> +
> +struct DisasContext;
> +struct RISCVCPUConfig;
> +typedef struct RISCVDecoder {
> +    bool (*guard_func)(const struct RISCVCPUConfig *);
> +    bool (*decode_fn)(struct DisasContext *, uint32_t);
> +} RISCVDecoder;
> +
> +typedef bool (*decode_fn)(struct DisasContext *, uint32_t);
> +
> +extern const size_t decoder_table_size;
> +
> +extern const RISCVDecoder decoder_table[];
> +#endif
> diff --git a/target/riscv/translate.c b/target/riscv/translate.c
> index 177418b2b9..23c5321bce 100644
> --- a/target/riscv/translate.c
> +++ b/target/riscv/translate.c
> @@ -115,6 +115,7 @@ typedef struct DisasContext {
>       bool frm_valid;
>       /* TCG of the current insn_start */
>       TCGOp *insn_start;
> +    const decode_fn *decoder;
>   } DisasContext;
>   
>   static inline bool has_ext(DisasContext *ctx, uint32_t ext)
> @@ -1118,21 +1119,16 @@ static inline int insn_len(uint16_t first_word)
>       return (first_word & 3) == 3 ? 4 : 2;
>   }
>   
> +const RISCVDecoder decoder_table[] = {
> +    { always_true_p, decode_insn32 },
> +    { has_xthead_p, decode_xthead},
> +    { has_XVentanaCondOps_p, decode_XVentanaCodeOps},
> +};
> +
> +const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
> +
>   static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
>   {
> -    /*
> -     * A table with predicate (i.e., guard) functions and decoder functions
> -     * that are tested in-order until a decoder matches onto the opcode.
> -     */
> -    static const struct {
> -        bool (*guard_func)(const RISCVCPUConfig *);
> -        bool (*decode_func)(DisasContext *, uint32_t);
> -    } decoders[] = {
> -        { always_true_p,  decode_insn32 },
> -        { has_xthead_p, decode_xthead },
> -        { has_XVentanaCondOps_p,  decode_XVentanaCodeOps },
> -    };
> -
>       ctx->virt_inst_excp = false;
>       ctx->cur_insn_len = insn_len(opcode);
>       /* Check for compressed insn */
> @@ -1153,9 +1149,8 @@ static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
>                                                ctx->base.pc_next + 2));
>           ctx->opcode = opcode32;
>   
> -        for (size_t i = 0; i < ARRAY_SIZE(decoders); ++i) {
> -            if (decoders[i].guard_func(ctx->cfg_ptr) &&
> -                decoders[i].decode_func(ctx, opcode32)) {
> +        for (size_t i = 0; i < decoder_table_size; ++i) {
> +            if (ctx->decoder[i](ctx, opcode32)) {
>                   return;
>               }
>           }
> @@ -1199,6 +1194,7 @@ static void riscv_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
>       ctx->itrigger = FIELD_EX32(tb_flags, TB_FLAGS, ITRIGGER);
>       ctx->zero = tcg_constant_tl(0);
>       ctx->virt_inst_excp = false;
> +    ctx->decoder = env->decoder;


Up there, in riscv_cpu_finalize_dynamic_decoder(), we assigned a dynamic decoder in
env->decoder. The DisasContext 'decoder' pointer is then assigned to env->decoder in
every tr_init_disas_context() here.

'ctx->decoder' is used in decode_opc(), that receives a CPURISCVState *env pointer as
a parameter.

Am I missing something? It seems we can just remove the ctx->decode pointer altogether
and use env->decoder.


Thanks,

Daniel



>   }
>   
>   static void riscv_tr_tb_start(DisasContextBase *db, CPUState *cpu)
Richard Henderson March 7, 2024, 8:11 p.m. UTC | #2
On 3/7/24 09:55, Daniel Henrique Barboza wrote:
> (--- adding Richard ---)
> 
> 
> On 3/6/24 06:33, Huang Tao wrote:
>> In this patch, we modify the decoder to be a freely composable data
>> structure instead of a hardcoded one. It can be dynamically builded up
>> according to the extensions.
>> This approach has several benefits:
>> 1. Provides support for heterogeneous cpu architectures. As we add decoder in
>>     CPUArchState, each cpu can have their own decoder, and the decoders can be
>>     different due to cpu's features.
>> 2. Improve the decoding efficiency. We run the guard_func to see if the decoder
>>     can be added to the dynamic_decoder when building up the decoder. Therefore,
>>     there is no need to run the guard_func when decoding each instruction. It can
>>     improve the decoding efficiency
>> 3. For vendor or dynamic cpus, it allows them to customize their own decoder
>>     functions to improve decoding efficiency, especially when vendor-defined
>>     instruction sets increase. Because of dynamic building up, it can skip the other
>>     decoder guard functions when decoding.
>>
>> Signed-off-by: Huang Tao <eric.huang@linux.alibaba.com>
>> Suggested-by: Christoph Muellner <christoph.muellner@vrull.eu>
>> Co-authored-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
>> ---
>>   target/riscv/cpu.c         | 20 ++++++++++++++++++++
>>   target/riscv/cpu.h         |  2 ++
>>   target/riscv/cpu_decoder.h | 34 ++++++++++++++++++++++++++++++++++
>>   target/riscv/translate.c   | 28 ++++++++++++----------------
>>   4 files changed, 68 insertions(+), 16 deletions(-)
>>   create mode 100644 target/riscv/cpu_decoder.h
>>
>> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
>> index 5ff0192c52..cadcd51b4f 100644
>> --- a/target/riscv/cpu.c
>> +++ b/target/riscv/cpu.c
>> @@ -38,6 +38,7 @@
>>   #include "kvm/kvm_riscv.h"
>>   #include "tcg/tcg-cpu.h"
>>   #include "tcg/tcg.h"
>> +#include "cpu_decoder.h"
>>   /* RISC-V CPU definitions */
>>   static const char riscv_single_letter_exts[] = "IEMAFDQCBPVH";
>> @@ -1102,6 +1103,23 @@ static void riscv_cpu_satp_mode_finalize(RISCVCPU *cpu, Error 
>> **errp)
>>   }
>>   #endif
>> +static void riscv_cpu_finalize_dynamic_decoder(RISCVCPU *cpu)
>> +{
>> +    CPURISCVState *env = &cpu->env;
>> +    decode_fn *dynamic_decoder;
>> +    dynamic_decoder = g_new0(decode_fn, decoder_table_size);
>> +    int j = 0;
>> +    for (size_t i = 0; i < decoder_table_size; ++i) {
>> +        if (decoder_table[i].guard_func &&
>> +            decoder_table[i].guard_func(&cpu->cfg)) {
>> +            dynamic_decoder[j] = decoder_table[i].decode_fn;
>> +            j++;
>> +        }
>> +    }
>> +
>> +    env->decoder = dynamic_decoder;

You should save J into ENV as well, or use GPtrArray which would carry the length along 
with it naturally.

Is the cpu configuration on each cpu, or on the cpu class?

Even if the configuration is per cpu, this new element belongs in ArchCPU not 
CPUArchState.  Usually all of CPUArchState is zeroed during reset.

>> @@ -1153,9 +1149,8 @@ static void decode_opc(CPURISCVState *env, DisasContext *ctx, 
>> uint16_t opcode)
>>                                                ctx->base.pc_next + 2));
>>           ctx->opcode = opcode32;
>> -        for (size_t i = 0; i < ARRAY_SIZE(decoders); ++i) {
>> -            if (decoders[i].guard_func(ctx->cfg_ptr) &&
>> -                decoders[i].decode_func(ctx, opcode32)) {
>> +        for (size_t i = 0; i < decoder_table_size; ++i) {
>> +            if (ctx->decoder[i](ctx, opcode32)) {
>>                   return;

You definitely should not be using decoder_table_size here, because you eliminated 
elements, which are in fact NULL here.

> Am I missing something? It seems we can just remove the ctx->decode pointer altogether
> and use env->decoder.

We try to avoid placing env into DisasContext, so that it is much harder to make the 
mistake of referencing env fields at translation-time, when you really needed to generate 
tcg code to reference the fields at runtime.



r~
Richard Henderson March 7, 2024, 8:35 p.m. UTC | #3
>>> -        for (size_t i = 0; i < ARRAY_SIZE(decoders); ++i) {
>>> -            if (decoders[i].guard_func(ctx->cfg_ptr) &&
>>> -                decoders[i].decode_func(ctx, opcode32)) {
>>> +        for (size_t i = 0; i < decoder_table_size; ++i) {
>>> +            if (ctx->decoder[i](ctx, opcode32)) {
>>>                   return;

By the way, you're adding layers of pointer chasing, so I suspect you'll find all of this 
is a wash or worse, performance-wise.

Indeed, since some of the predicates are trivial, going the other way might help: allow 
everything to be inlined:

     if (decode_insn32(...)) {
         return;
     }
     if (has_xthead_p(...) && decode_xthead(...)) {
         return;
     }
     ...

Even if there are 10 entries here, so what?  All of the code has to be compiled into QEMU. 
  You're not going to somehow add truly dynamic code that you've loaded from a module.

You could perhaps streamline predicates such as has_xthead_p to not test 11 variables by 
adding an artificial "ext_xthead_any" configuration entry that is the sum of all of those.


r~
Daniel Henrique Barboza March 7, 2024, 9:43 p.m. UTC | #4
On 3/7/24 17:11, Richard Henderson wrote:
> On 3/7/24 09:55, Daniel Henrique Barboza wrote:
>> (--- adding Richard ---)
>>
>>
>> On 3/6/24 06:33, Huang Tao wrote:
>>> In this patch, we modify the decoder to be a freely composable data
>>> structure instead of a hardcoded one. It can be dynamically builded up
>>> according to the extensions.
>>> This approach has several benefits:
>>> 1. Provides support for heterogeneous cpu architectures. As we add decoder in
>>>     CPUArchState, each cpu can have their own decoder, and the decoders can be
>>>     different due to cpu's features.
>>> 2. Improve the decoding efficiency. We run the guard_func to see if the decoder
>>>     can be added to the dynamic_decoder when building up the decoder. Therefore,
>>>     there is no need to run the guard_func when decoding each instruction. It can
>>>     improve the decoding efficiency
>>> 3. For vendor or dynamic cpus, it allows them to customize their own decoder
>>>     functions to improve decoding efficiency, especially when vendor-defined
>>>     instruction sets increase. Because of dynamic building up, it can skip the other
>>>     decoder guard functions when decoding.
>>>
>>> Signed-off-by: Huang Tao <eric.huang@linux.alibaba.com>
>>> Suggested-by: Christoph Muellner <christoph.muellner@vrull.eu>
>>> Co-authored-by: LIU Zhiwei <zhiwei_liu@linux.alibaba.com>
>>> ---
>>>   target/riscv/cpu.c         | 20 ++++++++++++++++++++
>>>   target/riscv/cpu.h         |  2 ++
>>>   target/riscv/cpu_decoder.h | 34 ++++++++++++++++++++++++++++++++++
>>>   target/riscv/translate.c   | 28 ++++++++++++----------------
>>>   4 files changed, 68 insertions(+), 16 deletions(-)
>>>   create mode 100644 target/riscv/cpu_decoder.h
>>>
>>> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
>>> index 5ff0192c52..cadcd51b4f 100644
>>> --- a/target/riscv/cpu.c
>>> +++ b/target/riscv/cpu.c
>>> @@ -38,6 +38,7 @@
>>>   #include "kvm/kvm_riscv.h"
>>>   #include "tcg/tcg-cpu.h"
>>>   #include "tcg/tcg.h"
>>> +#include "cpu_decoder.h"
>>>   /* RISC-V CPU definitions */
>>>   static const char riscv_single_letter_exts[] = "IEMAFDQCBPVH";
>>> @@ -1102,6 +1103,23 @@ static void riscv_cpu_satp_mode_finalize(RISCVCPU *cpu, Error **errp)
>>>   }
>>>   #endif
>>> +static void riscv_cpu_finalize_dynamic_decoder(RISCVCPU *cpu)
>>> +{
>>> +    CPURISCVState *env = &cpu->env;
>>> +    decode_fn *dynamic_decoder;
>>> +    dynamic_decoder = g_new0(decode_fn, decoder_table_size);
>>> +    int j = 0;
>>> +    for (size_t i = 0; i < decoder_table_size; ++i) {
>>> +        if (decoder_table[i].guard_func &&
>>> +            decoder_table[i].guard_func(&cpu->cfg)) {
>>> +            dynamic_decoder[j] = decoder_table[i].decode_fn;
>>> +            j++;
>>> +        }
>>> +    }
>>> +
>>> +    env->decoder = dynamic_decoder;

In time: I think we should rename this to 'decoders' to make it easier to figure out
that it's an array instead of a single element. Same thing with the ctx->decoder pointer.

> 
> You should save J into ENV as well, or use GPtrArray which would carry the length along with it naturally.
> 
> Is the cpu configuration on each cpu, or on the cpu class?
> 
> Even if the configuration is per cpu, this new element belongs in ArchCPU not CPUArchState.  Usually all of CPUArchState is zeroed during reset.
> 
>>> @@ -1153,9 +1149,8 @@ static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
>>>                                                ctx->base.pc_next + 2));
>>>           ctx->opcode = opcode32;
>>> -        for (size_t i = 0; i < ARRAY_SIZE(decoders); ++i) {
>>> -            if (decoders[i].guard_func(ctx->cfg_ptr) &&
>>> -                decoders[i].decode_func(ctx, opcode32)) {
>>> +        for (size_t i = 0; i < decoder_table_size; ++i) {
>>> +            if (ctx->decoder[i](ctx, opcode32)) {
>>>                   return;
> 
> You definitely should not be using decoder_table_size here, because you eliminated elements, which are in fact NULL here.
> 
>> Am I missing something? It seems we can just remove the ctx->decode pointer altogether
>> and use env->decoder.
> 
> We try to avoid placing env into DisasContext, so that it is much harder to make the mistake of referencing env fields at translation-time, when you really needed to generate tcg code to reference the fields at runtime.

Right. And you mentioned that ArchState isn't the ideal place and we should put the
decoders in ArchCPU, so there's that.

In this case, if we were not to use ctx->decoders, we would need to retrieve a RISCVCPU
in riscv_tr_translate_insn() to get access to them ....


I'd rather keep ctx->decoders and assign it in tr_init_disas() since that function already
uses a RISCVCPU pointer. The extra pointer in DisasContext seems worthy in this case.


Thanks,

Daniel





> 
> 
> 
> r~
>
Huang Tao March 8, 2024, 9:41 a.m. UTC | #5
Hello, Richard and Daniel,

Thanks to your review, the suggestions about decoder_table_size and 
decoder's place will be adopted in the next version of the patch.

But I would not agree that this patch is a wash or worse. On average, 
though, the two approaches may be comparable. However, as more and more 
vendors join in, this approach will have scalability issues.

For example, if you add 10 vendors, it is not fair to treat the 10th 
vendor with the lowest performance. Our approach works for most 
scenarios, which are basic RV extensions + vendor-specific extensions.

Thanks,

Huang Tao


在 2024/3/8 04:35, Richard Henderson 写道:
>>>> -        for (size_t i = 0; i < ARRAY_SIZE(decoders); ++i) {
>>>> -            if (decoders[i].guard_func(ctx->cfg_ptr) &&
>>>> -                decoders[i].decode_func(ctx, opcode32)) {
>>>> +        for (size_t i = 0; i < decoder_table_size; ++i) {
>>>> +            if (ctx->decoder[i](ctx, opcode32)) {
>>>>                   return;
>
> By the way, you're adding layers of pointer chasing, so I suspect 
> you'll find all of this is a wash or worse, performance-wise.
>
> Indeed, since some of the predicates are trivial, going the other way 
> might help: allow everything to be inlined:
>
>     if (decode_insn32(...)) {
>         return;
>     }
>     if (has_xthead_p(...) && decode_xthead(...)) {
>         return;
>     }
>     ...
>
> Even if there are 10 entries here, so what?  All of the code has to be 
> compiled into QEMU.  You're not going to somehow add truly dynamic 
> code that you've loaded from a module.
>
> You could perhaps streamline predicates such as has_xthead_p to not 
> test 11 variables by adding an artificial "ext_xthead_any" 
> configuration entry that is the sum of all of those.
>
>
> r~
Christoph Müllner March 8, 2024, 11:56 a.m. UTC | #6
On Thu, Mar 7, 2024 at 9:35 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> >>> -        for (size_t i = 0; i < ARRAY_SIZE(decoders); ++i) {
> >>> -            if (decoders[i].guard_func(ctx->cfg_ptr) &&
> >>> -                decoders[i].decode_func(ctx, opcode32)) {
> >>> +        for (size_t i = 0; i < decoder_table_size; ++i) {
> >>> +            if (ctx->decoder[i](ctx, opcode32)) {
> >>>                   return;
>
> By the way, you're adding layers of pointer chasing, so I suspect you'll find all of this
> is a wash or worse, performance-wise.
>
>
> Indeed, since some of the predicates are trivial, going the other way might help: allow
> everything to be inlined:
>
>      if (decode_insn32(...)) {
>          return;
>      }
>      if (has_xthead_p(...) && decode_xthead(...)) {
>          return;
>      }
>      ...
>
>
> Even if there are 10 entries here, so what?  All of the code has to be compiled into QEMU.
>   You're not going to somehow add truly dynamic code that you've loaded from a module.

I just tested this with GCC -O2/-O3. The generated code from the
existing decoder loop will
result in exactly what you have listed here (loop unrolling,
transforming the indirect calls
to direct calls, inlining, and evaluation of statically known conditions).
has_xthead_p() can get inlined as well, if the inlining costs are
considered low enough
(thank you, Richard, for giving some hints about that below).

What the commit message is not mentioning (and what this patch was
actually addressing and
therefore should have been mentioned):
Having dynamic control of the decoder order was meant to allow adding
a vendor_decoder
before decode_insn32() with minimal overhead (no guard_func) for users
that don't need
this particular vendor_decoder.

Being more explicit: there is interest in supporting the
non-conforming (conflicting) instruction
extension XTheadVector:
  https://github.com/T-head-Semi/thead-extension-spec/blob/master/xtheadvector.adoc
XTheadVector uses the RVV 0.7.1 draft encoding, which conflicts with
the ratified RVV spec.
The idea is to avoid these conflicts with a call to
decode_xtheadvector() before decode_insn32().
This implies that everyone has to call has_xtheadvector_p() before
calling decode_insn32().
And the intent of this patch is to provide a mechanism to reduce this overhead.

When suggesting the dynamic decoder list, I was not aware that
always_true_p() will be
eliminated by the compiler. Since this is the case, I agree with the
"wash or worse" for
decode_insn32(). The elimination of following guard functions for
vendor decoders is likely
less performance relevant.

I don't think we should discuss efficiency any further unless we have
some data to
justify any changes. E.g. emulating a RISC-V SPEC CPU 2017 run on x86_64 and
looking at the runtimes could give the relevant insights for the
following scenarios:
* existing code on upstream/master
* existing code + adding a new extension that comes before
decode_insn32 in decoders[]
* existing code + this patch (dynamic decoders)
Related overheads that could be measured: adding 20 new instructions
to decode_insn32(),
which are not executed (to put the costs into perspective).

> You could perhaps streamline predicates such as has_xthead_p to not test 11 variables by
> adding an artificial "ext_xthead_any" configuration entry that is the sum of all of those.
>
>
> r~
diff mbox series

Patch

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 5ff0192c52..cadcd51b4f 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -38,6 +38,7 @@ 
 #include "kvm/kvm_riscv.h"
 #include "tcg/tcg-cpu.h"
 #include "tcg/tcg.h"
+#include "cpu_decoder.h"
 
 /* RISC-V CPU definitions */
 static const char riscv_single_letter_exts[] = "IEMAFDQCBPVH";
@@ -1102,6 +1103,23 @@  static void riscv_cpu_satp_mode_finalize(RISCVCPU *cpu, Error **errp)
 }
 #endif
 
+static void riscv_cpu_finalize_dynamic_decoder(RISCVCPU *cpu)
+{
+    CPURISCVState *env = &cpu->env;
+    decode_fn *dynamic_decoder;
+    dynamic_decoder = g_new0(decode_fn, decoder_table_size);
+    int j = 0;
+    for (size_t i = 0; i < decoder_table_size; ++i) {
+        if (decoder_table[i].guard_func &&
+            decoder_table[i].guard_func(&cpu->cfg)) {
+            dynamic_decoder[j] = decoder_table[i].decode_fn;
+            j++;
+        }
+    }
+
+    env->decoder = dynamic_decoder;
+}
+
 void riscv_cpu_finalize_features(RISCVCPU *cpu, Error **errp)
 {
     Error *local_err = NULL;
@@ -1127,6 +1145,8 @@  void riscv_cpu_finalize_features(RISCVCPU *cpu, Error **errp)
             return;
         }
     }
+
+    riscv_cpu_finalize_dynamic_decoder(cpu);
 }
 
 static void riscv_cpu_realize(DeviceState *dev, Error **errp)
diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 5d291a7092..42bfed065c 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -30,6 +30,7 @@ 
 #include "qemu/int128.h"
 #include "cpu_bits.h"
 #include "cpu_cfg.h"
+#include "cpu_decoder.h"
 #include "qapi/qapi-types-common.h"
 #include "cpu-qom.h"
 
@@ -433,6 +434,7 @@  struct CPUArchState {
     uint64_t kvm_timer_state;
     uint64_t kvm_timer_frequency;
 #endif /* CONFIG_KVM */
+    const decode_fn *decoder;
 };
 
 /*
diff --git a/target/riscv/cpu_decoder.h b/target/riscv/cpu_decoder.h
new file mode 100644
index 0000000000..549414ce4c
--- /dev/null
+++ b/target/riscv/cpu_decoder.h
@@ -0,0 +1,34 @@ 
+/*
+ * QEMU RISC-V CPU Decoder
+ *
+ * Copyright (c) 2023-2024 Alibaba Group
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef RISCV_CPU_DECODER_H
+#define RISCV_CPU_DECODER_H
+
+struct DisasContext;
+struct RISCVCPUConfig;
+typedef struct RISCVDecoder {
+    bool (*guard_func)(const struct RISCVCPUConfig *);
+    bool (*decode_fn)(struct DisasContext *, uint32_t);
+} RISCVDecoder;
+
+typedef bool (*decode_fn)(struct DisasContext *, uint32_t);
+
+extern const size_t decoder_table_size;
+
+extern const RISCVDecoder decoder_table[];
+#endif
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index 177418b2b9..23c5321bce 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -115,6 +115,7 @@  typedef struct DisasContext {
     bool frm_valid;
     /* TCG of the current insn_start */
     TCGOp *insn_start;
+    const decode_fn *decoder;
 } DisasContext;
 
 static inline bool has_ext(DisasContext *ctx, uint32_t ext)
@@ -1118,21 +1119,16 @@  static inline int insn_len(uint16_t first_word)
     return (first_word & 3) == 3 ? 4 : 2;
 }
 
+const RISCVDecoder decoder_table[] = {
+    { always_true_p, decode_insn32 },
+    { has_xthead_p, decode_xthead},
+    { has_XVentanaCondOps_p, decode_XVentanaCodeOps},
+};
+
+const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
+
 static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
 {
-    /*
-     * A table with predicate (i.e., guard) functions and decoder functions
-     * that are tested in-order until a decoder matches onto the opcode.
-     */
-    static const struct {
-        bool (*guard_func)(const RISCVCPUConfig *);
-        bool (*decode_func)(DisasContext *, uint32_t);
-    } decoders[] = {
-        { always_true_p,  decode_insn32 },
-        { has_xthead_p, decode_xthead },
-        { has_XVentanaCondOps_p,  decode_XVentanaCodeOps },
-    };
-
     ctx->virt_inst_excp = false;
     ctx->cur_insn_len = insn_len(opcode);
     /* Check for compressed insn */
@@ -1153,9 +1149,8 @@  static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
                                              ctx->base.pc_next + 2));
         ctx->opcode = opcode32;
 
-        for (size_t i = 0; i < ARRAY_SIZE(decoders); ++i) {
-            if (decoders[i].guard_func(ctx->cfg_ptr) &&
-                decoders[i].decode_func(ctx, opcode32)) {
+        for (size_t i = 0; i < decoder_table_size; ++i) {
+            if (ctx->decoder[i](ctx, opcode32)) {
                 return;
             }
         }
@@ -1199,6 +1194,7 @@  static void riscv_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
     ctx->itrigger = FIELD_EX32(tb_flags, TB_FLAGS, ITRIGGER);
     ctx->zero = tcg_constant_tl(0);
     ctx->virt_inst_excp = false;
+    ctx->decoder = env->decoder;
 }
 
 static void riscv_tr_tb_start(DisasContextBase *db, CPUState *cpu)