Message ID | 20240717133936.713642-6-max.chou@sifive.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Improve the performance of RISC-V vector unit-stride/whole register ld/st instructions | expand |
On 7/17/24 23:39, Max Chou wrote: > In the vector unit-stride load/store helper functions. the vext_ldst_us > & vext_ldst_whole functions corresponding most of the execution time. > Inline the functions can avoid the function call overhead to improve the > helper function performance. > > Signed-off-by: Max Chou <max.chou@sifive.com> > Reviewed-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/riscv/vector_helper.c | 56 +++++++++++++++++++----------------- > 1 file changed, 30 insertions(+), 26 deletions(-) You'll want to mark vext_page_ldst_us similarly. r~ > > diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c > index 2e675b4220c..95394c425ed 100644 > --- a/target/riscv/vector_helper.c > +++ b/target/riscv/vector_helper.c > @@ -150,18 +150,20 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr, > uint32_t idx, void *vd, uintptr_t retaddr); > typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host); > > -#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \ > -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ > - uint32_t byte_off, void *vd, uintptr_t retaddr) \ > -{ \ > - ETYPE *cur = vd + byte_off; \ > - *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \ > -} \ > - \ > -static void NAME##_host(void *vd, uint32_t byte_off, void *host) \ > -{ \ > - ETYPE val = LDSUF##_p(host); \ > - *(ETYPE *)(vd + byte_off) = val; \ > +#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \ > +static inline QEMU_ALWAYS_INLINE \ > +void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ > + uint32_t byte_off, void *vd, uintptr_t retaddr) \ > +{ \ > + ETYPE *cur = vd + byte_off; \ > + *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \ > +} \ > + \ > +static inline QEMU_ALWAYS_INLINE \ > +void NAME##_host(void *vd, uint32_t byte_off, void *host) \ > +{ \ > + ETYPE val = LDSUF##_p(host); \ > + *(ETYPE *)(vd + byte_off) = val; \ > } > > GEN_VEXT_LD_ELEM(lde_b, uint8_t, H1, ldub) > @@ -169,18 +171,20 @@ 
GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw) > GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl) > GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq) > > -#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \ > -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ > - uint32_t byte_off, void *vd, uintptr_t retaddr) \ > -{ \ > - ETYPE data = *(ETYPE *)(vd + byte_off); \ > - cpu_##STSUF##_data_ra(env, addr, data, retaddr); \ > -} \ > - \ > -static void NAME##_host(void *vd, uint32_t byte_off, void *host) \ > -{ \ > - ETYPE val = *(ETYPE *)(vd + byte_off); \ > - STSUF##_p(host, val); \ > +#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \ > +static inline QEMU_ALWAYS_INLINE \ > +void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ > + uint32_t byte_off, void *vd, uintptr_t retaddr) \ > +{ \ > + ETYPE data = *(ETYPE *)(vd + byte_off); \ > + cpu_##STSUF##_data_ra(env, addr, data, retaddr); \ > +} \ > + \ > +static inline QEMU_ALWAYS_INLINE \ > +void NAME##_host(void *vd, uint32_t byte_off, void *host) \ > +{ \ > + ETYPE val = *(ETYPE *)(vd + byte_off); \ > + STSUF##_p(host, val); \ > } > > GEN_VEXT_ST_ELEM(ste_b, uint8_t, H1, stb) > @@ -366,7 +370,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr, > } > } > > -static void > +static inline QEMU_ALWAYS_INLINE void > vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, > vext_ldst_elem_fn_tlb *ldst_tlb, > vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, > @@ -695,7 +699,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb) > /* > * load and store whole register instructions > */ > -static void > +static inline QEMU_ALWAYS_INLINE void > vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, > vext_ldst_elem_fn_tlb *ldst_tlb, > vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
On 2024/7/25 2:05 PM, Richard Henderson wrote: > On 7/17/24 23:39, Max Chou wrote: >> In the vector unit-stride load/store helper functions. the vext_ldst_us >> & vext_ldst_whole functions corresponding most of the execution time. >> Inline the functions can avoid the function call overhead to improve the >> helper function performance. >> >> Signed-off-by: Max Chou <max.chou@sifive.com> >> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> >> --- >> target/riscv/vector_helper.c | 56 +++++++++++++++++++----------------- >> 1 file changed, 30 insertions(+), 26 deletions(-) > > You'll want to mark vext_page_ldst_us similarly. > > > r~ Yes, I'll mark vext_page_ldst_us similarly in v6. Thanks. Max.
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 2e675b4220c..95394c425ed 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -150,18 +150,20 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr, uint32_t idx, void *vd, uintptr_t retaddr); typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host); -#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \ -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ - uint32_t byte_off, void *vd, uintptr_t retaddr) \ -{ \ - ETYPE *cur = vd + byte_off; \ - *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \ -} \ - \ -static void NAME##_host(void *vd, uint32_t byte_off, void *host) \ -{ \ - ETYPE val = LDSUF##_p(host); \ - *(ETYPE *)(vd + byte_off) = val; \ +#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \ +static inline QEMU_ALWAYS_INLINE \ +void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ + uint32_t byte_off, void *vd, uintptr_t retaddr) \ +{ \ + ETYPE *cur = vd + byte_off; \ + *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \ +} \ + \ +static inline QEMU_ALWAYS_INLINE \ +void NAME##_host(void *vd, uint32_t byte_off, void *host) \ +{ \ + ETYPE val = LDSUF##_p(host); \ + *(ETYPE *)(vd + byte_off) = val; \ } GEN_VEXT_LD_ELEM(lde_b, uint8_t, H1, ldub) @@ -169,18 +171,20 @@ GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw) GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl) GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq) -#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \ -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ - uint32_t byte_off, void *vd, uintptr_t retaddr) \ -{ \ - ETYPE data = *(ETYPE *)(vd + byte_off); \ - cpu_##STSUF##_data_ra(env, addr, data, retaddr); \ -} \ - \ -static void NAME##_host(void *vd, uint32_t byte_off, void *host) \ -{ \ - ETYPE val = *(ETYPE *)(vd + byte_off); \ - STSUF##_p(host, val); \ +#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \ +static inline QEMU_ALWAYS_INLINE \ +void NAME##_tlb(CPURISCVState *env, 
abi_ptr addr, \ + uint32_t byte_off, void *vd, uintptr_t retaddr) \ +{ \ + ETYPE data = *(ETYPE *)(vd + byte_off); \ + cpu_##STSUF##_data_ra(env, addr, data, retaddr); \ +} \ + \ +static inline QEMU_ALWAYS_INLINE \ +void NAME##_host(void *vd, uint32_t byte_off, void *host) \ +{ \ + ETYPE val = *(ETYPE *)(vd + byte_off); \ + STSUF##_p(host, val); \ } GEN_VEXT_ST_ELEM(ste_b, uint8_t, H1, stb) @@ -366,7 +370,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr, } } -static void +static inline QEMU_ALWAYS_INLINE void vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb, vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, @@ -695,7 +699,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb) /* * load and store whole register instructions */ -static void +static inline QEMU_ALWAYS_INLINE void vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb, vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,