Message ID | 20241220122109.2083215-2-craig.blackmore@embecosm.com (mailing list archive)
---|---
State | New
Series | target/riscv: rvv: Use wider accesses for unit stride load/store
On 12/20/24 04:21, Craig Blackmore wrote:
> Use atomic load/store functions to access multiple elements from host.
>
> Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
>
> Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
> Signed-off-by: Craig Blackmore <craig.blackmore@embecosm.com>
> ---
>  target/riscv/vector_helper.c | 107 +++++++++++++++++++++++++++++++++--
>  1 file changed, 101 insertions(+), 6 deletions(-)
>
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index a85dd1d200..c0179165ce 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -206,10 +206,102 @@ vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
>      }
>  }
>
> +#if !HOST_BIG_ENDIAN
> +/* Atomic operations for load/store */
> +
> +/*
> + * Return true if there are enough elements for this size access and the
> + * pointer is aligned to the required atomicity.
> + */
> +
> +static inline QEMU_ALWAYS_INLINE bool
> +ok_for_atomic(uint32_t size, void *host, uint32_t reg_start, uint32_t evl,
> +              uint32_t log2_esz)
> +{
> +    return (reg_start + (size >> log2_esz)) <= evl
> +           && ((uintptr_t) host % size) == 0;
> +}
> +
> +#define GEN_VEXT_LDST_ATOMIC_HOST(SIZE, TYPE)                              \
> +static inline QEMU_ALWAYS_INLINE void                                      \
> +vext_ldst_atom_##SIZE##_host(void *vd, uint32_t byte_offset, TYPE *host,   \
> +                             bool is_load)                                  \
> +{                                                                           \
> +    TYPE *vd_ptr = (TYPE *) (vd + byte_offset);                             \
> +    if (is_load) {                                                          \
> +        *vd_ptr = qatomic_read__nocheck(host);                              \
> +    } else {                                                                \
> +        qatomic_set__nocheck(host, *vd_ptr);                                \
> +    }                                                                       \
> +}                                                                           \
> +
> +GEN_VEXT_LDST_ATOMIC_HOST(2, uint16_t)
> +GEN_VEXT_LDST_ATOMIC_HOST(4, uint32_t)
> +#ifdef CONFIG_ATOMIC64
> +GEN_VEXT_LDST_ATOMIC_HOST(8, uint64_t)
> +#endif
> +
> +#ifdef CONFIG_INT128_TYPE
> +static inline QEMU_ALWAYS_INLINE void
> +vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
> +                       bool is_load)
> +{
> +    Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
> +    if (is_load) {
> +        *vd_ptr = atomic16_read_ro(host);
> +    } else {
> +        atomic16_set(host, *vd_ptr);
> +    }
> +}
> +#endif
> +#endif

Using CONFIG_INT128_TYPE is peeking under the hood a bit too far.
As it happens, the ifdef should not be required, because the
atomic16_{read,set} symbols are always present.

> +#ifdef CONFIG_INT128_TYPE
> +    if (((is_load && HAVE_ATOMIC128_RO) || (!is_load && HAVE_ATOMIC128_RW))
> +        && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
> +        vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
> +        return 16;
> +    }
> +#endif

The tricky part is that if !HAVE_ATOMIC128_RO, then atomic16_set may be
implemented with a 16-byte compare and store loop.  This is expensive,
and you certainly don't want to use that.  You'd prefer two 8-byte stores.
So I think the condition should be

    if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW))

with the above paragraph added as a comment.

That said, RVV does not have 128-bit elements, so at no point do you
actually require 128-bit atomicity.
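Concretely, a minimal sketch of how that branch might read with the
suggested condition and the rationale recorded as a comment.  It reuses
the helpers and variables from the hunk above; the comment wording is
only illustrative:

    /*
     * Without HAVE_ATOMIC128_RO, atomic16_set may fall back to a 16-byte
     * compare-and-store loop, which is more expensive than two 8-byte
     * stores, so only take this path when true 16-byte atomics exist.
     */
    if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW)
        && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
        vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
        return 16;
    }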
> +
> +#ifdef CONFIG_ATOMIC64
> +    if (ok_for_atomic(8, host, reg_start, evl, log2_esz)) {
> +        vext_ldst_atom_8_host(vd, byte_offset, host, is_load);
> +        return 8;
> +    }
> +#endif
> +
> +    if (ok_for_atomic(4, host, reg_start, evl, log2_esz)) {
> +        vext_ldst_atom_4_host(vd, byte_offset, host, is_load);
> +        return 4;
> +    }
> +
> +    if (ok_for_atomic(2, host, reg_start, evl, log2_esz)) {
> +        vext_ldst_atom_2_host(vd, byte_offset, host, is_load);
> +        return 2;
> +    }
> +#endif
> +
> +    ldst_host(vd, reg_start, host);
> +    return 1;
> +}
> +
>  static inline QEMU_ALWAYS_INLINE void
>  vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
>                          void *vd, uint32_t evl, uint32_t reg_start, void *host,
> -                        uint32_t esz, bool is_load)
> +                        uint32_t esz, bool is_load, uint32_t log2_esz)
>  {
>  #if HOST_BIG_ENDIAN
>      for (; reg_start < evl; reg_start++, host += esz) {
> @@ -225,10 +317,13 @@ vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
>          } else {
>              memcpy(host, vd + byte_offset, size);
>          }
> -    } else {
> -        for (; reg_start < evl; reg_start++, host += esz) {
> -            ldst_host(vd, reg_start, host);
> -        }
> +        return;
> +    }
> +    uint32_t chunk = 0;
> +    for (; reg_start < evl; reg_start += (chunk >> log2_esz),
> +                            host += chunk) {
> +        chunk = vext_ldst_atomic_chunk_host(env, ldst_host, vd, evl, reg_start,
> +                                            host, esz, is_load, log2_esz);
>      }

My guess is the arrangement of the loop is not great, in that ok_for_atomic
is buried at the bottom of the call stack within this loop.  It would be
better to test for alignment once, here at the outside.  If the alignment
is below esz, then the individual elements are misaligned, at which point
the architecture does not require *any* atomicity.  In that case, I believe
you could use the memcpy path just above.

Perhaps something like

    uint32_t byte_offset = reg_start * esz;
    uint32_t size = (evl - reg_start) * esz;
    uint32_t test = (uintptr_t)host;

    if (esz == 1 || unlikely(test % esz != 0)) {
        if (is_load) {
            memcpy(vd + byte_offset, host, size);
        } else {
            memcpy(host, vd + byte_offset, size);
        }
        return;
    }

    /* Test that both alignment and size are multiples. */
    test |= size;

    if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW)
        && (test % 16 == 0)) {
        do {
            vext_ldst_atom_16_host(vd, reg_offset, host, is_load);
            reg_offset += 16 >> log2_esz;
            host += 16;
        } while (reg_offset < evl);
        return;
    }

    if (esz < 8 && test % 8 == 0) {
        ...
    }
    ...

    for (; reg_start < evl; reg_start++, host += esz) {
        ldst_host(vd, reg_start, host);
    }


r~
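For reference, a minimal, self-contained sketch of the structure being
suggested: test the host alignment and the total byte count once, then
copy the whole range at a single fixed width.  The GCC/Clang __atomic
builtins stand in for QEMU's qatomic_*__nocheck helpers, the 16-byte case
is omitted, and the function and variable names are illustrative rather
than taken from the patch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /*
     * Copy one naturally aligned element between the register buffer and
     * host memory, atomic on the host side only.
     */
    #define DEF_ATOM_COPY(SIZE, TYPE)                                   \
    static void atom_copy_##SIZE(void *vd, void *host, bool is_load)   \
    {                                                                   \
        TYPE *v = vd;                                                   \
        TYPE *h = host;                                                 \
        if (is_load) {                                                  \
            *v = __atomic_load_n(h, __ATOMIC_RELAXED);                  \
        } else {                                                        \
            __atomic_store_n(h, *v, __ATOMIC_RELAXED);                  \
        }                                                               \
    }

    DEF_ATOM_COPY(2, uint16_t)
    DEF_ATOM_COPY(4, uint32_t)
    DEF_ATOM_COPY(8, uint64_t)

    /*
     * Pick the widest chunk that divides both the host pointer and the
     * total size, then loop at that fixed width; fall back to memcpy when
     * not even 2-byte chunks fit (misaligned elements need no atomicity).
     */
    static void copy_chunks(void *vd, void *host, size_t size, bool is_load)
    {
        uintptr_t test = (uintptr_t)host | size;

        if (test % 8 == 0) {
            for (size_t i = 0; i < size; i += 8) {
                atom_copy_8((char *)vd + i, (char *)host + i, is_load);
            }
        } else if (test % 4 == 0) {
            for (size_t i = 0; i < size; i += 4) {
                atom_copy_4((char *)vd + i, (char *)host + i, is_load);
            }
        } else if (test % 2 == 0) {
            for (size_t i = 0; i < size; i += 2) {
                atom_copy_2((char *)vd + i, (char *)host + i, is_load);
            }
        } else if (is_load) {
            memcpy(vd, host, size);
        } else {
            memcpy(host, vd, size);
        }
    }

    int main(void)
    {
        uint64_t host_mem[4] = { 1, 2, 3, 4 };
        uint64_t reg[4] = { 0 };

        copy_chunks(reg, host_mem, sizeof(host_mem), true);  /* "load" */
        printf("%d %d\n", (int)reg[0], (int)reg[3]);         /* prints 1 4 */
        return 0;
    }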
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index a85dd1d200..c0179165ce 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -206,10 +206,102 @@ vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
     }
 }
 
+#if !HOST_BIG_ENDIAN
+/* Atomic operations for load/store */
+
+/*
+ * Return true if there are enough elements for this size access and the
+ * pointer is aligned to the required atomicity.
+ */
+
+static inline QEMU_ALWAYS_INLINE bool
+ok_for_atomic(uint32_t size, void *host, uint32_t reg_start, uint32_t evl,
+              uint32_t log2_esz)
+{
+    return (reg_start + (size >> log2_esz)) <= evl
+           && ((uintptr_t) host % size) == 0;
+}
+
+#define GEN_VEXT_LDST_ATOMIC_HOST(SIZE, TYPE)                              \
+static inline QEMU_ALWAYS_INLINE void                                      \
+vext_ldst_atom_##SIZE##_host(void *vd, uint32_t byte_offset, TYPE *host,   \
+                             bool is_load)                                  \
+{                                                                           \
+    TYPE *vd_ptr = (TYPE *) (vd + byte_offset);                             \
+    if (is_load) {                                                          \
+        *vd_ptr = qatomic_read__nocheck(host);                              \
+    } else {                                                                \
+        qatomic_set__nocheck(host, *vd_ptr);                                \
+    }                                                                       \
+}                                                                           \
+
+GEN_VEXT_LDST_ATOMIC_HOST(2, uint16_t)
+GEN_VEXT_LDST_ATOMIC_HOST(4, uint32_t)
+#ifdef CONFIG_ATOMIC64
+GEN_VEXT_LDST_ATOMIC_HOST(8, uint64_t)
+#endif
+
+#ifdef CONFIG_INT128_TYPE
+static inline QEMU_ALWAYS_INLINE void
+vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
+                       bool is_load)
+{
+    Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
+    if (is_load) {
+        *vd_ptr = atomic16_read_ro(host);
+    } else {
+        atomic16_set(host, *vd_ptr);
+    }
+}
+#endif
+#endif
+
+/* Perform the largest atomic load/store possible for the evl and alignment. */
+
+static inline QEMU_ALWAYS_INLINE uint32_t
+vext_ldst_atomic_chunk_host(CPURISCVState *env,
+                            vext_ldst_elem_fn_host *ldst_host,
+                            void *vd, uint32_t evl, uint32_t reg_start,
+                            void *host, uint32_t esz, bool is_load,
+                            uint32_t log2_esz)
+{
+#if !HOST_BIG_ENDIAN
+    uint32_t byte_offset = reg_start * esz;
+
+#ifdef CONFIG_INT128_TYPE
+    if (((is_load && HAVE_ATOMIC128_RO) || (!is_load && HAVE_ATOMIC128_RW))
+        && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
+        vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
+        return 16;
+    }
+#endif
+
+#ifdef CONFIG_ATOMIC64
+    if (ok_for_atomic(8, host, reg_start, evl, log2_esz)) {
+        vext_ldst_atom_8_host(vd, byte_offset, host, is_load);
+        return 8;
+    }
+#endif
+
+    if (ok_for_atomic(4, host, reg_start, evl, log2_esz)) {
+        vext_ldst_atom_4_host(vd, byte_offset, host, is_load);
+        return 4;
+    }
+
+    if (ok_for_atomic(2, host, reg_start, evl, log2_esz)) {
+        vext_ldst_atom_2_host(vd, byte_offset, host, is_load);
+        return 2;
+    }
+#endif
+
+    ldst_host(vd, reg_start, host);
+    return 1;
+}
+
 static inline QEMU_ALWAYS_INLINE void
 vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
-                        uint32_t esz, bool is_load)
+                        uint32_t esz, bool is_load, uint32_t log2_esz)
 {
 #if HOST_BIG_ENDIAN
     for (; reg_start < evl; reg_start++, host += esz) {
@@ -225,10 +317,13 @@ vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
         } else {
             memcpy(host, vd + byte_offset, size);
         }
-    } else {
-        for (; reg_start < evl; reg_start++, host += esz) {
-            ldst_host(vd, reg_start, host);
-        }
+        return;
+    }
+    uint32_t chunk = 0;
+    for (; reg_start < evl; reg_start += (chunk >> log2_esz),
+                            host += chunk) {
+        chunk = vext_ldst_atomic_chunk_host(env, ldst_host, vd, evl, reg_start,
+                                            host, esz, is_load, log2_esz);
     }
 #endif
 }
@@ -343,7 +438,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
     if (flags == 0) {
         if (nf == 1) {
             vext_continus_ldst_host(env, ldst_host, vd, evl, env->vstart, host,
-                                    esz, is_load);
+                                    esz, is_load, log2_esz);
         } else {
             for (i = env->vstart; i < evl; ++i) {
                 k = 0;