
[RFC,v4,2/2] target/riscv: rvv: improve performance of RISC-V vector loads and stores on large amounts of data.

Message ID 20241029194348.59574-3-paolo.savini@embecosm.com (mailing list archive)
State New
Series target/riscv: add wrapper for target specific macros in atomicity check.

Commit Message

Paolo Savini Oct. 29, 2024, 7:43 p.m. UTC
This patch optimizes the emulation of unit-stride RVV load/store instructions
when the data loaded or stored per iteration amounts to 16 bytes or more.
The optimization consists of calling __builtin_memcpy on 16-byte chunks,
copying between the memory backing the simulated vector register and the
guest memory address being accessed (and vice versa for stores).
This is done only if we have direct access to the host machine's RAM, the
host is little endian and it supports atomic 128-bit memory operations.

Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
---
 target/riscv/vector_helper.c    | 17 ++++++++++++++++-
 target/riscv/vector_internals.h | 12 ++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
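
As a quick illustration for reviewers, below is a minimal, self-contained
sketch of the fast path described in the commit message. It is not the patch
itself: the helper name group_ldst_sketch, the group_bytes parameter and the
ldst_elem_fn typedef are invented for this example, and the real code in
vector_helper.c gates the fast path on HOST_BIG_ENDIAN and
HOST_128_ATOMIC_MEM_OP rather than a boolean argument.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef void ldst_elem_fn(void *vd, uint32_t byte_offset, void *host);

/*
 * Copy one group of bytes between the backing store of a vector register
 * (vd) and directly accessible host RAM (host).  Returns how many bytes
 * were handled so the caller can advance byte_offset.
 */
static uint32_t group_ldst_sketch(void *vd, void *host, uint32_t byte_offset,
                                  uint32_t byte_end, uint32_t group_bytes,
                                  bool is_load, bool host_128_atomic,
                                  ldst_elem_fn *fn)
{
    /*
     * Fast path: copy 16 bytes at a time, but only when a full chunk is
     * available, the offset is 16-byte aligned and the host guarantees
     * atomic 128-bit accesses (e.g. x86 with aligned vector moves).
     */
    if (host_128_atomic && (byte_offset % 16) == 0 &&
        byte_offset + 16 <= byte_end) {
        if (is_load) {
            memcpy((uint8_t *)vd + byte_offset,
                   (uint8_t *)host + byte_offset, 16);
        } else {
            memcpy((uint8_t *)host + byte_offset,
                   (uint8_t *)vd + byte_offset, 16);
        }
        return 16;
    }

    /* Slow path: fall back to the existing per-group helper. */
    fn(vd, byte_offset, (uint8_t *)host + byte_offset);
    return group_bytes;
}

A caller would advance byte_offset by the returned byte count; in the actual
helper the return value is 1 << group_size.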

Patch

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 75c24653f0..e1c100e907 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -488,7 +488,22 @@  vext_group_ldst_host(CPURISCVState *env, void *vd, uint32_t byte_end,
     }
 
     fn = fns[is_load][group_size];
-    fn(vd, byte_offset, host + byte_offset);
+
+    /* __builtin_memcpy uses host 16-byte vector loads and stores if supported.
+     * We need to make sure that these instructions are guaranteed to be atomic.
+     * E.g. x86 processors provide strong atomicity guarantees for 16-byte
+     * memory operations if the memory operands are 16-byte aligned. */
+    if (!HOST_BIG_ENDIAN && (byte_offset + 16 < byte_end) &&
+        ((byte_offset % 16) == 0) && HOST_128_ATOMIC_MEM_OP) {
+        group_size = MO_128;
+        if (is_load) {
+            __builtin_memcpy((uint8_t *)(vd + byte_offset), (uint8_t *)(host + byte_offset), 16);
+        } else {
+            __builtin_memcpy((uint8_t *)(host + byte_offset), (uint8_t *)(vd + byte_offset), 16);
+        }
+    } else {
+        fn(vd, byte_offset, host + byte_offset);
+    }
 
     return 1 << group_size;
 }
diff --git a/target/riscv/vector_internals.h b/target/riscv/vector_internals.h
index f59d7d5c19..92694162ce 100644
--- a/target/riscv/vector_internals.h
+++ b/target/riscv/vector_internals.h
@@ -56,6 +56,18 @@  static inline uint32_t vext_nf(uint32_t desc)
 #define H8(x)   (x)
 #endif
 
+/*
+ * If we use host SIMD memory operations to accelerate the emulation, we might
+ * want to rely on host-specific flags to check that the memory accesses will
+ * be atomic.
+ */
+#if defined(HOST_X86_64)
+#define HOST_128_ATOMIC_MEM_OP \
+    ((cpuinfo & (CPUINFO_ATOMIC_VMOVDQA | CPUINFO_ATOMIC_VMOVDQU)) != 0)
+#else
+#define HOST_128_ATOMIC_MEM_OP false
+#endif
+
 /*
  * Encode LMUL to lmul as following:
  *     LMUL    vlmul    lmul