Message ID | 20230225100253.3993383-4-xuhaoyue1@hisilicon.com (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | Add SVE ldr and str instruction | expand |
On Sat, Feb 25, 2023 at 06:02:53PM +0800, Haoyue Xu wrote: > + > +set_source_files_properties(hns_roce_u_hw_v2.c PROPERTIES COMPILE_FLAGS "${SVE_FLAGS}") > diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c > index 3a294968..bd457217 100644 > --- a/providers/hns/hns_roce_u_hw_v2.c > +++ b/providers/hns/hns_roce_u_hw_v2.c > @@ -299,6 +299,11 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, > hns_roce_write64(qp->sq.db_reg, (__le32 *)&sq_db); > } > > +static void hns_roce_sve_write512(uint64_t *dest, uint64_t *val) > +{ > + mmio_memcpy_x64_sve(dest, val); > +} This is not the right way, you should make this work like the x86 SSE stuff, using a "__attribute__((target(xx)))" Look in util/mmio.c and implement a mmio_memcpy_x64 for ARM SVE mmio_memcpy_x64 is defined to try to generate a 64 byte PCI-E TLP. If you don't want or can't handle that then you should write your own loop of 8 byte stores. > static void hns_roce_write512(uint64_t *dest, uint64_t *val) > { > mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); > @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) > hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); > hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head); > > - hns_roce_write512(qp->sq.db_reg, wqe); > + if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) Why do you need a device flag here? > + hns_roce_sve_write512(qp->sq.db_reg, wqe); > + else > + hns_roce_write512(qp->sq.db_reg, wqe); Isn't this function being called on WC memory already? The usual way to make the 64 byte write is with stores to WC memory.. Jason
On 2023/3/27, Haoyue Xu wrote: On 2023/3/23 3:02:47, Jason Gunthorpe wrote: > On Sat, Feb 25, 2023 at 06:02:53PM +0800, Haoyue Xu wrote: > >> + >> +set_source_files_properties(hns_roce_u_hw_v2.c PROPERTIES COMPILE_FLAGS "${SVE_FLAGS}") >> diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c >> index 3a294968..bd457217 100644 >> --- a/providers/hns/hns_roce_u_hw_v2.c >> +++ b/providers/hns/hns_roce_u_hw_v2.c >> @@ -299,6 +299,11 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, >> hns_roce_write64(qp->sq.db_reg, (__le32 *)&sq_db); >> } >> >> +static void hns_roce_sve_write512(uint64_t *dest, uint64_t *val) >> +{ >> + mmio_memcpy_x64_sve(dest, val); >> +} > > This is not the right way, you should make this work like the x86 SSE > stuff, using a "__attribute__((target(xx)))" > > Look in util/mmio.c and implement a mmio_memcpy_x64 for ARM SVE > > mmio_memcpy_x64 is defined to try to generate a 64 byte PCI-E TLP. > > If you don't want or can't handle that then you should write your own > loop of 8 byte stores. > We will refer to the mmio.c and make a new version, reflected in v2. >> static void hns_roce_write512(uint64_t *dest, uint64_t *val) >> { >> mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); >> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) >> hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); >> hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head); >> >> - hns_roce_write512(qp->sq.db_reg, wqe); >> + if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) > > Why do you need a device flag here? Our CPU die can support NEON instructions and SVE instructions, but some CPU dies only have SVE instructions that can accelerate our direct WQE performance. Therefore, we need to add such a flag bit to distinguish. 
> >> + hns_roce_sve_write512(qp->sq.db_reg, wqe); >> + else >> + hns_roce_write512(qp->sq.db_reg, wqe); > > Isn't this function being called on WC memory already? The usual way > to make the 64 byte write is with stores to WC memory.. > > Jason > . > We are currently using WC memory. Sincerely, Haoyue
On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote: > >> static void hns_roce_write512(uint64_t *dest, uint64_t *val) > >> { > >> mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); > >> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) > >> hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); > >> hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head); > >> > >> - hns_roce_write512(qp->sq.db_reg, wqe); > >> + if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) > > > > Why do you need a device flag here? > > Our CPU die can support NEON instructions and SVE instructions, > but some CPU dies only have SVE instructions that can accelerate our direct WQE performance. > Therefore, we need to add such a flag bit to distinguish. NEON vs SVE is available to userspace already, it shouldn't come through a driver flag. You need another reason to add this flag. The userspace should detect the right instruction to use based on the cpu flags using the attribute stuff I pointed you at. Jason
On 2023/3/27 20:55:59, Jason Gunthorpe wrote: > On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote: > >>>> static void hns_roce_write512(uint64_t *dest, uint64_t *val) >>>> { >>>> mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); >>>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) >>>> hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); >>>> hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head); >>>> >>>> - hns_roce_write512(qp->sq.db_reg, wqe); >>>> + if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) >>> >>> Why do you need a device flag here? >> >> Our CPU die can support NEON instructions and SVE instructions, >> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance. >> Therefore, we need to add such a flag bit to distinguish. > > NEON vs SVE is available to userspace already, it shouldn't come > throuhg a driver flag. You need another reason to add this flag > > The userspace should detect the right instruction to use based on the > cpu flags using the attribute stuff I pointed you at > > Jason > . > We optimized direct wqe based on different instructions for different CPUs, but the architecture of the CPUs is the same and supports both SVE and NEON instructions. We plan to use cpuid to distinguish between them. Is this more reasonable? Sincerely, Haoyue
On Thu, Mar 30, 2023 at 08:57:41PM +0800, xuhaoyue (A) wrote: > > > On 2023/3/27 20:55:59, Jason Gunthorpe wrote: > > On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote: > > > >>>> static void hns_roce_write512(uint64_t *dest, uint64_t *val) > >>>> { > >>>> mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); > >>>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) > >>>> hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); > >>>> hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head); > >>>> > >>>> - hns_roce_write512(qp->sq.db_reg, wqe); > >>>> + if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) > >>> > >>> Why do you need a device flag here? > >> > >> Our CPU die can support NEON instructions and SVE instructions, > >> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance. > >> Therefore, we need to add such a flag bit to distinguish. > > > > NEON vs SVE is available to userspace already, it shouldn't come > > throuhg a driver flag. You need another reason to add this flag > > > > The userspace should detect the right instruction to use based on the > > cpu flags using the attribute stuff I pointed you at > > > > Jason > > . > > > > We optimized direct wqe based on different instructions for > different CPUs, but the architecture of the CPUs is the same and > supports both SVE and NEON instructions. We plan to use cpuid to > distinguish between them. Is this more reasonable? Uhh, do you mean certain CPUs won't work with SVE and others won't work with NEON? That is quite horrible Jason
On 2023/3/30 21:01:20, Jason Gunthorpe wrote: > On Thu, Mar 30, 2023 at 08:57:41PM +0800, xuhaoyue (A) wrote: >> >> >> On 2023/3/27 20:55:59, Jason Gunthorpe wrote: >>> On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote: >>> >>>>>> static void hns_roce_write512(uint64_t *dest, uint64_t *val) >>>>>> { >>>>>> mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); >>>>>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) >>>>>> hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); >>>>>> hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head); >>>>>> >>>>>> - hns_roce_write512(qp->sq.db_reg, wqe); >>>>>> + if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) >>>>> >>>>> Why do you need a device flag here? >>>> >>>> Our CPU die can support NEON instructions and SVE instructions, >>>> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance. >>>> Therefore, we need to add such a flag bit to distinguish. >>> >>> NEON vs SVE is available to userspace already, it shouldn't come >>> throuhg a driver flag. You need another reason to add this flag >>> >>> The userspace should detect the right instruction to use based on the >>> cpu flags using the attribute stuff I pointed you at >>> >>> Jason >>> . >>> >> >> We optimized direct wqe based on different instructions for >> different CPUs, but the architecture of the CPUs is the same and >> supports both SVE and NEON instructions. We plan to use cpuid to >> distinguish between them. Is this more reasonable? > > Uhh, do you mean certain CPUs won't work with SVE and others won't > work with NEON? > > That is quite horrible > > Jason > . > No, acctually for general scenarios, our CPU supports two types of instructions, SVE and NEON. However, for the CPU that requires high fp64 floating-point computing power, the SVE instruction is enhanced and the NEON instruction is weakened. Sincerely, Haoyue
On Fri, Mar 31, 2023 at 11:38:26AM +0800, xuhaoyue (A) wrote: > > > On 2023/3/30 21:01:20, Jason Gunthorpe wrote: > > On Thu, Mar 30, 2023 at 08:57:41PM +0800, xuhaoyue (A) wrote: > >> > >> > >> On 2023/3/27 20:55:59, Jason Gunthorpe wrote: > >>> On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote: > >>> > >>>>>> static void hns_roce_write512(uint64_t *dest, uint64_t *val) > >>>>>> { > >>>>>> mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); > >>>>>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) > >>>>>> hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); > >>>>>> hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head); > >>>>>> > >>>>>> - hns_roce_write512(qp->sq.db_reg, wqe); > >>>>>> + if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) > >>>>> > >>>>> Why do you need a device flag here? > >>>> > >>>> Our CPU die can support NEON instructions and SVE instructions, > >>>> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance. > >>>> Therefore, we need to add such a flag bit to distinguish. > >>> > >>> NEON vs SVE is available to userspace already, it shouldn't come > >>> throuhg a driver flag. You need another reason to add this flag > >>> > >>> The userspace should detect the right instruction to use based on the > >>> cpu flags using the attribute stuff I pointed you at > >>> > >>> Jason > >>> . > >>> > >> > >> We optimized direct wqe based on different instructions for > >> different CPUs, but the architecture of the CPUs is the same and > >> supports both SVE and NEON instructions. We plan to use cpuid to > >> distinguish between them. Is this more reasonable? > > > > Uhh, do you mean certain CPUs won't work with SVE and others won't > > work with NEON? > > > > That is quite horrible > > > > Jason > > . > > > > No, acctually for general scenarios, our CPU supports two types of instructions, SVE and NEON. 
> However, for the CPU that requires high fp64 floating-point computing power, the SVE instruction is enhanced and the NEON instruction is weakened. Ideally the decision of what CPU instruction to use will be made by rdma-core, using the various schemes for dynamic link time selection. It should apply universally to all providers. Jason
diff --git a/CMakeLists.txt b/CMakeLists.txt index 0cb68264..ee1024d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -417,6 +417,8 @@ endif() RDMA_Check_SSE(HAVE_TARGET_SSE) +RDMA_Check_SVE(HAVE_TARGET_SVE) + # Enable development support features # Prune unneeded shared libraries during linking RDMA_AddOptLDFlag(CMAKE_EXE_LINKER_FLAGS SUPPORTS_AS_NEEDED "-Wl,--as-needed") diff --git a/buildlib/RDMA_EnableCStd.cmake b/buildlib/RDMA_EnableCStd.cmake index 3c42824f..c6bd6603 100644 --- a/buildlib/RDMA_EnableCStd.cmake +++ b/buildlib/RDMA_EnableCStd.cmake @@ -127,3 +127,10 @@ int main(int argc, char *argv[]) endif() set(${TO_VAR} "${HAVE_TARGET_SSE}" PARENT_SCOPE) endFunction() + +function(RDMA_Check_SVE TO_VAR) + RDMA_Check_C_Compiles(HAVE_TARGET_SVE "${SVE_CHECK_PROGRAM}") + + set(SVE_FLAGS "-march=armv8.2-a+sve" PARENT_SCOPE) + set(${TO_VAR} "${HAVE_TARGET_SVE}" PARENT_SCOPE) +endFunction() diff --git a/providers/hns/CMakeLists.txt b/providers/hns/CMakeLists.txt index 7aaca757..5c2bcf3b 100644 --- a/providers/hns/CMakeLists.txt +++ b/providers/hns/CMakeLists.txt @@ -5,3 +5,5 @@ rdma_provider(hns hns_roce_u_hw_v2.c hns_roce_u_verbs.c ) + +set_source_files_properties(hns_roce_u_hw_v2.c PROPERTIES COMPILE_FLAGS "${SVE_FLAGS}") diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c index 3a294968..bd457217 100644 --- a/providers/hns/hns_roce_u_hw_v2.c +++ b/providers/hns/hns_roce_u_hw_v2.c @@ -299,6 +299,11 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, hns_roce_write64(qp->sq.db_reg, (__le32 *)&sq_db); } +static void hns_roce_sve_write512(uint64_t *dest, uint64_t *val) +{ + mmio_memcpy_x64_sve(dest, val); +} + static void hns_roce_write512(uint64_t *dest, uint64_t *val) { mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); hr_reg_write(rc_sq_wqe, 
RCWQE_WQE_IDX, qp->sq.head); - hns_roce_write512(qp->sq.db_reg, wqe); + if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) + hns_roce_sve_write512(qp->sq.db_reg, wqe); + else + hns_roce_write512(qp->sq.db_reg, wqe); } static void update_cq_db(struct hns_roce_context *ctx, struct hns_roce_cq *cq) diff --git a/util/mmio.h b/util/mmio.h index b60935c4..13fd2654 100644 --- a/util/mmio.h +++ b/util/mmio.h @@ -207,6 +207,17 @@ __le64 mmio_read64_le(const void *addr); /* This strictly guarantees the order of TLP generation for the memory copy to be in ascending address order. */ +#if defined(__aarch64__) || defined(__arm__) +static inline void mmio_memcpy_x64_sve(void *dest, const void *src) +{ + asm volatile( + "ldr z0, [%0]\n" + "str z0, [%1]\n" + ::"r" (val), "r"(dest):"cc", "memory" + ); +} +#endif + #if defined(__aarch64__) || defined(__arm__) #include <arm_neon.h>