
[RFC,for-next,3/3] libhns: Add support for SVE Direct WQE function

Message ID: 20230225100253.3993383-4-xuhaoyue1@hisilicon.com
State: Changes Requested
Series: Add SVE ldr and str instruction

Commit Message

Haoyue Xu Feb. 25, 2023, 10:02 a.m. UTC
From: Yixing Liu <liuyixing1@huawei.com>

The newly added SVE Direct WQE function uses only the SVE ldr and str
instructions; this patch adds the corresponding ldr and str assembly to implement it.

Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
---
 CMakeLists.txt                   |  2 ++
 buildlib/RDMA_EnableCStd.cmake   |  7 +++++++
 providers/hns/CMakeLists.txt     |  2 ++
 providers/hns/hns_roce_u_hw_v2.c | 10 +++++++++-
 util/mmio.h                      | 11 +++++++++++
 5 files changed, 31 insertions(+), 1 deletion(-)

Comments

Jason Gunthorpe March 22, 2023, 7:02 p.m. UTC | #1
On Sat, Feb 25, 2023 at 06:02:53PM +0800, Haoyue Xu wrote:

> +
> +set_source_files_properties(hns_roce_u_hw_v2.c PROPERTIES COMPILE_FLAGS "${SVE_FLAGS}")
> diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c
> index 3a294968..bd457217 100644
> --- a/providers/hns/hns_roce_u_hw_v2.c
> +++ b/providers/hns/hns_roce_u_hw_v2.c
> @@ -299,6 +299,11 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx,
>  	hns_roce_write64(qp->sq.db_reg, (__le32 *)&sq_db);
>  }
>  
> +static void hns_roce_sve_write512(uint64_t *dest, uint64_t *val)
> +{
> +	mmio_memcpy_x64_sve(dest, val);
> +}

This is not the right way, you should make this work like the x86 SSE
stuff, using a "__attribute__((target(xx)))"

Look in util/mmio.c and implement a mmio_memcpy_x64 for ARM SVE

mmio_memcpy_x64 is defined to try to generate a 64 byte PCI-E TLP.

If you don't want or can't handle that then you should write your own
loop of 8 byte stores.
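
A rough sketch of the shape being suggested here, assuming SVE ACLE intrinsics
can be enabled per-function on the toolchain (which is what the CMake check
would need to cover); the helper names below are illustrative, not existing
rdma-core code:

#include <stdint.h>
#include <arm_sve.h>

/* Hypothetical SVE variant for util/mmio.c: copy one 64-byte WQE with a
 * single predicated load/store.  Assumes the SVE vector length is at least
 * 512 bits so one store covers the whole WQE; the exact target-attribute
 * string is compiler dependent. */
static void __attribute__((target("+sve")))
mmio_memcpy_x64_sve(void *dest, const void *src)
{
	svbool_t pg = svwhilelt_b8_s32(0, 64);	/* predicate over 64 bytes */

	svst1_u8(pg, (uint8_t *)dest, svld1_u8(pg, (const uint8_t *)src));
}

/* The fallback mentioned above: a plain loop of 8-byte stores in ascending
 * address order. */
static void mmio_memcpy_x64_loop(void *dest, const void *src)
{
	const uint64_t *s = src;
	volatile uint64_t *d = dest;
	int i;

	for (i = 0; i < 8; i++)
		d[i] = s[i];
}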

>  static void hns_roce_write512(uint64_t *dest, uint64_t *val)
>  {
>  	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
>  	hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT);
>  	hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head);
>  
> -	hns_roce_write512(qp->sq.db_reg, wqe);
> +	if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE)

Why do you need a device flag here?

> +		hns_roce_sve_write512(qp->sq.db_reg, wqe);
> +	else
> +		hns_roce_write512(qp->sq.db_reg, wqe);

Isn't this function being called on WC memory already? The usual way
to make the 64 byte write is with stores to WC memory..

Jason
Haoyue Xu March 27, 2023, 12:53 p.m. UTC | #2
On 2023/3/23 3:02:47, Jason Gunthorpe wrote:
> On Sat, Feb 25, 2023 at 06:02:53PM +0800, Haoyue Xu wrote:
> 
>> +
>> +set_source_files_properties(hns_roce_u_hw_v2.c PROPERTIES COMPILE_FLAGS "${SVE_FLAGS}")
>> diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c
>> index 3a294968..bd457217 100644
>> --- a/providers/hns/hns_roce_u_hw_v2.c
>> +++ b/providers/hns/hns_roce_u_hw_v2.c
>> @@ -299,6 +299,11 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx,
>>  	hns_roce_write64(qp->sq.db_reg, (__le32 *)&sq_db);
>>  }
>>  
>> +static void hns_roce_sve_write512(uint64_t *dest, uint64_t *val)
>> +{
>> +	mmio_memcpy_x64_sve(dest, val);
>> +}
> 
> This is not the right way, you should make this work like the x86 SSE
> stuff, using a "__attribute__((target(xx)))"
> 
> Look in util/mmio.c and implement a mmio_memcpy_x64 for ARM SVE
> 
> mmio_memcpy_x64 is defined to try to generate a 64 byte PCI-E TLP.
> 
> If you don't want or can't handle that then you should write your own
> loop of 8 byte stores.
> 

We will refer to mmio.c and rework this in a new version, reflected in v2.


>>  static void hns_roce_write512(uint64_t *dest, uint64_t *val)
>>  {
>>  	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
>>  	hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT);
>>  	hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head);
>>  
>> -	hns_roce_write512(qp->sq.db_reg, wqe);
>> +	if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE)
> 
> Why do you need a device flag here?

Our CPU dies support both NEON and SVE instructions,
but on some dies only the SVE instructions can accelerate our direct WQE performance.
Therefore, we need to add such a flag bit to distinguish them.


> 
>> +		hns_roce_sve_write512(qp->sq.db_reg, wqe);
>> +	else
>> +		hns_roce_write512(qp->sq.db_reg, wqe);
> 
> Isn't this function being called on WC memory already? The usual way
> to make the 64 byte write is with stores to WC memory..
> 
> Jason
> .
> 
We are currently using WC memory.

Sincerely,
Haoyue
Jason Gunthorpe March 27, 2023, 12:55 p.m. UTC | #3
On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote:

> >>  static void hns_roce_write512(uint64_t *dest, uint64_t *val)
> >>  {
> >>  	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
> >> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
> >>  	hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT);
> >>  	hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head);
> >>  
> >> -	hns_roce_write512(qp->sq.db_reg, wqe);
> >> +	if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE)
> > 
> > Why do you need a device flag here?
> 
> Our CPU die can support NEON instructions and SVE instructions,
> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance.
> Therefore, we need to add such a flag bit to distinguish.

NEON vs SVE support is already visible to userspace, so it shouldn't come
through a driver flag. You need another reason to add this flag.

Userspace should detect the right instruction to use from the CPU flags,
using the attribute machinery I pointed you at.
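
For reference, the usual userspace check is the hwcap mask reported by the
kernel; a minimal sketch (the helper name is made up for illustration, and
HWCAP_SVE normally comes from <asm/hwcap.h> on arm64):

#include <stdbool.h>
#include <sys/auxv.h>

#ifndef HWCAP_SVE
#define HWCAP_SVE (1UL << 22)	/* arm64 value, from <asm/hwcap.h> */
#endif

/* True when the kernel reports SVE support for this process. */
static bool cpu_has_sve(void)
{
	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
}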

Jason
Haoyue Xu March 30, 2023, 12:57 p.m. UTC | #4
On 2023/3/27 20:55:59, Jason Gunthorpe wrote:
> On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote:
> 
>>>>  static void hns_roce_write512(uint64_t *dest, uint64_t *val)
>>>>  {
>>>>  	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
>>>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
>>>>  	hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT);
>>>>  	hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head);
>>>>  
>>>> -	hns_roce_write512(qp->sq.db_reg, wqe);
>>>> +	if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE)
>>>
>>> Why do you need a device flag here?
>>
>> Our CPU die can support NEON instructions and SVE instructions,
>> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance.
>> Therefore, we need to add such a flag bit to distinguish.
> 
> NEON vs SVE is available to userspace already, it shouldn't come
> throuhg a driver flag. You need another reason to add this flag
> 
> The userspace should detect the right instruction to use based on the
> cpu flags using the attribute stuff I pointed you at
> 
> Jason
> .
> 

We optimized direct WQE using different instructions for different CPUs,
but the CPU architecture is the same and supports both SVE and NEON instructions.
We plan to use the CPU ID to distinguish between them. Is this more reasonable?
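
As a rough illustration only, assuming the intent is to key off MIDR_EL1 as
exported by the arm64 kernel (a sketch, not the planned v2 code):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Read the implementer/part number of cpu0 from the kernel's MIDR_EL1
 * export; returns 0 if the file is unavailable. */
static uint64_t read_midr_el1(void)
{
	const char *path =
		"/sys/devices/system/cpu/cpu0/regs/identification/midr_el1";
	char buf[32] = {0};
	uint64_t midr = 0;
	FILE *f = fopen(path, "r");

	if (!f)
		return 0;
	if (fgets(buf, sizeof(buf), f))
		midr = strtoull(buf, NULL, 0);
	fclose(f);
	return midr;
}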

Sincerely,
Haoyue
Jason Gunthorpe March 30, 2023, 1:01 p.m. UTC | #5
On Thu, Mar 30, 2023 at 08:57:41PM +0800, xuhaoyue (A) wrote:
> 
> 
> On 2023/3/27 20:55:59, Jason Gunthorpe wrote:
> > On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote:
> > 
> >>>>  static void hns_roce_write512(uint64_t *dest, uint64_t *val)
> >>>>  {
> >>>>  	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
> >>>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
> >>>>  	hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT);
> >>>>  	hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head);
> >>>>  
> >>>> -	hns_roce_write512(qp->sq.db_reg, wqe);
> >>>> +	if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE)
> >>>
> >>> Why do you need a device flag here?
> >>
> >> Our CPU die can support NEON instructions and SVE instructions,
> >> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance.
> >> Therefore, we need to add such a flag bit to distinguish.
> > 
> > NEON vs SVE is available to userspace already, it shouldn't come
> > through a driver flag. You need another reason to add this flag
> > 
> > The userspace should detect the right instruction to use based on the
> > cpu flags using the attribute stuff I pointed you at
> > 
> > Jason
> > .
> > 
> 
> We optimized direct wqe based on different instructions for
> different CPUs, but the architecture of the CPUs is the same and
> supports both SVE and NEON instructions.  We plan to use cpuid to
> distinguish between them. Is this more reasonable?

Uhh, do you mean certain CPUs won't work with SVE and others won't
work with NEON?

That is quite horrible

Jason
Haoyue Xu March 31, 2023, 3:38 a.m. UTC | #6
On 2023/3/30 21:01:20, Jason Gunthorpe wrote:
> On Thu, Mar 30, 2023 at 08:57:41PM +0800, xuhaoyue (A) wrote:
>>
>>
>> On 2023/3/27 20:55:59, Jason Gunthorpe wrote:
>>> On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote:
>>>
>>>>>>  static void hns_roce_write512(uint64_t *dest, uint64_t *val)
>>>>>>  {
>>>>>>  	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
>>>>>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
>>>>>>  	hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT);
>>>>>>  	hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head);
>>>>>>  
>>>>>> -	hns_roce_write512(qp->sq.db_reg, wqe);
>>>>>> +	if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE)
>>>>>
>>>>> Why do you need a device flag here?
>>>>
>>>> Our CPU die can support NEON instructions and SVE instructions,
>>>> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance.
>>>> Therefore, we need to add such a flag bit to distinguish.
>>>
>>> NEON vs SVE is available to userspace already, it shouldn't come
>>> through a driver flag. You need another reason to add this flag
>>>
>>> The userspace should detect the right instruction to use based on the
>>> cpu flags using the attribute stuff I pointed you at
>>>
>>> Jason
>>> .
>>>
>>
>> We optimized direct wqe based on different instructions for
>> different CPUs, but the architecture of the CPUs is the same and
>> supports both SVE and NEON instructions.  We plan to use cpuid to
>> distinguish between them. Is this more reasonable?
> 
> Uhh, do you mean certain CPUs won't work with SVE and others won't
> work with NEON?
> 
> That is quite horrible
> 
> Jason
> .
> 

No. Actually, for general scenarios our CPUs support both types of instructions, SVE and NEON.
However, on CPUs that target high fp64 floating-point computing power, the SVE implementation is enhanced and the NEON implementation is weakened.

Sincerely,
Haoyue
Jason Gunthorpe March 31, 2023, 11:39 a.m. UTC | #7
On Fri, Mar 31, 2023 at 11:38:26AM +0800, xuhaoyue (A) wrote:
> 
> 
> On 2023/3/30 21:01:20, Jason Gunthorpe wrote:
> > On Thu, Mar 30, 2023 at 08:57:41PM +0800, xuhaoyue (A) wrote:
> >>
> >>
> >> On 2023/3/27 20:55:59, Jason Gunthorpe wrote:
> >>> On Mon, Mar 27, 2023 at 08:53:35PM +0800, xuhaoyue (A) wrote:
> >>>
> >>>>>>  static void hns_roce_write512(uint64_t *dest, uint64_t *val)
> >>>>>>  {
> >>>>>>  	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
> >>>>>> @@ -314,7 +319,10 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
> >>>>>>  	hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT);
> >>>>>>  	hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head);
> >>>>>>  
> >>>>>> -	hns_roce_write512(qp->sq.db_reg, wqe);
> >>>>>> +	if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE)
> >>>>>
> >>>>> Why do you need a device flag here?
> >>>>
> >>>> Our CPU die can support NEON instructions and SVE instructions,
> >>>> but some CPU dies only have SVE instructions that can accelerate our direct WQE performance.
> >>>> Therefore, we need to add such a flag bit to distinguish.
> >>>
> >>> NEON vs SVE is available to userspace already, it shouldn't come
> >>> through a driver flag. You need another reason to add this flag
> >>>
> >>> The userspace should detect the right instruction to use based on the
> >>> cpu flags using the attribute stuff I pointed you at
> >>>
> >>> Jason
> >>> .
> >>>
> >>
> >> We optimized direct wqe based on different instructions for
> >> different CPUs, but the architecture of the CPUs is the same and
> >> supports both SVE and NEON instructions.  We plan to use cpuid to
> >> distinguish between them. Is this more reasonable?
> > 
> > Uhh, do you mean certain CPUs won't work with SVE and others won't
> > work with NEON?
> > 
> > That is quite horrible
> > 
> > Jason
> > .
> > 
> 
> No, actually for general scenarios, our CPU supports two types of instructions, SVE and NEON.
> However, for the CPU that requires high fp64 floating-point computing power, the SVE instruction is enhanced and the NEON instruction is weakened.

Ideally the decision of which CPU instructions to use will be made by
rdma-core, using the various schemes for dynamic link-time
selection.

It should apply universally to all providers
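
One of the dynamic selection schemes referred to here is a GNU ifunc
resolver; a hedged sketch of the general shape, with made-up function names
rather than an actual rdma-core interface (the two variants are assumed to be
defined elsewhere, and ifunc resolvers run very early so they are restricted
in what they may call):

#include <sys/auxv.h>

#ifndef HWCAP_SVE
#define HWCAP_SVE (1UL << 22)	/* arm64 value, from <asm/hwcap.h> */
#endif

typedef void (*mmio_copy_fn)(void *dest, const void *src);

void mmio_copy_64b_sve(void *dest, const void *src);     /* SVE variant */
void mmio_copy_64b_generic(void *dest, const void *src); /* plain stores */

/* Resolver: runs once at load time and decides which implementation the
 * mmio_copy_64b symbol binds to for the lifetime of the process. */
static mmio_copy_fn resolve_mmio_copy_64b(void)
{
	if (getauxval(AT_HWCAP) & HWCAP_SVE)
		return mmio_copy_64b_sve;
	return mmio_copy_64b_generic;
}

void mmio_copy_64b(void *dest, const void *src)
	__attribute__((ifunc("resolve_mmio_copy_64b")));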

Jason

Patch

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0cb68264..ee1024d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -417,6 +417,8 @@  endif()
 
 RDMA_Check_SSE(HAVE_TARGET_SSE)
 
+RDMA_Check_SVE(HAVE_TARGET_SVE)
+
 # Enable development support features
 # Prune unneeded shared libraries during linking
 RDMA_AddOptLDFlag(CMAKE_EXE_LINKER_FLAGS SUPPORTS_AS_NEEDED "-Wl,--as-needed")
diff --git a/buildlib/RDMA_EnableCStd.cmake b/buildlib/RDMA_EnableCStd.cmake
index 3c42824f..c6bd6603 100644
--- a/buildlib/RDMA_EnableCStd.cmake
+++ b/buildlib/RDMA_EnableCStd.cmake
@@ -127,3 +127,10 @@  int main(int argc, char *argv[])
   endif()
   set(${TO_VAR} "${HAVE_TARGET_SSE}" PARENT_SCOPE)
 endFunction()
+
+function(RDMA_Check_SVE TO_VAR)
+  RDMA_Check_C_Compiles(HAVE_TARGET_SVE "${SVE_CHECK_PROGRAM}")
+
+  set(SVE_FLAGS "-march=armv8.2-a+sve" PARENT_SCOPE)
+  set(${TO_VAR} "${HAVE_TARGET_SVE}" PARENT_SCOPE)
+endFunction()
diff --git a/providers/hns/CMakeLists.txt b/providers/hns/CMakeLists.txt
index 7aaca757..5c2bcf3b 100644
--- a/providers/hns/CMakeLists.txt
+++ b/providers/hns/CMakeLists.txt
@@ -5,3 +5,5 @@  rdma_provider(hns
   hns_roce_u_hw_v2.c
   hns_roce_u_verbs.c
 )
+
+set_source_files_properties(hns_roce_u_hw_v2.c PROPERTIES COMPILE_FLAGS "${SVE_FLAGS}")
diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c
index 3a294968..bd457217 100644
--- a/providers/hns/hns_roce_u_hw_v2.c
+++ b/providers/hns/hns_roce_u_hw_v2.c
@@ -299,6 +299,11 @@  static void hns_roce_update_sq_db(struct hns_roce_context *ctx,
 	hns_roce_write64(qp->sq.db_reg, (__le32 *)&sq_db);
 }
 
+static void hns_roce_sve_write512(uint64_t *dest, uint64_t *val)
+{
+	mmio_memcpy_x64_sve(dest, val);
+}
+
 static void hns_roce_write512(uint64_t *dest, uint64_t *val)
 {
 	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
@@ -314,7 +319,10 @@  static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
 	hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT);
 	hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head);
 
-	hns_roce_write512(qp->sq.db_reg, wqe);
+	if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE)
+		hns_roce_sve_write512(qp->sq.db_reg, wqe);
+	else
+		hns_roce_write512(qp->sq.db_reg, wqe);
 }
 
 static void update_cq_db(struct hns_roce_context *ctx, struct hns_roce_cq *cq)
diff --git a/util/mmio.h b/util/mmio.h
index b60935c4..13fd2654 100644
--- a/util/mmio.h
+++ b/util/mmio.h
@@ -207,6 +207,17 @@  __le64 mmio_read64_le(const void *addr);
 /* This strictly guarantees the order of TLP generation for the memory copy to
    be in ascending address order.
 */
+#if defined(__aarch64__) || defined(__arm__)
+static inline void mmio_memcpy_x64_sve(void *dest, const void *src)
+{
+	asm volatile(
+		"ldr z0, [%0]\n"
+		"str z0, [%1]\n"
+		: : "r"(src), "r"(dest) : "cc", "memory"
+	);
+}
+#endif
+
 #if defined(__aarch64__) || defined(__arm__)
 #include <arm_neon.h>