diff mbox series

[v2,rdma-core,2/2] libhns: Add support for direct wqe

Message ID 20211116150316.21925-3-liangwenpeng@huawei.com (mailing list archive)
State Superseded
Headers show
Series libhns: Add support for direct wqe | expand

Commit Message

Wenpeng Liang Nov. 16, 2021, 3:03 p.m. UTC
From: Yixing Liu <liuyixing1@huawei.com>

The current write wqe mechanism is to write to DDR first, and then notify
the hardware through doorbell to read the data. Direct wqe is a mechanism
to fill wqe directly into the hardware. In the case of light load, the wqe
will be filled into pcie bar space of the hardware, this will reduce one
memory access operation and therefore reduce the latency. SIMD instructions
allows cpu to write the 512 bits at one time to device memory, thus it can
be used for posting direct wqe.

The process of post send of HIP08/09:

   +-----------+
   | post send |
   +-----+-----+
         |
   +-----+-----+
   | write WQE |
   +-----+-----+
         |
         | udma_to_device_barrier()
         |
   +-----+-----+   Y  +-----------+  N
   |  HIP09 ?  +------+ multi WR ?+-------------+
   +-----+-----+      +-----+-----+             |
         | N                | Y                 |
   +-----+-----+      +-----+-----+    +--------+--------+
   |  ring DB  |      |  ring DB  |    |direct WQE (ST4) |
   +-----------+      +-----------+    +-----------------+

Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
---
 providers/hns/hns_roce_u.h       |  5 +++-
 providers/hns/hns_roce_u_hw_v2.c | 44 +++++++++++++++++++++++++-------
 providers/hns/hns_roce_u_hw_v2.h | 31 ++++++++++++----------
 providers/hns/hns_roce_u_verbs.c | 26 +++++++++++++++++--
 util/mmio.h                      | 27 +++++++++++++++++++-
 5 files changed, 107 insertions(+), 26 deletions(-)
diff mbox series

Patch

diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index 1616db9f..c6f70d9b 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -81,6 +81,8 @@ 
 
 #define INVALID_SGE_LENGTH 0x80000000
 
+#define HNS_ROCE_DWQE_PAGE_SIZE 65536
+
 #define HNS_ROCE_ADDRESS_MASK 0xFFFFFFFF
 #define HNS_ROCE_ADDRESS_SHIFT 32
 
@@ -280,13 +282,14 @@  struct hns_roce_qp {
 	struct hns_roce_sge_ex		ex_sge;
 	unsigned int			next_sge;
 	int				port_num;
-	int				sl;
+	uint8_t				sl;
 	unsigned int			qkey;
 	enum ibv_mtu			path_mtu;
 
 	struct hns_roce_rinl_buf	rq_rinl_buf;
 	unsigned long			flags;
 	int				refcnt; /* specially used for XRC */
+	void				*dwqe_page;
 };
 
 struct hns_roce_av {
diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c
index c044807e..0f56763e 100644
--- a/providers/hns/hns_roce_u_hw_v2.c
+++ b/providers/hns/hns_roce_u_hw_v2.c
@@ -33,6 +33,7 @@ 
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <string.h>
+#include <sys/mman.h>
 #include "hns_roce_u.h"
 #include "hns_roce_u_db.h"
 #include "hns_roce_u_hw_v2.h"
@@ -297,20 +298,40 @@  static void hns_roce_update_rq_db(struct hns_roce_context *ctx,
 }
 
 static void hns_roce_update_sq_db(struct hns_roce_context *ctx,
-				  unsigned int qpn, unsigned int sl,
-				  unsigned int sq_head)
+				  struct hns_roce_qp *qp)
 {
 	struct hns_roce_db sq_db = {};
 
-	sq_db.byte_4 = htole32(qpn);
+	sq_db.byte_4 = htole32(qp->verbs_qp.qp.qp_num);
 	roce_set_field(sq_db.byte_4, DB_BYTE_4_CMD_M, DB_BYTE_4_CMD_S,
 		       HNS_ROCE_V2_SQ_DB);
-	sq_db.parameter = htole32(sq_head);
-	roce_set_field(sq_db.parameter, DB_PARAM_SL_M, DB_PARAM_SL_S, sl);
+	sq_db.parameter = htole32(qp->sq.head);
+	roce_set_field(sq_db.parameter, DB_PARAM_SL_M, DB_PARAM_SL_S, qp->sl);
 
 	hns_roce_write64(ctx->uar + ROCEE_VF_DB_CFG0_OFFSET, (__le32 *)&sq_db);
 }
 
+static void hns_roce_write512(uint64_t *dest, uint64_t *val)
+{
+	mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
+}
+
+static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
+{
+	struct hns_roce_rc_sq_wqe *rc_sq_wqe = wqe;
+
+	/* All kinds of DirectWQE have the same header field layout */
+	roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_FLAG_S, 1);
+	roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_DB_SL_L_M,
+		       RC_SQ_WQE_BYTE_4_DB_SL_L_S, qp->sl);
+	roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_DB_SL_H_M,
+		       RC_SQ_WQE_BYTE_4_DB_SL_H_S, qp->sl >> HNS_ROCE_SL_SHIFT);
+	roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_WQE_INDEX_M,
+		       RC_SQ_WQE_BYTE_4_WQE_INDEX_S, qp->sq.head);
+
+	 hns_roce_write512(qp->dwqe_page, wqe);
+}
+
 static void hns_roce_v2_update_cq_cons_index(struct hns_roce_context *ctx,
 					     struct hns_roce_cq *cq)
 {
@@ -339,8 +360,7 @@  static struct hns_roce_qp *hns_roce_v2_find_qp(struct hns_roce_context *ctx,
 		return NULL;
 }
 
-static void hns_roce_v2_clear_qp(struct hns_roce_context *ctx,
-				 struct hns_roce_qp *qp)
+void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, struct hns_roce_qp *qp)
 {
 	uint32_t qpn = qp->verbs_qp.qp.qp_num;
 	uint32_t tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
@@ -1234,6 +1254,7 @@  int hns_roce_u_v2_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
 			break;
 		case IBV_QPT_UD:
 			ret = set_ud_wqe(wqe, qp, wr, nreq, &sge_info);
+			qp->sl = to_hr_ah(wr->wr.ud.ah)->av.sl;
 			break;
 		default:
 			ret = EINVAL;
@@ -1249,10 +1270,12 @@  out:
 	if (likely(nreq)) {
 		qp->sq.head += nreq;
 		qp->next_sge = sge_info.start_idx;
-
 		udma_to_device_barrier();
 
-		hns_roce_update_sq_db(ctx, ibvqp->qp_num, qp->sl, qp->sq.head);
+		if (nreq == 1 && (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE))
+			hns_roce_write_dwqe(qp, wqe);
+		else
+			hns_roce_update_sq_db(ctx, qp);
 
 		if (qp->flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
 			*(qp->sdb) = qp->sq.head & 0xffff;
@@ -1544,6 +1567,9 @@  static int hns_roce_u_v2_destroy_qp(struct ibv_qp *ibqp)
 	if (ret)
 		return ret;
 
+	if (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE)
+		munmap(qp->dwqe_page, HNS_ROCE_DWQE_PAGE_SIZE);
+
 	hns_roce_v2_clear_qp(ctx, qp);
 
 	hns_roce_lock_cqs(ibqp);
diff --git a/providers/hns/hns_roce_u_hw_v2.h b/providers/hns/hns_roce_u_hw_v2.h
index c13d82e3..af72cd70 100644
--- a/providers/hns/hns_roce_u_hw_v2.h
+++ b/providers/hns/hns_roce_u_hw_v2.h
@@ -40,6 +40,8 @@ 
 
 #define HNS_ROCE_CMDSN_MASK			0x3
 
+#define HNS_ROCE_SL_SHIFT 2
+
 /* V2 REG DEFINITION */
 #define ROCEE_VF_DB_CFG0_OFFSET			0x0230
 
@@ -133,6 +135,8 @@  struct hns_roce_db {
 #define DB_BYTE_4_CMD_S 24
 #define DB_BYTE_4_CMD_M GENMASK(27, 24)
 
+#define DB_BYTE_4_FLAG_S 31
+
 #define DB_PARAM_SRQ_PRODUCER_COUNTER_S 0
 #define DB_PARAM_SRQ_PRODUCER_COUNTER_M GENMASK(15, 0)
 
@@ -216,8 +220,16 @@  struct hns_roce_rc_sq_wqe {
 };
 
 #define RC_SQ_WQE_BYTE_4_OPCODE_S 0
-#define RC_SQ_WQE_BYTE_4_OPCODE_M \
-	(((1UL << 5) - 1) << RC_SQ_WQE_BYTE_4_OPCODE_S)
+#define RC_SQ_WQE_BYTE_4_OPCODE_M GENMASK(4, 0)
+
+#define RC_SQ_WQE_BYTE_4_DB_SL_L_S 5
+#define RC_SQ_WQE_BYTE_4_DB_SL_L_M GENMASK(6, 5)
+
+#define RC_SQ_WQE_BYTE_4_DB_SL_H_S 13
+#define RC_SQ_WQE_BYTE_4_DB_SL_H_M GENMASK(14, 13)
+
+#define RC_SQ_WQE_BYTE_4_WQE_INDEX_S 15
+#define RC_SQ_WQE_BYTE_4_WQE_INDEX_M GENMASK(30, 15)
 
 #define RC_SQ_WQE_BYTE_4_OWNER_S 7
 
@@ -239,6 +251,8 @@  struct hns_roce_rc_sq_wqe {
 
 #define RC_SQ_WQE_BYTE_4_RDMA_WRITE_S 22
 
+#define RC_SQ_WQE_BYTE_4_FLAG_S 31
+
 #define RC_SQ_WQE_BYTE_16_XRC_SRQN_S 0
 #define RC_SQ_WQE_BYTE_16_XRC_SRQN_M \
 	(((1UL << 24) - 1) << RC_SQ_WQE_BYTE_16_XRC_SRQN_S)
@@ -311,23 +325,12 @@  struct hns_roce_ud_sq_wqe {
 #define UD_SQ_WQE_OPCODE_S 0
 #define UD_SQ_WQE_OPCODE_M GENMASK(4, 0)
 
-#define UD_SQ_WQE_DB_SL_L_S 5
-#define UD_SQ_WQE_DB_SL_L_M GENMASK(6, 5)
-
-#define UD_SQ_WQE_DB_SL_H_S 13
-#define UD_SQ_WQE_DB_SL_H_M GENMASK(14, 13)
-
-#define UD_SQ_WQE_INDEX_S 15
-#define UD_SQ_WQE_INDEX_M GENMASK(30, 15)
-
 #define UD_SQ_WQE_OWNER_S 7
 
 #define UD_SQ_WQE_CQE_S 8
 
 #define UD_SQ_WQE_SE_S 11
 
-#define UD_SQ_WQE_FLAG_S 31
-
 #define UD_SQ_WQE_PD_S 0
 #define UD_SQ_WQE_PD_M GENMASK(23, 0)
 
@@ -376,4 +379,6 @@  struct hns_roce_ud_sq_wqe {
 
 #define MAX_SERVICE_LEVEL 0x7
 
+void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, struct hns_roce_qp *qp);
+
 #endif /* _HNS_ROCE_U_HW_V2_H */
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
index c0069b8b..11db2a34 100644
--- a/providers/hns/hns_roce_u_verbs.c
+++ b/providers/hns/hns_roce_u_verbs.c
@@ -1119,7 +1119,8 @@  static int hns_roce_store_qp(struct hns_roce_context *ctx,
 
 static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
 			      struct hns_roce_qp *qp,
-			      struct hns_roce_context *ctx)
+			      struct hns_roce_context *ctx,
+			      uint64_t *dwqe_mmap_key)
 {
 	struct hns_roce_create_qp_ex_resp resp_ex = {};
 	struct hns_roce_create_qp_ex cmd_ex = {};
@@ -1136,6 +1137,7 @@  static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
 				    &resp_ex.ibv_resp, sizeof(resp_ex));
 
 	qp->flags = resp_ex.drv_payload.cap_flags;
+	*dwqe_mmap_key = resp_ex.drv_payload.dwqe_mmap_key;
 
 	return ret;
 }
@@ -1187,11 +1189,23 @@  static int hns_roce_alloc_qp_buf(struct ibv_qp_init_attr_ex *attr,
 	return ret;
 }
 
+static int mmap_dwqe(struct ibv_context *ibv_ctx, struct hns_roce_qp *qp,
+		     uint64_t dwqe_mmap_key)
+{
+	qp->dwqe_page = mmap(NULL, HNS_ROCE_DWQE_PAGE_SIZE, PROT_WRITE,
+			     MAP_SHARED, ibv_ctx->cmd_fd, dwqe_mmap_key);
+	if (qp->dwqe_page == MAP_FAILED)
+		return -EINVAL;
+
+	return 0;
+}
+
 static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
 				struct ibv_qp_init_attr_ex *attr)
 {
 	struct hns_roce_context *context = to_hr_ctx(ibv_ctx);
 	struct hns_roce_qp *qp;
+	uint64_t dwqe_mmap_key;
 	int ret;
 
 	ret = verify_qp_create_attr(context, attr);
@@ -1210,7 +1224,7 @@  static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
 	if (ret)
 		goto err_buf;
 
-	ret = qp_exec_create_cmd(attr, qp, context);
+	ret = qp_exec_create_cmd(attr, qp, context, &dwqe_mmap_key);
 	if (ret)
 		goto err_cmd;
 
@@ -1218,10 +1232,18 @@  static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
 	if (ret)
 		goto err_store;
 
+	if (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE) {
+		ret = mmap_dwqe(ibv_ctx, qp, dwqe_mmap_key);
+		if (ret)
+			goto err_dwqe;
+	}
+
 	qp_setup_config(attr, qp, context);
 
 	return &qp->verbs_qp.qp;
 
+err_dwqe:
+	hns_roce_v2_clear_qp(context, qp);
 err_store:
 	ibv_cmd_destroy_qp(&qp->verbs_qp.qp);
 err_cmd:
diff --git a/util/mmio.h b/util/mmio.h
index 101af9dd..01d1455e 100644
--- a/util/mmio.h
+++ b/util/mmio.h
@@ -210,8 +210,33 @@  static inline void mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
 {
 	s390_mmio_write(dest, src, bytecnt);
 }
-#else
 
+#elif defined(__aarch64__) || defined(__arm__)
+#include <arm_neon.h>
+
+static inline void _mmio_memcpy_x64_64b(void *dest, const void *src)
+{
+	vst4q_u64(dest, vld4q_u64(src));
+}
+
+static inline void _mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
+{
+	do {
+		_mmio_memcpy_x64_64b(dest, src);
+		bytecnt -= sizeof(uint64x2x4_t);
+		src += sizeof(uint64x2x4_t);
+	} while (bytecnt > 0);
+}
+
+#define mmio_memcpy_x64(dest, src, bytecount)                                  \
+	({                                                                     \
+		if (__builtin_constant_p((bytecount) == 64))                   \
+			_mmio_memcpy_x64_64b((dest), (src));                   \
+		else                                                           \
+			_mmio_memcpy_x64((dest), (src), (bytecount));          \
+	})
+
+#else
 /* Transfer is some multiple of 64 bytes */
 static inline void mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
 {