| Message ID | 1622194379-59868-5-git-send-email-liweihang@huawei.com (mailing list archive) |
|---|---|
| State | Changes Requested |
| Series | libhns: Add support for direct WQE |
On Fri, May 28, 2021 at 05:32:59PM +0800, Weihang Li wrote: > diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c > index aa57cc4..28d455b 100644 > +++ b/providers/hns/hns_roce_u_hw_v2.c > @@ -33,10 +33,15 @@ > #define _GNU_SOURCE > #include <stdio.h> > #include <string.h> > +#include <sys/mman.h> > #include "hns_roce_u.h" > #include "hns_roce_u_db.h" > #include "hns_roce_u_hw_v2.h" > > +#if defined(__aarch64__) || defined(__arm__) > +#include <arm_neon.h> > +#endif > + > #define HR_IBV_OPC_MAP(ib_key, hr_key) \ > [IBV_WR_ ## ib_key] = HNS_ROCE_WQE_OP_ ## hr_key > > @@ -313,6 +318,39 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, > (__le32 *)&sq_db); > } > > +static inline void hns_roce_write512(uint64_t *dest, uint64_t *val) > +{ > +#if defined(__aarch64__) || defined(__arm__) > + uint64x2x4_t dwqe; > + > + /* Load multiple 4-element structures to 4 registers */ > + dwqe = vld4q_u64(val); > + /* store multiple 4-element structures from 4 registers */ > + vst4q_u64(dest, dwqe); > +#else > + int i; > + > + for (i = 0; i < HNS_ROCE_WRITE_TIMES; i++) > + hns_roce_write64(dest + i, val + HNS_ROCE_WORD_NUM * i); > +#endif > +}

No code like this in providers. This should be done similarly to how
SSE is handled on x86

This is

mmio_memcpy_x64(dest, val, 64);

The above should be conditionalized to trigger NEON

#if defined(__aarch64__) || defined(__arm__)
static inline void __mmio_memcpy_x64_64b(..)
{..
    vst4q_u64(dest, vld4q_u64(src))
..}
#endif

#define mmio_memcpy_x64(dest, src, bytecount)
  ({if (__builtin_constant_p(bytecount == 64)
       __mmio_memcpy_x64_64b(dest,src,bytecount)
  ...

And I'm not sure what barriers you need for prot_device, but certainly
more than none. If you don't know then use the WC barriers

Jason
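For readers following the suggestion, here is a minimal, self-contained sketch of that shape with the parentheses balanced. The names `__mmio_memcpy_x64_64b` and `__mmio_memcpy_x64_generic` are illustrative placeholders, not the real util/mmio.h implementation, and the 128-bit ld4/st4 intrinsics are only provided on AArch64, so the sketch guards on `__aarch64__` alone:

```c
/* Illustrative sketch only -- not rdma-core's real mmio_memcpy_x64(). */
#include <stddef.h>
#include <stdint.h>

/* Stand-in for the generic path: plain 64-bit stores, in order. */
static inline void __mmio_memcpy_x64_generic(volatile uint64_t *dest,
					     const uint64_t *src,
					     size_t bytecount)
{
	size_t i;

	for (i = 0; i < bytecount / sizeof(uint64_t); i++)
		dest[i] = src[i];
}

#if defined(__aarch64__)
#include <arm_neon.h>

/* 64-byte fast path: one LD4/ST4 pair moves the whole WQE. */
static inline void __mmio_memcpy_x64_64b(uint64_t *dest, const uint64_t *src)
{
	vst4q_u64(dest, vld4q_u64(src));
}

#define mmio_memcpy_x64(dest, src, bytecount)                                  \
	do {                                                                   \
		if (__builtin_constant_p(bytecount) && (bytecount) == 64)      \
			__mmio_memcpy_x64_64b((uint64_t *)(dest),              \
					      (const uint64_t *)(src));        \
		else                                                           \
			__mmio_memcpy_x64_generic((volatile uint64_t *)(dest), \
						  (const uint64_t *)(src),     \
						  (bytecount));                \
	} while (0)
#else
#define mmio_memcpy_x64(dest, src, bytecount)                       \
	__mmio_memcpy_x64_generic((volatile uint64_t *)(dest),       \
				  (const uint64_t *)(src), (bytecount))
#endif
```

The `__builtin_constant_p()` test keeps the NEON path limited to the compile-time 64-byte case, which is exactly the Direct WQE size.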
On 2021/6/4 22:50, Jason Gunthorpe wrote: > On Fri, May 28, 2021 at 05:32:59PM +0800, Weihang Li wrote: >> diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c >> index aa57cc4..28d455b 100644 >> +++ b/providers/hns/hns_roce_u_hw_v2.c >> @@ -33,10 +33,15 @@ >> #define _GNU_SOURCE >> #include <stdio.h> >> #include <string.h> >> +#include <sys/mman.h> >> #include "hns_roce_u.h" >> #include "hns_roce_u_db.h" >> #include "hns_roce_u_hw_v2.h" >> >> +#if defined(__aarch64__) || defined(__arm__) >> +#include <arm_neon.h> >> +#endif >> + >> #define HR_IBV_OPC_MAP(ib_key, hr_key) \ >> [IBV_WR_ ## ib_key] = HNS_ROCE_WQE_OP_ ## hr_key >> >> @@ -313,6 +318,39 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, >> (__le32 *)&sq_db); >> } >> >> +static inline void hns_roce_write512(uint64_t *dest, uint64_t *val) >> +{ >> +#if defined(__aarch64__) || defined(__arm__) >> + uint64x2x4_t dwqe; >> + >> + /* Load multiple 4-element structures to 4 registers */ >> + dwqe = vld4q_u64(val); >> + /* store multiple 4-element structures from 4 registers */ >> + vst4q_u64(dest, dwqe); >> +#else >> + int i; >> + >> + for (i = 0; i < HNS_ROCE_WRITE_TIMES; i++) >> + hns_roce_write64(dest + i, val + HNS_ROCE_WORD_NUM * i); >> +#endif >> +} > > No code like this in providers. This should be done similiarly to how > SSE is handled on x86 > > This is > > mmio_memcpy_x64(dest, val, 64); > > The above should be conditionalized to trigger NEON > > #if defined(__aarch64__) || defined(__arm__) > static inline void __mmio_memcpy_x64_64b(..) > {.. > vst4q_u64(dest, vld4q_u64(src)) > ..} > #endif > > #define mmio_memcpy_x64(dest, src, bytecount) > ({if (__builtin_constant_p(bytecount == 64) > __mmio_memcpy_x64_64b(dest,src,bytecount) > ... >

OK, thank you.

> And I'm not sure what barriers you need for prot_device, but certainly > more than none. If you don't know then use the WC barriers >

ST4 instructions can guarantee the 64 bytes of data to be written at a time, so we don't need a barrier.

Weihang

> Jason >
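Incidentally, the interleaving done by vld4q_u64() followed by vst4q_u64() cancels out, so the pair behaves as a plain 64-byte copy; the discussion in this thread is about ordering and device-side atomicity, not the copy semantics. A small host-memory check (illustrative only, ordinary memory rather than MMIO, AArch64 only):

```c
/* Verify that the LD4/ST4 pair used for Direct WQE round-trips 64 bytes
 * unchanged, i.e. it acts as a plain 64-byte copy despite the interleaved
 * register layout. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#if defined(__aarch64__)
#include <arm_neon.h>

static void copy64_st4(uint64_t *dst, const uint64_t *src)
{
	vst4q_u64(dst, vld4q_u64(src));
}

int main(void)
{
	uint64_t src[8], dst[8] = { 0 };
	int i;

	for (i = 0; i < 8; i++)
		src[i] = 0x0123456789abcdefULL + i;

	copy64_st4(dst, src);
	assert(memcmp(dst, src, sizeof(src)) == 0);
	puts("ld4/st4 copy matches memcpy semantics");
	return 0;
}
#else
int main(void)
{
	puts("AArch64 only");
	return 0;
}
#endif
```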
On Fri, Jun 11, 2021 at 09:20:51AM +0000, liweihang wrote: > On 2021/6/4 22:50, Jason Gunthorpe wrote: > > On Fri, May 28, 2021 at 05:32:59PM +0800, Weihang Li wrote: > >> diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c > >> index aa57cc4..28d455b 100644 > >> +++ b/providers/hns/hns_roce_u_hw_v2.c > >> @@ -33,10 +33,15 @@ > >> #define _GNU_SOURCE > >> #include <stdio.h> > >> #include <string.h> > >> +#include <sys/mman.h> > >> #include "hns_roce_u.h" > >> #include "hns_roce_u_db.h" > >> #include "hns_roce_u_hw_v2.h" > >> > >> +#if defined(__aarch64__) || defined(__arm__) > >> +#include <arm_neon.h> > >> +#endif > >> + > >> #define HR_IBV_OPC_MAP(ib_key, hr_key) \ > >> [IBV_WR_ ## ib_key] = HNS_ROCE_WQE_OP_ ## hr_key > >> > >> @@ -313,6 +318,39 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, > >> (__le32 *)&sq_db); > >> } > >> > >> +static inline void hns_roce_write512(uint64_t *dest, uint64_t *val) > >> +{ > >> +#if defined(__aarch64__) || defined(__arm__) > >> + uint64x2x4_t dwqe; > >> + > >> + /* Load multiple 4-element structures to 4 registers */ > >> + dwqe = vld4q_u64(val); > >> + /* store multiple 4-element structures from 4 registers */ > >> + vst4q_u64(dest, dwqe); > >> +#else > >> + int i; > >> + > >> + for (i = 0; i < HNS_ROCE_WRITE_TIMES; i++) > >> + hns_roce_write64(dest + i, val + HNS_ROCE_WORD_NUM * i); > >> +#endif > >> +} > > > > No code like this in providers. This should be done similiarly to how > > SSE is handled on x86 > > > > This is > > > > mmio_memcpy_x64(dest, val, 64); > > > > The above should be conditionalized to trigger NEON > > > > #if defined(__aarch64__) || defined(__arm__) > > static inline void __mmio_memcpy_x64_64b(..) > > {.. > > vst4q_u64(dest, vld4q_u64(src)) > > ..} > > #endif > > > > #define mmio_memcpy_x64(dest, src, bytecount) > > ({if (__builtin_constant_p(bytecount == 64) > > __mmio_memcpy_x64_64b(dest,src,bytecount) > > ... > > > > OK, thank you. > > > And I'm not sure what barriers you need for prot_device, but certainly > > more than none. If you don't know then use the WC barriers > > > > ST4 instructions can guarantee the 64 bytes data to be wrote at a time, so we > don't need a barrier. arm is always a relaxed out of order storage model, you need barriers to ensure that the observance of the ST4 is in-order with the other writes that might be going on Jason
On 2021/6/11 19:31, Jason Gunthorpe wrote: > On Fri, Jun 11, 2021 at 09:20:51AM +0000, liweihang wrote: >> On 2021/6/4 22:50, Jason Gunthorpe wrote: >>> On Fri, May 28, 2021 at 05:32:59PM +0800, Weihang Li wrote: >>>> diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c >>>> index aa57cc4..28d455b 100644 >>>> +++ b/providers/hns/hns_roce_u_hw_v2.c >>>> @@ -33,10 +33,15 @@ >>>> #define _GNU_SOURCE >>>> #include <stdio.h> >>>> #include <string.h> >>>> +#include <sys/mman.h> >>>> #include "hns_roce_u.h" >>>> #include "hns_roce_u_db.h" >>>> #include "hns_roce_u_hw_v2.h" >>>> >>>> +#if defined(__aarch64__) || defined(__arm__) >>>> +#include <arm_neon.h> >>>> +#endif >>>> + >>>> #define HR_IBV_OPC_MAP(ib_key, hr_key) \ >>>> [IBV_WR_ ## ib_key] = HNS_ROCE_WQE_OP_ ## hr_key >>>> >>>> @@ -313,6 +318,39 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, >>>> (__le32 *)&sq_db); >>>> } >>>> >>>> +static inline void hns_roce_write512(uint64_t *dest, uint64_t *val) >>>> +{ >>>> +#if defined(__aarch64__) || defined(__arm__) >>>> + uint64x2x4_t dwqe; >>>> + >>>> + /* Load multiple 4-element structures to 4 registers */ >>>> + dwqe = vld4q_u64(val); >>>> + /* store multiple 4-element structures from 4 registers */ >>>> + vst4q_u64(dest, dwqe); >>>> +#else >>>> + int i; >>>> + >>>> + for (i = 0; i < HNS_ROCE_WRITE_TIMES; i++) >>>> + hns_roce_write64(dest + i, val + HNS_ROCE_WORD_NUM * i); >>>> +#endif >>>> +} >>> >>> No code like this in providers. This should be done similiarly to how >>> SSE is handled on x86 >>> >>> This is >>> >>> mmio_memcpy_x64(dest, val, 64); >>> >>> The above should be conditionalized to trigger NEON >>> >>> #if defined(__aarch64__) || defined(__arm__) >>> static inline void __mmio_memcpy_x64_64b(..) >>> {.. >>> vst4q_u64(dest, vld4q_u64(src)) >>> ..} >>> #endif >>> >>> #define mmio_memcpy_x64(dest, src, bytecount) >>> ({if (__builtin_constant_p(bytecount == 64) >>> __mmio_memcpy_x64_64b(dest,src,bytecount) >>> ... >>> >> >> OK, thank you. >> >>> And I'm not sure what barriers you need for prot_device, but certainly >>> more than none. If you don't know then use the WC barriers >>> >> >> ST4 instructions can guarantee the 64 bytes data to be wrote at a time, so we >> don't need a barrier. > > arm is always a relaxed out of order storage model, you need barriers > to ensure that the observance of the ST4 is in-order with the other > writes that might be going on > > Jason >

Hi Jason

Sorry for the late reply. Here is the post send flow of HIP08/09:

     +-----------+
     | post send |
     +-----+-----+
           |
     +-----+-----+
     | write WQE |
     +-----+-----+
           |
           | udma_to_device_barrier()
           |
     +-----+-----+  Y   +------------+  N
     |  HIP09 ?  +------+ multi WR ? +-------------+
     +-----+-----+      +-----+------+             |
           | N                | Y                  |
     +-----+-----+      +-----+-----+     +--------+--------+
     |  ring DB  |      |  ring DB  |     |direct WQE (ST4) |
     +-----------+      +-----------+     +-----------------+

After users call ibv_post_send, the driver writes the WQE into memory and adds a barrier to ensure that all of the WQE has been fully written. Then, for HIP09, we check whether there is only one WR; if so, we write the WQE into the PCI BAR space via ST4 instructions, and the hardware gets the WQE from there. If there is more than one WQE, we generate an SQ doorbell to tell the hardware to read the WQEs.

Direct WQE merges ringing the doorbell and handing the WQE to the hardware into one step, so the hardware avoids reading WQEs from memory after the doorbell is updated.
To the hardware, the ST4 store is as atomic as ringing the doorbell, and before the ST4 the WQE has been fully written into memory. So I think the current barrier is enough for Direct WQE.

If there are still any issues in this process, could you please tell us where to add the barrier? Thank you :)

Weihang
On Wed, Jun 16, 2021 at 09:55:45AM +0000, liweihang wrote: > If there is still any issues in this process, could you please tell us where to > add the barrier? Thank you :)

I don't know ARM perfectly well, but generally look at:

1) Do these special stores barrier with the spin unlock protecting the post send? Allowing them to leak out will get things out of order.

2) ARM MMIO stores are not ordered, so the DB store and the ST4 store are not guaranteed to execute in program order without a barrier. The spinlock is not an MMIO barrier.

You could ignore some of this when the DB rings were basically idempotent, but if you are transferring data it is more tricky. This is why we always see a barrier after a WC store, to put all future MMIO strongly in order with the store.

Jason
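The "WC barriers" Jason refers to are the write-combining helpers rdma-core providers already use around doorbell copies (mmio_wc_start() and mmio_flush_writes() from util/udma_barrier.h, together with mmio_memcpy_x64() from util/mmio.h). A hypothetical helper showing that conservative pattern applied to the DWQE page; whether hns actually needs all of these fences is exactly the open question here, and the helper name is illustrative:

```c
/*
 * Hypothetical helper (not part of the patch): the conservative pattern,
 * bracketing the 64-byte copy with rdma-core's WC barriers.
 */
#include <util/mmio.h>          /* mmio_memcpy_x64() */
#include <util/udma_barrier.h>  /* udma_to_device_barrier(), mmio_wc_start(), mmio_flush_writes() */

static inline void dwqe_write_with_wc_barriers(void *dwqe_page, const void *wqe)
{
	/* make the WQE contents in host memory visible before any MMIO store */
	udma_to_device_barrier();

	/* order the copy against earlier MMIO writes (e.g. a previous doorbell) */
	mmio_wc_start();

	/* 64-byte copy to the device page; SSE/NEON fast path where available */
	mmio_memcpy_x64(dwqe_page, wqe, 64);

	/* keep any later MMIO strongly ordered after this store */
	mmio_flush_writes();
}
```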
On 2021/6/17 3:14, Jason Gunthorpe wrote: > On Wed, Jun 16, 2021 at 09:55:45AM +0000, liweihang wrote: > >> If there is still any issues in this process, could you please tell us where to >> add the barrier? Thank you :) > > I don't know ARM perfectly well, but generally look at > > 1) Do these special stores barrier with the spin unlock protecting > the post send? Allowing them to leak out will get things out of > order

I do not think we need to rely on the spin unlock to ensure correct ordering for the ST4 store. The ST4 store is similar to the DB store; the difference is that the DB store writes 8 bytes to the device's MMIO space while the ST4 store writes 64 bytes. The ST4 store can be ordered by udma_to_device_barrier() as well, which means we can also use udma_to_device_barrier() to ensure correct ordering between the ST4 store and the DB store.

> > 2) ARM MMIO stores are not ordered, so that DB store the ST4 store > are not guaranteed to execute in program order without a barrier. > The spinlock is not a MMIO barrier >

As there is udma_to_device_barrier() between each round of post send, we can guarantee that the last DB store/ST4 store reaches the device before the next DB store/ST4 store is issued.

> You could ignore some of this when the DB rings were basically > idempotent, but if you are xfering data it is more tricky. This is why > we always see a barrier after a WC store to put all future MMIO > strongly in order with the store. > > Jason >

The ST4 store writes the doorbell and the content of the WQE to the RoCE engine, and it ensures that the doorbell and the WQE content both reach the engine at the same time. We tried to avoid a WC store by using the ST4 store here, as a WC store might need a different barrier in order to flush the data to the device.

Thanks
Weihang
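To summarize the conclusion, a schematic rendering of the flow chart above in C-like pseudocode (not compilable on its own: the if/else mirrors the post-send hunk in the patch below, while the placement of udma_to_device_barrier() follows the description in this thread rather than any single hunk):

```c
/* Schematic pseudocode of the described flow -- not a verbatim patch excerpt. */
if (likely(nreq)) {
	qp->sq.head += nreq;

	/* WQE writes to host memory become visible before any MMIO store */
	udma_to_device_barrier();

	if (nreq == 1 && (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE))
		/* HIP09, single WR: one 64-byte ST4 store carries doorbell + WQE */
		hns_roce_write_dwqe(qp, wqe);
	else
		/* HIP08, or multiple WRs: ring the SQ doorbell, device fetches WQEs */
		hns_roce_update_sq_db(ctx, qp->verbs_qp.qp.qp_num, qp->sl, qp->sq.head);
}
```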
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h index 3c4b162..2ffb604 100644 --- a/providers/hns/hns_roce_u.h +++ b/providers/hns/hns_roce_u.h @@ -81,6 +81,8 @@ #define INVALID_SGE_LENGTH 0x80000000 +#define HNS_ROCE_DWQE_PAGE_SIZE 65536 + #define HNS_ROCE_ADDRESS_MASK 0xFFFFFFFF #define HNS_ROCE_ADDRESS_SHIFT 32 @@ -280,13 +282,14 @@ struct hns_roce_qp { struct hns_roce_sge_ex ex_sge; unsigned int next_sge; int port_num; - int sl; + uint8_t sl; unsigned int qkey; enum ibv_mtu path_mtu; struct hns_roce_rinl_buf rq_rinl_buf; unsigned long flags; int refcnt; /* specially used for XRC */ + void *dwqe_page; }; struct hns_roce_av { @@ -417,7 +420,7 @@ hns_roce_u_create_qp_ex(struct ibv_context *context, struct ibv_qp *hns_roce_u_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr); - +void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, struct hns_roce_qp *qp); int hns_roce_u_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c index aa57cc4..28d455b 100644 --- a/providers/hns/hns_roce_u_hw_v2.c +++ b/providers/hns/hns_roce_u_hw_v2.c @@ -33,10 +33,15 @@ #define _GNU_SOURCE #include <stdio.h> #include <string.h> +#include <sys/mman.h> #include "hns_roce_u.h" #include "hns_roce_u_db.h" #include "hns_roce_u_hw_v2.h" +#if defined(__aarch64__) || defined(__arm__) +#include <arm_neon.h> +#endif + #define HR_IBV_OPC_MAP(ib_key, hr_key) \ [IBV_WR_ ## ib_key] = HNS_ROCE_WQE_OP_ ## hr_key @@ -313,6 +318,39 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, (__le32 *)&sq_db); } +static inline void hns_roce_write512(uint64_t *dest, uint64_t *val) +{ +#if defined(__aarch64__) || defined(__arm__) + uint64x2x4_t dwqe; + + /* Load multiple 4-element structures to 4 registers */ + dwqe = vld4q_u64(val); + /* store multiple 4-element structures from 4 registers */ + vst4q_u64(dest, dwqe); +#else + int i; + + for (i = 0; i < HNS_ROCE_WRITE_TIMES; i++) + hns_roce_write64(dest + i, val + HNS_ROCE_WORD_NUM * i); +#endif +} + +static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) +{ + struct hns_roce_rc_sq_wqe *rc_sq_wqe = wqe; + + /* All kinds of DirectWQE have the same header field layout */ + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_FLAG_S, 1); + roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_DB_SL_L_M, + RC_SQ_WQE_BYTE_4_DB_SL_L_S, qp->sl); + roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_DB_SL_H_M, + RC_SQ_WQE_BYTE_4_DB_SL_H_S, qp->sl >> HNS_ROCE_SL_SHIFT); + roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_WQE_INDEX_M, + RC_SQ_WQE_BYTE_4_WQE_INDEX_S, qp->sq.head); + + hns_roce_write512(qp->dwqe_page, wqe); +} + static void hns_roce_v2_update_cq_cons_index(struct hns_roce_context *ctx, struct hns_roce_cq *cq) { @@ -342,8 +380,7 @@ static struct hns_roce_qp *hns_roce_v2_find_qp(struct hns_roce_context *ctx, return NULL; } -static void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, - struct hns_roce_qp *qp) +void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, struct hns_roce_qp *qp) { uint32_t qpn = qp->verbs_qp.qp.qp_num; uint32_t tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; @@ -1240,6 +1277,7 @@ int hns_roce_u_v2_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, break; case IBV_QPT_UD: ret = set_ud_wqe(wqe, qp, wr, nreq, &sge_info); + qp->sl = to_hr_ah(wr->wr.ud.ah)->av.sl; break; default: ret = EINVAL; @@ -1255,10 +1293,13 @@ out: if (likely(nreq)) { qp->sq.head += 
nreq; qp->next_sge = sge_info.start_idx; - udma_to_device_barrier(); - hns_roce_update_sq_db(ctx, ibvqp->qp_num, qp->sl, qp->sq.head); + if (nreq == 1 && (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE)) + hns_roce_write_dwqe(qp, wqe); + else + hns_roce_update_sq_db(ctx, qp->verbs_qp.qp.qp_num, qp->sl, + qp->sq.head); if (qp->flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) *(qp->sdb) = qp->sq.head & 0xffff; @@ -1564,6 +1605,9 @@ static int hns_roce_u_v2_destroy_qp(struct ibv_qp *ibqp) hns_roce_unlock_cqs(ibqp); + if (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE) + munmap(qp->dwqe_page, HNS_ROCE_DWQE_PAGE_SIZE); + hns_roce_free_qp_buf(qp, ctx); free(qp); diff --git a/providers/hns/hns_roce_u_hw_v2.h b/providers/hns/hns_roce_u_hw_v2.h index c13d82e..b319826 100644 --- a/providers/hns/hns_roce_u_hw_v2.h +++ b/providers/hns/hns_roce_u_hw_v2.h @@ -40,6 +40,8 @@ #define HNS_ROCE_CMDSN_MASK 0x3 +#define HNS_ROCE_SL_SHIFT 2 + /* V2 REG DEFINITION */ #define ROCEE_VF_DB_CFG0_OFFSET 0x0230 @@ -133,6 +135,8 @@ struct hns_roce_db { #define DB_BYTE_4_CMD_S 24 #define DB_BYTE_4_CMD_M GENMASK(27, 24) +#define DB_BYTE_4_FLAG_S 31 + #define DB_PARAM_SRQ_PRODUCER_COUNTER_S 0 #define DB_PARAM_SRQ_PRODUCER_COUNTER_M GENMASK(15, 0) @@ -216,8 +220,16 @@ struct hns_roce_rc_sq_wqe { }; #define RC_SQ_WQE_BYTE_4_OPCODE_S 0 -#define RC_SQ_WQE_BYTE_4_OPCODE_M \ - (((1UL << 5) - 1) << RC_SQ_WQE_BYTE_4_OPCODE_S) +#define RC_SQ_WQE_BYTE_4_OPCODE_M GENMASK(4, 0) + +#define RC_SQ_WQE_BYTE_4_DB_SL_L_S 5 +#define RC_SQ_WQE_BYTE_4_DB_SL_L_M GENMASK(6, 5) + +#define RC_SQ_WQE_BYTE_4_DB_SL_H_S 13 +#define RC_SQ_WQE_BYTE_4_DB_SL_H_M GENMASK(14, 13) + +#define RC_SQ_WQE_BYTE_4_WQE_INDEX_S 15 +#define RC_SQ_WQE_BYTE_4_WQE_INDEX_M GENMASK(30, 15) #define RC_SQ_WQE_BYTE_4_OWNER_S 7 @@ -239,6 +251,8 @@ struct hns_roce_rc_sq_wqe { #define RC_SQ_WQE_BYTE_4_RDMA_WRITE_S 22 +#define RC_SQ_WQE_BYTE_4_FLAG_S 31 + #define RC_SQ_WQE_BYTE_16_XRC_SRQN_S 0 #define RC_SQ_WQE_BYTE_16_XRC_SRQN_M \ (((1UL << 24) - 1) << RC_SQ_WQE_BYTE_16_XRC_SRQN_S) @@ -311,23 +325,12 @@ struct hns_roce_ud_sq_wqe { #define UD_SQ_WQE_OPCODE_S 0 #define UD_SQ_WQE_OPCODE_M GENMASK(4, 0) -#define UD_SQ_WQE_DB_SL_L_S 5 -#define UD_SQ_WQE_DB_SL_L_M GENMASK(6, 5) - -#define UD_SQ_WQE_DB_SL_H_S 13 -#define UD_SQ_WQE_DB_SL_H_M GENMASK(14, 13) - -#define UD_SQ_WQE_INDEX_S 15 -#define UD_SQ_WQE_INDEX_M GENMASK(30, 15) - #define UD_SQ_WQE_OWNER_S 7 #define UD_SQ_WQE_CQE_S 8 #define UD_SQ_WQE_SE_S 11 -#define UD_SQ_WQE_FLAG_S 31 - #define UD_SQ_WQE_PD_S 0 #define UD_SQ_WQE_PD_M GENMASK(23, 0) diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c index 7b44829..f97144e 100644 --- a/providers/hns/hns_roce_u_verbs.c +++ b/providers/hns/hns_roce_u_verbs.c @@ -1115,6 +1115,37 @@ static int hns_roce_store_qp(struct hns_roce_context *ctx, return 0; } +static off_t get_dwqe_mmap_offset(unsigned long qpn, int page_size, int cmd) +{ + off_t offset = 0; + unsigned long idx; + + idx = qpn * (HNS_ROCE_DWQE_PAGE_SIZE / page_size); + + hns_roce_mmap_set_command(cmd, &offset); + hns_roce_mmap_set_index(idx, &offset); + + return offset * page_size; +} + +static int mmap_dwqe(struct ibv_context *ibv_ctx, struct hns_roce_qp *qp) +{ + struct hns_roce_device *hr_dev = to_hr_dev(ibv_ctx->device); + int page_size = hr_dev->page_size; + off_t offset; + + offset = get_dwqe_mmap_offset(qp->verbs_qp.qp.qp_num, page_size, + HNS_ROCE_MMAP_DWQE_PAGE); + + qp->dwqe_page = mmap(NULL, HNS_ROCE_DWQE_PAGE_SIZE, PROT_WRITE, + MAP_SHARED, ibv_ctx->cmd_fd, offset); + + if (qp->dwqe_page == MAP_FAILED) + return 
-EINVAL; + + return 0; +} + static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr, struct hns_roce_qp *qp, struct hns_roce_context *ctx) @@ -1216,10 +1247,18 @@ static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx, if (ret) goto err_store; + if (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE) { + ret = mmap_dwqe(ibv_ctx, qp); + if (ret) + goto err_dwqe; + } + qp_setup_config(attr, qp, context); return &qp->verbs_qp.qp; +err_dwqe: + hns_roce_v2_clear_qp(context, qp); err_store: ibv_cmd_destroy_qp(&qp->verbs_qp.qp); err_cmd: