Message ID | 1445964755-13371-4-git-send-email-matanb@mellanox.com (mailing list archive) |
---|---|
State | Superseded |
On Tue, Oct 27, 2015 at 6:52 PM, Matan Barak <matanb@mellanox.com> wrote:
> Add an implementation for verb_poll_cq extension verb.
> This patch implements the new API via the standard
> function mlx4_poll_one.
>
> Signed-off-by: Matan Barak <matanb@mellanox.com>
> ---
>  src/cq.c    | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------
>  src/mlx4.c  |   1 +
>  src/mlx4.h  |   4 +
>  src/verbs.c |   1 +
>  4 files changed, 284 insertions(+), 29 deletions(-)
>
> [ rest of the quoted patch trimmed; the full diff appears below ]

This should have libmlx4 prefix.
diff --git a/src/cq.c b/src/cq.c
index 32c9070..c86e824 100644
--- a/src/cq.c
+++ b/src/cq.c
@@ -52,6 +52,7 @@ enum {
 };
 
 enum {
+	CQ_CONTINUE	=  1,
 	CQ_OK		=  0,
 	CQ_EMPTY	= -1,
 	CQ_POLL_ERR	= -2
@@ -121,7 +122,9 @@ static void update_cons_index(struct mlx4_cq *cq)
 	*cq->set_ci_db = htonl(cq->cons_index & 0xffffff);
 }
 
-static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc)
+static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe,
+				  enum ibv_wc_status *status,
+				  uint32_t *vendor_err)
 {
 	if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR)
 		printf(PFX "local QP operation err "
@@ -133,64 +136,68 @@ static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc)
 
 	switch (cqe->syndrome) {
 	case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
-		wc->status = IBV_WC_LOC_LEN_ERR;
+		*status = IBV_WC_LOC_LEN_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
-		wc->status = IBV_WC_LOC_QP_OP_ERR;
+		*status = IBV_WC_LOC_QP_OP_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
-		wc->status = IBV_WC_LOC_PROT_ERR;
+		*status = IBV_WC_LOC_PROT_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
-		wc->status = IBV_WC_WR_FLUSH_ERR;
+		*status = IBV_WC_WR_FLUSH_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_MW_BIND_ERR:
-		wc->status = IBV_WC_MW_BIND_ERR;
+		*status = IBV_WC_MW_BIND_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
-		wc->status = IBV_WC_BAD_RESP_ERR;
+		*status = IBV_WC_BAD_RESP_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
-		wc->status = IBV_WC_LOC_ACCESS_ERR;
+		*status = IBV_WC_LOC_ACCESS_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
-		wc->status = IBV_WC_REM_INV_REQ_ERR;
+		*status = IBV_WC_REM_INV_REQ_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
-		wc->status = IBV_WC_REM_ACCESS_ERR;
+		*status = IBV_WC_REM_ACCESS_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
-		wc->status = IBV_WC_REM_OP_ERR;
+		*status = IBV_WC_REM_OP_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
-		wc->status = IBV_WC_RETRY_EXC_ERR;
+		*status = IBV_WC_RETRY_EXC_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
-		wc->status = IBV_WC_RNR_RETRY_EXC_ERR;
+		*status = IBV_WC_RNR_RETRY_EXC_ERR;
 		break;
 	case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
-		wc->status = IBV_WC_REM_ABORT_ERR;
+		*status = IBV_WC_REM_ABORT_ERR;
 		break;
 	default:
-		wc->status = IBV_WC_GENERAL_ERR;
+		*status = IBV_WC_GENERAL_ERR;
 		break;
 	}
 
-	wc->vendor_err = cqe->vendor_err;
+	*vendor_err = cqe->vendor_err;
 }
 
-static int mlx4_poll_one(struct mlx4_cq *cq,
-			 struct mlx4_qp **cur_qp,
-			 struct ibv_wc *wc)
+static inline int mlx4_handle_cq(struct mlx4_cq *cq,
+				 struct mlx4_qp **cur_qp,
+				 uint64_t *wc_wr_id,
+				 enum ibv_wc_status *wc_status,
+				 uint32_t *wc_vendor_err,
+				 struct mlx4_cqe **pcqe,
+				 uint32_t *pqpn,
+				 int *pis_send)
 {
 	struct mlx4_wq *wq;
 	struct mlx4_cqe *cqe;
 	struct mlx4_srq *srq;
 	uint32_t qpn;
-	uint32_t g_mlpath_rqpn;
-	uint16_t wqe_index;
 	int is_error;
 	int is_send;
+	uint16_t wqe_index;
 
 	cqe = next_cqe_sw(cq);
 	if (!cqe)
@@ -201,7 +208,7 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 
 	++cq->cons_index;
 
-	VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);
+	VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof(*cqe));
 
 	/*
 	 * Make sure we read CQ entry contents after we've checked the
@@ -210,7 +217,6 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 	rmb();
 
 	qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
-	wc->qp_num = qpn;
 
 	is_send  = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
 	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
@@ -243,26 +249,50 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 	if (is_send) {
 		wq = &(*cur_qp)->sq;
 		wqe_index = ntohs(cqe->wqe_index);
-		wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		wq->tail += (uint16_t)(wqe_index - (uint16_t)wq->tail);
+		*wc_wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
 		++wq->tail;
 	} else if (srq) {
 		wqe_index = htons(cqe->wqe_index);
-		wc->wr_id = srq->wrid[wqe_index];
+		*wc_wr_id = srq->wrid[wqe_index];
 		mlx4_free_srq_wqe(srq, wqe_index);
 	} else {
 		wq = &(*cur_qp)->rq;
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		*wc_wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
 		++wq->tail;
 	}
 
 	if (is_error) {
-		mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+		mlx4_handle_error_cqe((struct mlx4_err_cqe *)cqe,
+				      wc_status, wc_vendor_err);
 		return CQ_OK;
 	}
 
-	wc->status = IBV_WC_SUCCESS;
+	*wc_status = IBV_WC_SUCCESS;
 
+	*pcqe = cqe;
+	*pqpn = qpn;
+	*pis_send = is_send;
+
+	return CQ_CONTINUE;
+}
+
+static int mlx4_poll_one(struct mlx4_cq *cq,
+			 struct mlx4_qp **cur_qp,
+			 struct ibv_wc *wc)
+{
+	struct mlx4_cqe *cqe;
+	uint32_t qpn;
+	uint32_t g_mlpath_rqpn;
+	int is_send;
+	int err;
+
+	err = mlx4_handle_cq(cq, cur_qp, &wc->wr_id, &wc->status,
+			     &wc->vendor_err, &cqe, &qpn, &is_send);
+	if (err != CQ_CONTINUE)
+		return err;
+
+	wc->qp_num = qpn;
 	if (is_send) {
 		wc->wc_flags = 0;
 		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
@@ -340,6 +370,195 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 	return CQ_OK;
 }
 
+union wc_buffer {
+	uint8_t		*b8;
+	uint16_t	*b16;
+	uint32_t	*b32;
+	uint64_t	*b64;
+};
+
+static inline int _mlx4_poll_one_ex(struct mlx4_cq *cq,
+				    struct mlx4_qp **cur_qp,
+				    struct ibv_wc_ex **pwc_ex,
+				    uint64_t wc_flags)
+{
+	struct mlx4_cqe *cqe;
+	uint32_t qpn;
+	uint32_t g_mlpath_rqpn;
+	int is_send;
+	struct ibv_wc_ex *wc_ex = *pwc_ex;
+	union wc_buffer wc_buffer;
+	int err;
+	uint64_t wc_flags_out = 0;
+
+	wc_buffer.b64 = (uint64_t *)&wc_ex->buffer;
+	wc_ex->wc_flags = 0;
+	wc_ex->reserved = 0;
+	err = mlx4_handle_cq(cq, cur_qp, &wc_ex->wr_id, &wc_ex->status,
+			     &wc_ex->vendor_err, &cqe, &qpn, &is_send);
+	if (err != CQ_CONTINUE)
+		return err;
+
+	if (is_send) {
+		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+		case MLX4_OPCODE_RDMA_WRITE_IMM:
+			wc_flags_out |= IBV_WC_EX_IMM;
+		case MLX4_OPCODE_RDMA_WRITE:
+			wc_ex->opcode = IBV_WC_RDMA_WRITE;
+			if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+				wc_buffer.b32++;
+			if (wc_flags & IBV_WC_EX_WITH_IMM)
+				wc_buffer.b32++;
+			break;
+		case MLX4_OPCODE_SEND_IMM:
+			wc_flags_out |= IBV_WC_EX_IMM;
+		case MLX4_OPCODE_SEND:
+			wc_ex->opcode = IBV_WC_SEND;
+			if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+				wc_buffer.b32++;
+			if (wc_flags & IBV_WC_EX_WITH_IMM)
+				wc_buffer.b32++;
+			break;
+		case MLX4_OPCODE_RDMA_READ:
+			wc_ex->opcode = IBV_WC_RDMA_READ;
+			if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+				*wc_buffer.b32++ = ntohl(cqe->byte_cnt);
+				wc_flags_out |= IBV_WC_EX_WITH_BYTE_LEN;
+			}
+			if (wc_flags & IBV_WC_EX_WITH_IMM)
+				wc_buffer.b32++;
+			break;
+		case MLX4_OPCODE_ATOMIC_CS:
+			wc_ex->opcode = IBV_WC_COMP_SWAP;
+			if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+				*wc_buffer.b32++ = 8;
+				wc_flags_out |= IBV_WC_EX_WITH_BYTE_LEN;
+			}
+			if (wc_flags & IBV_WC_EX_WITH_IMM)
+				wc_buffer.b32++;
+			break;
+		case MLX4_OPCODE_ATOMIC_FA:
+			wc_ex->opcode = IBV_WC_FETCH_ADD;
+			if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+				*wc_buffer.b32++ = 8;
+				wc_flags_out |= IBV_WC_EX_WITH_BYTE_LEN;
+			}
+			if (wc_flags & IBV_WC_EX_WITH_IMM)
+				wc_buffer.b32++;
+			break;
+		case MLX4_OPCODE_BIND_MW:
+			wc_ex->opcode = IBV_WC_BIND_MW;
+			if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+				wc_buffer.b32++;
+			if (wc_flags & IBV_WC_EX_WITH_IMM)
+				wc_buffer.b32++;
+			break;
+		default:
+			/* assume it's a send completion */
+			wc_ex->opcode = IBV_WC_SEND;
+			if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+				wc_buffer.b32++;
+			if (wc_flags & IBV_WC_EX_WITH_IMM)
+				wc_buffer.b32++;
+			break;
+		}
+
+		if (wc_flags & IBV_WC_EX_WITH_QP_NUM) {
+			*wc_buffer.b32++ = qpn;
+			wc_flags_out |= IBV_WC_EX_WITH_QP_NUM;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_SRC_QP)
+			wc_buffer.b32++;
+		if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX)
+			wc_buffer.b16++;
+		if (wc_flags & IBV_WC_EX_WITH_SLID)
+			wc_buffer.b16++;
+		if (wc_flags & IBV_WC_EX_WITH_SL)
+			wc_buffer.b8++;
+		if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
+			wc_buffer.b8++;
+	} else {
+		if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+			*wc_buffer.b32++ = ntohl(cqe->byte_cnt);
+			wc_flags_out |= IBV_WC_EX_WITH_BYTE_LEN;
+		}
+
+		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+		case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+			wc_ex->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
+			wc_flags_out |= IBV_WC_EX_IMM;
+			if (wc_flags & IBV_WC_EX_WITH_IMM) {
+				*wc_buffer.b32++ = cqe->immed_rss_invalid;
+				wc_flags_out |= IBV_WC_EX_WITH_IMM;
+			}
+			break;
+		case MLX4_RECV_OPCODE_SEND:
+			wc_ex->opcode = IBV_WC_RECV;
+			if (wc_flags & IBV_WC_EX_WITH_IMM)
+				wc_buffer.b32++;
+			break;
+		case MLX4_RECV_OPCODE_SEND_IMM:
+			wc_ex->opcode = IBV_WC_RECV;
+			wc_flags_out |= IBV_WC_EX_IMM;
+			if (wc_flags & IBV_WC_EX_WITH_IMM) {
+				*wc_buffer.b32++ = cqe->immed_rss_invalid;
+				wc_flags_out |= IBV_WC_EX_WITH_IMM;
+			}
+			break;
+		}
+
+		if (wc_flags & IBV_WC_EX_WITH_QP_NUM) {
+			*wc_buffer.b32++ = qpn;
+			wc_flags_out |= IBV_WC_EX_WITH_QP_NUM;
+		}
+		g_mlpath_rqpn = ntohl(cqe->g_mlpath_rqpn);
+		if (wc_flags & IBV_WC_EX_WITH_SRC_QP) {
+			*wc_buffer.b32++ = g_mlpath_rqpn & 0xffffff;
+			wc_flags_out |= IBV_WC_EX_WITH_SRC_QP;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX) {
+			*wc_buffer.b16++ = ntohl(cqe->immed_rss_invalid) & 0x7f;
+			wc_flags_out |= IBV_WC_EX_WITH_PKEY_INDEX;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_SLID) {
+			*wc_buffer.b16++ = ntohs(cqe->rlid);
+			wc_flags_out |= IBV_WC_EX_WITH_SLID;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_SL) {
+			wc_flags_out |= IBV_WC_EX_WITH_SL;
+			if ((*cur_qp) && (*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET)
+				*wc_buffer.b8++ = ntohs(cqe->sl_vid) >> 13;
+			else
+				*wc_buffer.b8++ = ntohs(cqe->sl_vid) >> 12;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) {
+			*wc_buffer.b8++ = (g_mlpath_rqpn >> 24) & 0x7f;
+			wc_flags_out |= IBV_WC_EX_WITH_DLID_PATH_BITS;
+		}
+		wc_flags_out |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_EX_GRH : 0;
+		/* When working with xrc srqs, don't have qp to check link layer.
+		 * Using IB SL, should consider Roce. (TBD)
+		 */
+	}
+
+	wc_ex->wc_flags = wc_flags_out;
+	/* Align the WC ex to the next 64bit. This is mandatory as ibv_wc_ex is
+	 * 64bit aligned. pwc_ex is used to write to the next wc and thus we
+	 * need to align it.
+	 */
+	*pwc_ex = (struct ibv_wc_ex *)((uintptr_t)(wc_buffer.b8 + sizeof(uint64_t) - 1) &
+				       ~(sizeof(uint64_t) - 1));
+
+	return CQ_OK;
+}
+
+int mlx4_poll_one_ex(struct mlx4_cq *cq,
+		     struct mlx4_qp **cur_qp,
+		     struct ibv_wc_ex **pwc_ex)
+{
+	return _mlx4_poll_one_ex(cq, cur_qp, pwc_ex, cq->wc_flags);
+}
+
 int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
 {
 	struct mlx4_cq *cq = to_mcq(ibcq);
@@ -363,6 +582,36 @@ int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
 	return err == CQ_POLL_ERR ? err : npolled;
 }
 
+int mlx4_poll_cq_ex(struct ibv_cq *ibcq,
+		    struct ibv_wc_ex *wc,
+		    struct ibv_poll_cq_ex_attr *attr)
+{
+	struct mlx4_cq *cq = to_mcq(ibcq);
+	struct mlx4_qp *qp = NULL;
+	int npolled;
+	int err = CQ_OK;
+	unsigned int ne = attr->max_entries;
+	uint64_t wc_flags = cq->wc_flags;
+
+	if (attr->comp_mask)
+		return -EINVAL;
+
+	pthread_spin_lock(&cq->lock);
+
+	for (npolled = 0; npolled < ne; ++npolled) {
+		err = _mlx4_poll_one_ex(cq, &qp, &wc, wc_flags);
+		if (err != CQ_OK)
+			break;
+	}
+
+	if (npolled || err == CQ_POLL_ERR)
+		update_cons_index(cq);
+
+	pthread_spin_unlock(&cq->lock);
+
+	return err == CQ_POLL_ERR ? err : npolled;
+}
+
 int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
 {
 	struct mlx4_cq *cq = to_mcq(ibvcq);
diff --git a/src/mlx4.c b/src/mlx4.c
index 9cfd013..cc1211f 100644
--- a/src/mlx4.c
+++ b/src/mlx4.c
@@ -209,6 +209,7 @@ static int mlx4_init_context(struct verbs_device *v_device,
 	verbs_set_ctx_op(verbs_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
 	verbs_set_ctx_op(verbs_ctx, query_device_ex, mlx4_query_device_ex);
 	verbs_set_ctx_op(verbs_ctx, create_cq_ex, mlx4_create_cq_ex);
+	verbs_set_ctx_op(verbs_ctx, poll_cq_ex, mlx4_poll_cq_ex);
 
 	return 0;
 
diff --git a/src/mlx4.h b/src/mlx4.h
index 91eb79c..e22f879 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -213,6 +213,7 @@ struct mlx4_pd {
 
 struct mlx4_cq {
 	struct ibv_cq			ibv_cq;
+	uint64_t			wc_flags;
 	struct mlx4_buf			buf;
 	struct mlx4_buf			resize_buf;
 	pthread_spinlock_t		lock;
@@ -410,6 +411,9 @@ int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
 int mlx4_resize_cq(struct ibv_cq *cq, int cqe);
 int mlx4_destroy_cq(struct ibv_cq *cq);
 int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc);
+int mlx4_poll_cq_ex(struct ibv_cq *ibcq,
+		    struct ibv_wc_ex *wc,
+		    struct ibv_poll_cq_ex_attr *attr);
 int mlx4_arm_cq(struct ibv_cq *cq, int solicited);
 void mlx4_cq_event(struct ibv_cq *cq);
 void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq);
diff --git a/src/verbs.c b/src/verbs.c
index 3290b86..0dcdc87 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -387,6 +387,7 @@ static struct ibv_cq *create_cq(struct ibv_context *context,
 		goto err_db;
 
 	cq->creation_flags = cmd_e.ibv_cmd.flags;
+	cq->wc_flags = cq_attr->wc_flags;
 	cq->cqn = resp.cqn;
 
 	return &cq->ibv_cq;
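A note on the layout this produces: _mlx4_poll_one_ex() packs the optional fields right after the fixed ibv_wc_ex header, in the order BYTE_LEN, IMM, QP_NUM, SRC_QP, PKEY_INDEX, SLID, SL, DLID_PATH_BITS. A slot is reserved for every field requested in the CQ's wc_flags, while wc_ex->wc_flags only marks the slots that hold valid data for this particular completion. The sketch below is not part of the patch and assumes the struct ibv_wc_ex definition and IBV_WC_EX_* constants from the libibverbs extension series this depends on; it shows how a consumer might read one of the 32-bit fields back out:

/* Sketch only; struct ibv_wc_ex and the IBV_WC_EX_* constants are assumed
 * from the companion libibverbs extension series. */
#include <stdint.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Read one of the 32-bit optional fields (BYTE_LEN, IMM, QP_NUM or SRC_QP)
 * out of a packed completion.  cq_wc_flags are the flags the CQ was created
 * with: they determine the slot offsets, whether or not a given slot was
 * actually filled for this completion. */
static int wc_ex_read_u32(const struct ibv_wc_ex *wc, uint64_t cq_wc_flags,
			  uint64_t field, uint32_t *val)
{
	const uint8_t *p = (const uint8_t *)&wc->buffer;
	const uint64_t order[] = {
		IBV_WC_EX_WITH_BYTE_LEN, IBV_WC_EX_WITH_IMM,
		IBV_WC_EX_WITH_QP_NUM, IBV_WC_EX_WITH_SRC_QP,
	};
	unsigned int i;

	for (i = 0; i < 4 && order[i] != field; i++)
		if (cq_wc_flags & order[i])
			p += sizeof(uint32_t);	/* skip the reserved slot */

	if (!(wc->wc_flags & field))
		return -1;	/* slot exists but was not filled for this CQE */

	memcpy(val, p, sizeof(*val));
	return 0;
}

The 16-bit (PKEY_INDEX, SLID) and 8-bit (SL, DLID_PATH_BITS) slots follow the 32-bit ones in the order listed above.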
Add an implementation for verb_poll_cq extension verb.
This patch implements the new API via the standard
function mlx4_poll_one.

Signed-off-by: Matan Barak <matanb@mellanox.com>
---
 src/cq.c    | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 src/mlx4.c  |   1 +
 src/mlx4.h  |   4 +
 src/verbs.c |   1 +
 4 files changed, 284 insertions(+), 29 deletions(-)
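For context, a rough sketch of how an application might drive the new path. The ibv_poll_cq_ex() wrapper name is an assumption based on the poll_cq_ex provider hook registered above; struct ibv_poll_cq_ex_attr, the IBV_WC_EX_WITH_* flags and the packed, 64-bit aligned ibv_wc_ex entries are as used in this patch. The CQ is assumed to have been created through the create_cq_ex path with the same wc_flags:

/* Sketch only; the wrapper and struct names noted above are assumptions. */
#include <stdlib.h>
#include <stdint.h>
#include <infiniband/verbs.h>

/* Per-entry stride: fixed header plus one slot per requested field, rounded
 * up to 64 bits, matching the alignment done at the end of
 * _mlx4_poll_one_ex(). */
static size_t wc_ex_stride(uint64_t wc_flags)
{
	size_t len = sizeof(struct ibv_wc_ex);

	if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)       len += 4;
	if (wc_flags & IBV_WC_EX_WITH_IMM)            len += 4;
	if (wc_flags & IBV_WC_EX_WITH_QP_NUM)         len += 4;
	if (wc_flags & IBV_WC_EX_WITH_SRC_QP)         len += 4;
	if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX)     len += 2;
	if (wc_flags & IBV_WC_EX_WITH_SLID)           len += 2;
	if (wc_flags & IBV_WC_EX_WITH_SL)             len += 1;
	if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) len += 1;

	return (len + 7) & ~(size_t)7;
}

/* cq must have been created with these wc_flags via the create_cq_ex path. */
static int drain_cq_ex(struct ibv_cq *cq, uint64_t wc_flags)
{
	struct ibv_poll_cq_ex_attr attr = { .max_entries = 16, .comp_mask = 0 };
	size_t stride = wc_ex_stride(wc_flags);
	uint8_t *buf = calloc(attr.max_entries, stride);
	int n, i;

	if (!buf)
		return -1;

	/* Returns the number of completions written, or a negative value
	 * (e.g. -EINVAL for a non-zero comp_mask) on error. */
	n = ibv_poll_cq_ex(cq, (struct ibv_wc_ex *)buf, &attr);
	for (i = 0; i < n; i++) {
		struct ibv_wc_ex *wc = (struct ibv_wc_ex *)(buf + i * stride);

		if (wc->status != IBV_WC_SUCCESS) {
			/* handle wc->wr_id / wc->vendor_err here */
		}
		/* optional fields follow wc->buffer as described earlier */
	}

	free(buf);
	return n;
}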