@@ -52,6 +52,7 @@ enum {
};
enum {
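+ /* Internal: CQE consumed and common fields filled; the caller still
+ * has to fill the format-specific WC fields.
+ */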
+ CQ_CONTINUE = 1,
CQ_OK = 0,
CQ_EMPTY = -1,
CQ_POLL_ERR = -2
@@ -121,7 +122,9 @@ static void update_cons_index(struct mlx4_cq *cq)
*cq->set_ci_db = htonl(cq->cons_index & 0xffffff);
}
-static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc)
+static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe,
+ enum ibv_wc_status *status,
+ uint32_t *vendor_err)
{
if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR)
printf(PFX "local QP operation err "
@@ -133,64 +136,68 @@ static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc)
switch (cqe->syndrome) {
case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
- wc->status = IBV_WC_LOC_LEN_ERR;
+ *status = IBV_WC_LOC_LEN_ERR;
break;
case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
- wc->status = IBV_WC_LOC_QP_OP_ERR;
+ *status = IBV_WC_LOC_QP_OP_ERR;
break;
case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
- wc->status = IBV_WC_LOC_PROT_ERR;
+ *status = IBV_WC_LOC_PROT_ERR;
break;
case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
- wc->status = IBV_WC_WR_FLUSH_ERR;
+ *status = IBV_WC_WR_FLUSH_ERR;
break;
case MLX4_CQE_SYNDROME_MW_BIND_ERR:
- wc->status = IBV_WC_MW_BIND_ERR;
+ *status = IBV_WC_MW_BIND_ERR;
break;
case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
- wc->status = IBV_WC_BAD_RESP_ERR;
+ *status = IBV_WC_BAD_RESP_ERR;
break;
case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
- wc->status = IBV_WC_LOC_ACCESS_ERR;
+ *status = IBV_WC_LOC_ACCESS_ERR;
break;
case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
- wc->status = IBV_WC_REM_INV_REQ_ERR;
+ *status = IBV_WC_REM_INV_REQ_ERR;
break;
case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
- wc->status = IBV_WC_REM_ACCESS_ERR;
+ *status = IBV_WC_REM_ACCESS_ERR;
break;
case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
- wc->status = IBV_WC_REM_OP_ERR;
+ *status = IBV_WC_REM_OP_ERR;
break;
case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
- wc->status = IBV_WC_RETRY_EXC_ERR;
+ *status = IBV_WC_RETRY_EXC_ERR;
break;
case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
- wc->status = IBV_WC_RNR_RETRY_EXC_ERR;
+ *status = IBV_WC_RNR_RETRY_EXC_ERR;
break;
case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
- wc->status = IBV_WC_REM_ABORT_ERR;
+ *status = IBV_WC_REM_ABORT_ERR;
break;
default:
- wc->status = IBV_WC_GENERAL_ERR;
+ *status = IBV_WC_GENERAL_ERR;
break;
}
- wc->vendor_err = cqe->vendor_err;
+ *vendor_err = cqe->vendor_err;
}
-static int mlx4_poll_one(struct mlx4_cq *cq,
- struct mlx4_qp **cur_qp,
- struct ibv_wc *wc)
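+/*
+ * Common CQE handling shared by mlx4_poll_one() and _mlx4_poll_one_ex():
+ * consumes one CQE, updates the SQ/RQ/SRQ state and fills the fields
+ * common to both WC formats. Returns CQ_CONTINUE when the caller still
+ * has to fill the format-specific fields from *pcqe.
+ */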
+static inline int mlx4_handle_cq(struct mlx4_cq *cq,
+ struct mlx4_qp **cur_qp,
+ uint64_t *wc_wr_id,
+ enum ibv_wc_status *wc_status,
+ uint32_t *wc_vendor_err,
+ struct mlx4_cqe **pcqe,
+ uint32_t *pqpn,
+ int *pis_send)
{
struct mlx4_wq *wq;
struct mlx4_cqe *cqe;
struct mlx4_srq *srq;
uint32_t qpn;
- uint32_t g_mlpath_rqpn;
- uint16_t wqe_index;
int is_error;
int is_send;
+ uint16_t wqe_index;
cqe = next_cqe_sw(cq);
if (!cqe)
@@ -201,7 +208,7 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
++cq->cons_index;
- VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);
+ VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof(*cqe));
/*
* Make sure we read CQ entry contents after we've checked the
@@ -210,7 +217,6 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
rmb();
qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
- wc->qp_num = qpn;
is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
@@ -243,26 +249,50 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
if (is_send) {
wq = &(*cur_qp)->sq;
wqe_index = ntohs(cqe->wqe_index);
- wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
- wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ wq->tail += (uint16_t)(wqe_index - (uint16_t)wq->tail);
+ *wc_wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
++wq->tail;
} else if (srq) {
wqe_index = htons(cqe->wqe_index);
- wc->wr_id = srq->wrid[wqe_index];
+ *wc_wr_id = srq->wrid[wqe_index];
mlx4_free_srq_wqe(srq, wqe_index);
} else {
wq = &(*cur_qp)->rq;
- wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ *wc_wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
++wq->tail;
}
if (is_error) {
- mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+ mlx4_handle_error_cqe((struct mlx4_err_cqe *)cqe,
+ wc_status, wc_vendor_err);
return CQ_OK;
}
- wc->status = IBV_WC_SUCCESS;
+ *wc_status = IBV_WC_SUCCESS;
+ *pcqe = cqe;
+ *pqpn = qpn;
+ *pis_send = is_send;
+
+ return CQ_CONTINUE;
+}
+
+static int mlx4_poll_one(struct mlx4_cq *cq,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc)
+{
+ struct mlx4_cqe *cqe;
+ uint32_t qpn;
+ uint32_t g_mlpath_rqpn;
+ int is_send;
+ int err;
+
+ err = mlx4_handle_cq(cq, cur_qp, &wc->wr_id, &wc->status,
+ &wc->vendor_err, &cqe, &qpn, &is_send);
+ if (err != CQ_CONTINUE)
+ return err;
+
+ wc->qp_num = qpn;
if (is_send) {
wc->wc_flags = 0;
switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
@@ -340,6 +370,195 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
return CQ_OK;
}
+union wc_buffer {
+ uint8_t *b8;
+ uint16_t *b16;
+ uint32_t *b32;
+ uint64_t *b64;
+};
+
+static inline int _mlx4_poll_one_ex(struct mlx4_cq *cq,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc_ex **pwc_ex,
+ uint64_t wc_flags)
+{
+ struct mlx4_cqe *cqe;
+ uint32_t qpn;
+ uint32_t g_mlpath_rqpn;
+ int is_send;
+ struct ibv_wc_ex *wc_ex = *pwc_ex;
+ union wc_buffer wc_buffer;
+ int err;
+ uint64_t wc_flags_out = 0;
+
+ wc_buffer.b64 = (uint64_t *)&wc_ex->buffer;
+ wc_ex->wc_flags = 0;
+ wc_ex->reserved = 0;
+ err = mlx4_handle_cq(cq, cur_qp, &wc_ex->wr_id, &wc_ex->status,
+ &wc_ex->vendor_err, &cqe, &qpn, &is_send);
+ if (err != CQ_CONTINUE)
+ return err;
+
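+ /* The optional-field layout is fixed by the wc_flags the CQ was
+ * created with: a slot is reserved (and skipped when this CQE has no
+ * such field) for every requested flag, while wc_flags_out marks the
+ * slots that actually carry valid data.
+ */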
+ if (is_send) {
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ wc_flags_out |= IBV_WC_EX_IMM;
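+ /* fall through */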
+ case MLX4_OPCODE_RDMA_WRITE:
+ wc_ex->opcode = IBV_WC_RDMA_WRITE;
+ if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+ wc_buffer.b32++;
+ if (wc_flags & IBV_WC_EX_WITH_IMM)
+ wc_buffer.b32++;
+ break;
+ case MLX4_OPCODE_SEND_IMM:
+ wc_flags_out |= IBV_WC_EX_IMM;
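+ /* fall through */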
+ case MLX4_OPCODE_SEND:
+ wc_ex->opcode = IBV_WC_SEND;
+ if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+ wc_buffer.b32++;
+ if (wc_flags & IBV_WC_EX_WITH_IMM)
+ wc_buffer.b32++;
+ break;
+ case MLX4_OPCODE_RDMA_READ:
+ wc_ex->opcode = IBV_WC_RDMA_READ;
+ if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+ *wc_buffer.b32++ = ntohl(cqe->byte_cnt);
+ wc_flags_out |= IBV_WC_EX_WITH_BYTE_LEN;
+ }
+ if (wc_flags & IBV_WC_EX_WITH_IMM)
+ wc_buffer.b32++;
+ break;
+ case MLX4_OPCODE_ATOMIC_CS:
+ wc_ex->opcode = IBV_WC_COMP_SWAP;
+ if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+ *wc_buffer.b32++ = 8;
+ wc_flags_out |= IBV_WC_EX_WITH_BYTE_LEN;
+ }
+ if (wc_flags & IBV_WC_EX_WITH_IMM)
+ wc_buffer.b32++;
+ break;
+ case MLX4_OPCODE_ATOMIC_FA:
+ wc_ex->opcode = IBV_WC_FETCH_ADD;
+ if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+ *wc_buffer.b32++ = 8;
+ wc_flags_out |= IBV_WC_EX_WITH_BYTE_LEN;
+ }
+ if (wc_flags & IBV_WC_EX_WITH_IMM)
+ wc_buffer.b32++;
+ break;
+ case MLX4_OPCODE_BIND_MW:
+ wc_ex->opcode = IBV_WC_BIND_MW;
+ if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+ wc_buffer.b32++;
+ if (wc_flags & IBV_WC_EX_WITH_IMM)
+ wc_buffer.b32++;
+ break;
+ default:
+ /* assume it's a send completion */
+ wc_ex->opcode = IBV_WC_SEND;
+ if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+ wc_buffer.b32++;
+ if (wc_flags & IBV_WC_EX_WITH_IMM)
+ wc_buffer.b32++;
+ break;
+ }
+
+ if (wc_flags & IBV_WC_EX_WITH_QP_NUM) {
+ *wc_buffer.b32++ = qpn;
+ wc_flags_out |= IBV_WC_EX_WITH_QP_NUM;
+ }
+ if (wc_flags & IBV_WC_EX_WITH_SRC_QP)
+ wc_buffer.b32++;
+ if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX)
+ wc_buffer.b16++;
+ if (wc_flags & IBV_WC_EX_WITH_SLID)
+ wc_buffer.b16++;
+ if (wc_flags & IBV_WC_EX_WITH_SL)
+ wc_buffer.b8++;
+ if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
+ wc_buffer.b8++;
+ } else {
+ if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+ *wc_buffer.b32++ = ntohl(cqe->byte_cnt);
+ wc_flags_out |= IBV_WC_EX_WITH_BYTE_LEN;
+ }
+
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ wc_ex->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
+ wc_flags_out |= IBV_WC_EX_IMM;
+ if (wc_flags & IBV_WC_EX_WITH_IMM) {
+ *wc_buffer.b32++ = cqe->immed_rss_invalid;
+ wc_flags_out |= IBV_WC_EX_WITH_IMM;
+ }
+ break;
+ case MLX4_RECV_OPCODE_SEND:
+ wc_ex->opcode = IBV_WC_RECV;
+ if (wc_flags & IBV_WC_EX_WITH_IMM)
+ wc_buffer.b32++;
+ break;
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc_ex->opcode = IBV_WC_RECV;
+ wc_flags_out |= IBV_WC_EX_IMM;
+ if (wc_flags & IBV_WC_EX_WITH_IMM) {
+ *wc_buffer.b32++ = cqe->immed_rss_invalid;
+ wc_flags_out |= IBV_WC_EX_WITH_IMM;
+ }
+ break;
+ }
+
+ if (wc_flags & IBV_WC_EX_WITH_QP_NUM) {
+ *wc_buffer.b32++ = qpn;
+ wc_flags_out |= IBV_WC_EX_WITH_QP_NUM;
+ }
+ g_mlpath_rqpn = ntohl(cqe->g_mlpath_rqpn);
+ if (wc_flags & IBV_WC_EX_WITH_SRC_QP) {
+ *wc_buffer.b32++ = g_mlpath_rqpn & 0xffffff;
+ wc_flags_out |= IBV_WC_EX_WITH_SRC_QP;
+ }
+ if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX) {
+ *wc_buffer.b16++ = ntohl(cqe->immed_rss_invalid) & 0x7f;
+ wc_flags_out |= IBV_WC_EX_WITH_PKEY_INDEX;
+ }
+ if (wc_flags & IBV_WC_EX_WITH_SLID) {
+ *wc_buffer.b16++ = ntohs(cqe->rlid);
+ wc_flags_out |= IBV_WC_EX_WITH_SLID;
+ }
+ if (wc_flags & IBV_WC_EX_WITH_SL) {
+ wc_flags_out |= IBV_WC_EX_WITH_SL;
+ if ((*cur_qp) && (*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET)
+ *wc_buffer.b8++ = ntohs(cqe->sl_vid) >> 13;
+ else
+ *wc_buffer.b8++ = ntohs(cqe->sl_vid) >> 12;
+ }
+ if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) {
+ *wc_buffer.b8++ = (g_mlpath_rqpn >> 24) & 0x7f;
+ wc_flags_out |= IBV_WC_EX_WITH_DLID_PATH_BITS;
+ }
+ wc_flags_out |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_EX_GRH : 0;
+ /* With XRC SRQs there is no QP from which to read the link layer,
+ * so the IB SL shift is used above; RoCE handling is TBD.
+ */
+ }
+
+ wc_ex->wc_flags = wc_flags_out;
+ /* Round up to the next 64-bit boundary: struct ibv_wc_ex is 64-bit
+ * aligned, and *pwc_ex is where the next WC will be written, so it
+ * must be aligned as well.
+ */
+ *pwc_ex = (struct ibv_wc_ex *)((uintptr_t)(wc_buffer.b8 + sizeof(uint64_t) - 1) &
+ ~(sizeof(uint64_t) - 1));
+
+ return CQ_OK;
+}
+
+int mlx4_poll_one_ex(struct mlx4_cq *cq,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc_ex **pwc_ex)
+{
+ return _mlx4_poll_one_ex(cq, cur_qp, pwc_ex, cq->wc_flags);
+}
+
int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
{
struct mlx4_cq *cq = to_mcq(ibcq);
@@ -363,6 +582,36 @@ int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
return err == CQ_POLL_ERR ? err : npolled;
}
+int mlx4_poll_cq_ex(struct ibv_cq *ibcq,
+ struct ibv_wc_ex *wc,
+ struct ibv_poll_cq_ex_attr *attr)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_qp *qp = NULL;
+ int npolled;
+ int err = CQ_OK;
+ unsigned int ne = attr->max_entries;
+ uint64_t wc_flags = cq->wc_flags;
+
+ if (attr->comp_mask)
+ return -EINVAL;
+
+ pthread_spin_lock(&cq->lock);
+
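+ /* _mlx4_poll_one_ex() advances wc past the variable-size entry it has
+ * just written, so successive iterations fill the caller's buffer back
+ * to back.
+ */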
+ for (npolled = 0; npolled < ne; ++npolled) {
+ err = _mlx4_poll_one_ex(cq, &qp, &wc, wc_flags);
+ if (err != CQ_OK)
+ break;
+ }
+
+ if (npolled || err == CQ_POLL_ERR)
+ update_cons_index(cq);
+
+ pthread_spin_unlock(&cq->lock);
+
+ return err == CQ_POLL_ERR ? err : npolled;
+}
+
int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
{
struct mlx4_cq *cq = to_mcq(ibvcq);
@@ -209,6 +209,7 @@ static int mlx4_init_context(struct verbs_device *v_device,
verbs_set_ctx_op(verbs_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
verbs_set_ctx_op(verbs_ctx, query_device_ex, mlx4_query_device_ex);
verbs_set_ctx_op(verbs_ctx, create_cq_ex, mlx4_create_cq_ex);
+ verbs_set_ctx_op(verbs_ctx, poll_cq_ex, mlx4_poll_cq_ex);
return 0;
@@ -213,6 +213,7 @@ struct mlx4_pd {
struct mlx4_cq {
struct ibv_cq ibv_cq;
+ uint64_t wc_flags;
struct mlx4_buf buf;
struct mlx4_buf resize_buf;
pthread_spinlock_t lock;
@@ -410,6 +411,9 @@ int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
int mlx4_resize_cq(struct ibv_cq *cq, int cqe);
int mlx4_destroy_cq(struct ibv_cq *cq);
int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc);
+int mlx4_poll_cq_ex(struct ibv_cq *ibcq,
+ struct ibv_wc_ex *wc,
+ struct ibv_poll_cq_ex_attr *attr);
int mlx4_arm_cq(struct ibv_cq *cq, int solicited);
void mlx4_cq_event(struct ibv_cq *cq);
void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq);
@@ -387,6 +387,7 @@ static struct ibv_cq *create_cq(struct ibv_context *context,
goto err_db;
cq->creation_flags = cmd_e.ibv_cmd.flags;
+ cq->wc_flags = cq_attr->wc_flags;
cq->cqn = resp.cqn;
return &cq->ibv_cq;
Add an implementation for verb_poll_cq extension verb. This patch implements the new API via the standard function mlx4_poll_one. Signed-off-by: Matan Barak <matanb@mellanox.com> --- src/cq.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------ src/mlx4.c | 1 + src/mlx4.h | 4 + src/verbs.c | 1 + 4 files changed, 284 insertions(+), 29 deletions(-)