From patchwork Sun Nov 15 12:30:29 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Matan Barak <matanb@mellanox.com>
X-Patchwork-Id: 7618911
From: Matan Barak <matanb@mellanox.com>
To: Eli Cohen
Cc: linux-rdma@vger.kernel.org, Doug Ledford, Matan Barak,
	Eran Ben Elisha, Christoph Lameter
Subject: [PATCH libmlx5 2/7] Add ibv_poll_cq_ex support
Date: Sun, 15 Nov 2015 14:30:29 +0200
Message-Id: <1447590634-12858-3-git-send-email-matanb@mellanox.com>
X-Mailer: git-send-email 2.1.0
In-Reply-To: <1447590634-12858-1-git-send-email-matanb@mellanox.com>
References: <1447590634-12858-1-git-send-email-matanb@mellanox.com>
List-ID: <linux-rdma.vger.kernel.org>
X-Mailing-List: linux-rdma@vger.kernel.org

The extended poll_cq verb writes only the work completion fields the
caller asked for, so each completion entry carries just the attributes
requested in the CQ's wc_flags. Add support for this extended verb.
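For reviewers, a minimal sketch of how a consumer would walk the
completion buffer this verb fills in. This is illustration only, not
part of the patch: it assumes the ibv_poll_cq_ex() entry point,
struct ibv_wc_ex and the IBV_WC_EX_* flags from the matching
libibverbs series, that the flexible buffer starts right after the
fixed part of struct ibv_wc_ex, and that the requested fields are
packed in the order this patch writes them (byte_len, imm, qp_num,
src_qp, pkey_index, slid, sl, dlid_path_bits) with each entry rounded
up to 8 bytes. wc_ex_stride() and drain_cq() are hypothetical helpers:

#include <stdio.h>
#include <stdint.h>
#include <infiniband/verbs.h>

/* Hypothetical helper: per-entry stride of the completion buffer.
 * byte_len, imm, qp_num and src_qp are 32 bits each, pkey_index and
 * slid 16 bits, sl and dlid_path_bits 8 bits; entries are padded to
 * 8-byte alignment, mirroring how this patch advances pwc_ex. */
static size_t wc_ex_stride(uint64_t wc_flags)
{
	size_t len = sizeof(struct ibv_wc_ex);	/* fixed part only */

	if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
		len += sizeof(uint32_t);
	if (wc_flags & IBV_WC_EX_WITH_IMM)
		len += sizeof(uint32_t);
	if (wc_flags & IBV_WC_EX_WITH_QP_NUM)
		len += sizeof(uint32_t);
	if (wc_flags & IBV_WC_EX_WITH_SRC_QP)
		len += sizeof(uint32_t);
	if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX)
		len += sizeof(uint16_t);
	if (wc_flags & IBV_WC_EX_WITH_SLID)
		len += sizeof(uint16_t);
	if (wc_flags & IBV_WC_EX_WITH_SL)
		len += sizeof(uint8_t);
	if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
		len += sizeof(uint8_t);

	return (len + 7) & ~(size_t)7;
}

/* Hypothetical helper: poll up to max_entries completions into buf.
 * wc_flags must match the flags the CQ was created with. */
static int drain_cq(struct ibv_cq *cq, void *buf, uint32_t max_entries,
		    uint64_t wc_flags)
{
	struct ibv_poll_cq_ex_attr attr = { .max_entries = max_entries };
	struct ibv_wc_ex *wc = buf;
	size_t stride = wc_ex_stride(wc_flags);
	int n, i;

	n = ibv_poll_cq_ex(cq, wc, &attr);
	for (i = 0; i < n; i++) {
		/* Only fields flagged in wc->wc_flags were written. */
		if (wc->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
			printf("wr_id %llu: byte_len %u\n",
			       (unsigned long long)wc->wr_id,
			       *(uint32_t *)wc->buffer);
		wc = (struct ibv_wc_ex *)((uint8_t *)wc + stride);
	}

	return n;
}

Because the stride depends only on the wc_flags the CQ was created
with, a consumer computes it once and steps through the returned
entries without per-field parsing.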
Signed-off-by: Matan Barak <matanb@mellanox.com>
---
 src/cq.c   | 699 +++++++++++++++++++++++++++++++++++++++++++++++++------------
 src/mlx5.c |   5 +
 src/mlx5.h |  14 ++
 3 files changed, 584 insertions(+), 134 deletions(-)

diff --git a/src/cq.c b/src/cq.c
index 32f0dd4..0185696 100644
--- a/src/cq.c
+++ b/src/cq.c
@@ -200,6 +200,85 @@ static void handle_good_req(struct ibv_wc *wc, struct mlx5_cqe64 *cqe)
 	}
 }
 
+union wc_buffer {
+	uint8_t		*b8;
+	uint16_t	*b16;
+	uint32_t	*b32;
+	uint64_t	*b64;
+};
+
+static inline void handle_good_req_ex(struct ibv_wc_ex *wc_ex,
+				      union wc_buffer *pwc_buffer,
+				      struct mlx5_cqe64 *cqe,
+				      uint64_t wc_flags,
+				      uint32_t qpn)
+{
+	union wc_buffer wc_buffer = *pwc_buffer;
+
+	switch (ntohl(cqe->sop_drop_qpn) >> 24) {
+	case MLX5_OPCODE_RDMA_WRITE_IMM:
+		wc_ex->wc_flags |= IBV_WC_EX_IMM;
+	case MLX5_OPCODE_RDMA_WRITE:
+		wc_ex->opcode = IBV_WC_RDMA_WRITE;
+		if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+			wc_buffer.b32++;
+		if (wc_flags & IBV_WC_EX_WITH_IMM)
+			wc_buffer.b32++;
+		break;
+	case MLX5_OPCODE_SEND_IMM:
+		wc_ex->wc_flags |= IBV_WC_EX_IMM;
+	case MLX5_OPCODE_SEND:
+	case MLX5_OPCODE_SEND_INVAL:
+		wc_ex->opcode = IBV_WC_SEND;
+		if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+			wc_buffer.b32++;
+		if (wc_flags & IBV_WC_EX_WITH_IMM)
+			wc_buffer.b32++;
+		break;
+	case MLX5_OPCODE_RDMA_READ:
+		wc_ex->opcode = IBV_WC_RDMA_READ;
+		if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+			*wc_buffer.b32++ = ntohl(cqe->byte_cnt);
+			wc_ex->wc_flags |= IBV_WC_EX_WITH_BYTE_LEN;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_IMM)
+			wc_buffer.b32++;
+		break;
+	case MLX5_OPCODE_ATOMIC_CS:
+		wc_ex->opcode = IBV_WC_COMP_SWAP;
+		if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+			*wc_buffer.b32++ = 8;
+			wc_ex->wc_flags |= IBV_WC_EX_WITH_BYTE_LEN;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_IMM)
+			wc_buffer.b32++;
+		break;
+	case MLX5_OPCODE_ATOMIC_FA:
+		wc_ex->opcode = IBV_WC_FETCH_ADD;
+		if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+			*wc_buffer.b32++ = 8;
+			wc_ex->wc_flags |= IBV_WC_EX_WITH_BYTE_LEN;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_IMM)
+			wc_buffer.b32++;
+		break;
+	case MLX5_OPCODE_BIND_MW:
+		wc_ex->opcode = IBV_WC_BIND_MW;
+		if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+			wc_buffer.b32++;
+		if (wc_flags & IBV_WC_EX_WITH_IMM)
+			wc_buffer.b32++;
+		break;
+	}
+
+	if (wc_flags & IBV_WC_EX_WITH_QP_NUM) {
+		*wc_buffer.b32++ = qpn;
+		wc_ex->wc_flags |= IBV_WC_EX_WITH_QP_NUM;
+	}
+
+	*pwc_buffer = wc_buffer;
+}
+
 static int handle_responder(struct ibv_wc *wc, struct mlx5_cqe64 *cqe,
 			    struct mlx5_qp *qp, struct mlx5_srq *srq)
 {
@@ -262,6 +341,103 @@ static int handle_responder(struct ibv_wc *wc, struct mlx5_cqe64 *cqe,
 	return IBV_WC_SUCCESS;
 }
 
+static inline int handle_responder_ex(struct ibv_wc_ex *wc_ex,
+				      union wc_buffer *pwc_buffer,
+				      struct mlx5_cqe64 *cqe,
+				      struct mlx5_qp *qp, struct mlx5_srq *srq,
+				      uint64_t wc_flags, uint32_t qpn)
+{
+	uint16_t wqe_ctr;
+	struct mlx5_wq *wq;
+	uint8_t g;
+	union wc_buffer wc_buffer = *pwc_buffer;
+	int err = 0;
+	uint32_t byte_len = ntohl(cqe->byte_cnt);
+
+	if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN) {
+		*wc_buffer.b32++ = byte_len;
+		wc_ex->wc_flags |= IBV_WC_EX_WITH_BYTE_LEN;
+	}
+	if (srq) {
+		wqe_ctr = ntohs(cqe->wqe_counter);
+		wc_ex->wr_id = srq->wrid[wqe_ctr];
+		mlx5_free_srq_wqe(srq, wqe_ctr);
+		if (cqe->op_own & MLX5_INLINE_SCATTER_32)
+			err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe,
+						    byte_len);
+		else if (cqe->op_own & MLX5_INLINE_SCATTER_64)
+			err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe - 1,
+						    byte_len);
+	} else {
+		wq = &qp->rq;
+		wqe_ctr = wq->tail & (wq->wqe_cnt - 1);
+		wc_ex->wr_id = wq->wrid[wqe_ctr];
+		++wq->tail;
+		if (cqe->op_own & MLX5_INLINE_SCATTER_32)
+			err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe,
+						    byte_len);
+		else if (cqe->op_own & MLX5_INLINE_SCATTER_64)
+			err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe - 1,
+						    byte_len);
+	}
+	if (err)
+		return err;
+
+	switch (cqe->op_own >> 4) {
+	case MLX5_CQE_RESP_WR_IMM:
+		wc_ex->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
+		wc_ex->wc_flags |= IBV_WC_EX_IMM;
+		if (wc_flags & IBV_WC_EX_WITH_IMM) {
+			*wc_buffer.b32++ = ntohl(cqe->imm_inval_pkey);
+			wc_ex->wc_flags |= IBV_WC_EX_WITH_IMM;
+		}
+		break;
+	case MLX5_CQE_RESP_SEND:
+		wc_ex->opcode = IBV_WC_RECV;
+		if (wc_flags & IBV_WC_EX_WITH_IMM)
+			wc_buffer.b32++;
+		break;
+	case MLX5_CQE_RESP_SEND_IMM:
+		wc_ex->opcode = IBV_WC_RECV;
+		wc_ex->wc_flags |= IBV_WC_EX_IMM;
+		if (wc_flags & IBV_WC_EX_WITH_IMM) {
+			*wc_buffer.b32++ = ntohl(cqe->imm_inval_pkey);
+			wc_ex->wc_flags |= IBV_WC_EX_WITH_IMM;
+		}
+		break;
+	}
+	if (wc_flags & IBV_WC_EX_WITH_QP_NUM) {
+		*wc_buffer.b32++ = qpn;
+		wc_ex->wc_flags |= IBV_WC_EX_WITH_QP_NUM;
+	}
+	if (wc_flags & IBV_WC_EX_WITH_SRC_QP) {
+		*wc_buffer.b32++ = ntohl(cqe->flags_rqpn) & 0xffffff;
+		wc_ex->wc_flags |= IBV_WC_EX_WITH_SRC_QP;
+	}
+	if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX) {
+		*wc_buffer.b16++ = ntohl(cqe->imm_inval_pkey) & 0xffff;
+		wc_ex->wc_flags |= IBV_WC_EX_WITH_PKEY_INDEX;
+	}
+	if (wc_flags & IBV_WC_EX_WITH_SLID) {
+		*wc_buffer.b16++ = ntohs(cqe->slid);
+		wc_ex->wc_flags |= IBV_WC_EX_WITH_SLID;
+	}
+	if (wc_flags & IBV_WC_EX_WITH_SL) {
+		*wc_buffer.b8++ = (ntohl(cqe->flags_rqpn) >> 24) & 0xf;
+		wc_ex->wc_flags |= IBV_WC_EX_WITH_SL;
+	}
+	if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) {
+		*wc_buffer.b8++ = cqe->ml_path & 0x7f;
+		wc_ex->wc_flags |= IBV_WC_EX_WITH_DLID_PATH_BITS;
+	}
+
+	g = (ntohl(cqe->flags_rqpn) >> 28) & 3;
+	wc_ex->wc_flags |= g ? IBV_WC_EX_GRH : 0;
+
+	*pwc_buffer = wc_buffer;
+	return IBV_WC_SUCCESS;
+}
+
 static void dump_cqe(FILE *fp, void *buf)
 {
 	uint32_t *p = buf;
@@ -273,54 +449,55 @@ static void dump_cqe(FILE *fp, void *buf)
 }
 
 static void mlx5_handle_error_cqe(struct mlx5_err_cqe *cqe,
-				  struct ibv_wc *wc)
+				  uint32_t *pwc_status,
+				  uint32_t *pwc_vendor_err)
 {
 	switch (cqe->syndrome) {
 	case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
-		wc->status = IBV_WC_LOC_LEN_ERR;
+		*pwc_status = IBV_WC_LOC_LEN_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
-		wc->status = IBV_WC_LOC_QP_OP_ERR;
+		*pwc_status = IBV_WC_LOC_QP_OP_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
-		wc->status = IBV_WC_LOC_PROT_ERR;
+		*pwc_status = IBV_WC_LOC_PROT_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
-		wc->status = IBV_WC_WR_FLUSH_ERR;
+		*pwc_status = IBV_WC_WR_FLUSH_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_MW_BIND_ERR:
-		wc->status = IBV_WC_MW_BIND_ERR;
+		*pwc_status = IBV_WC_MW_BIND_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
-		wc->status = IBV_WC_BAD_RESP_ERR;
+		*pwc_status = IBV_WC_BAD_RESP_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
-		wc->status = IBV_WC_LOC_ACCESS_ERR;
+		*pwc_status = IBV_WC_LOC_ACCESS_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
-		wc->status = IBV_WC_REM_INV_REQ_ERR;
+		*pwc_status = IBV_WC_REM_INV_REQ_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
-		wc->status = IBV_WC_REM_ACCESS_ERR;
+		*pwc_status = IBV_WC_REM_ACCESS_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
-		wc->status = IBV_WC_REM_OP_ERR;
+		*pwc_status = IBV_WC_REM_OP_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
-		wc->status = IBV_WC_RETRY_EXC_ERR;
+		*pwc_status = IBV_WC_RETRY_EXC_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
-		wc->status = IBV_WC_RNR_RETRY_EXC_ERR;
+		*pwc_status = IBV_WC_RNR_RETRY_EXC_ERR;
 		break;
 	case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
-		wc->status = IBV_WC_REM_ABORT_ERR;
+		*pwc_status = IBV_WC_REM_ABORT_ERR;
 		break;
 	default:
-		wc->status = IBV_WC_GENERAL_ERR;
+		*pwc_status = IBV_WC_GENERAL_ERR;
 		break;
 	}
 
-	wc->vendor_err = cqe->vendor_err_synd;
+	*pwc_vendor_err = cqe->vendor_err_synd;
 }
 
 #if defined(__x86_64__) || defined (__i386__)
@@ -453,6 +630,171 @@ static inline int get_srq_ctx(struct mlx5_context *mctx,
 	return CQ_OK;
 }
 
+static inline void dump_cqe_debug(FILE *fp, struct mlx5_cq *cq, struct mlx5_cqe64 *cqe64)
+	__attribute__((always_inline));
+static inline void dump_cqe_debug(FILE *fp, struct mlx5_cq *cq, struct mlx5_cqe64 *cqe64)
+{
+#ifdef MLX5_DEBUG
+	if (mlx5_debug_mask & MLX5_DBG_CQ_CQE) {
+		mlx5_dbg(fp, MLX5_DBG_CQ_CQE, "dump cqe for cqn 0x%x:\n", cq->cqn);
+		dump_cqe(fp, cqe64);
+	}
+#endif
+}
+
+inline int mlx5_poll_one_cqe_req(struct mlx5_cq *cq,
+				 struct mlx5_resource **cur_rsc,
+				 void *cqe, uint32_t qpn, int cqe_ver,
+				 uint64_t *wr_id) __attribute__((always_inline));
+inline int mlx5_poll_one_cqe_req(struct mlx5_cq *cq,
+				 struct mlx5_resource **cur_rsc,
+				 void *cqe, uint32_t qpn, int cqe_ver,
+				 uint64_t *wr_id)
+{
+	struct mlx5_context *mctx = to_mctx(cq->ibv_cq.context);
+	struct mlx5_qp *mqp = NULL;
+	struct mlx5_cqe64 *cqe64 = (cq->cqe_sz == 64) ? cqe : cqe + 64;
+	uint32_t byte_len = ntohl(cqe64->byte_cnt);
+	struct mlx5_wq *wq;
+	uint16_t wqe_ctr;
+	int err;
+	int idx;
+
+	mqp = get_req_context(mctx, cur_rsc,
+			      (cqe_ver ? (ntohl(cqe64->srqn_uidx) & 0xffffff) : qpn),
+			      cqe_ver);
+	if (unlikely(!mqp))
+		return CQ_POLL_ERR;
+	wq = &mqp->sq;
+	wqe_ctr = ntohs(cqe64->wqe_counter);
+	idx = wqe_ctr & (wq->wqe_cnt - 1);
+	if (cqe64->op_own & MLX5_INLINE_SCATTER_32)
+		err = mlx5_copy_to_send_wqe(mqp, wqe_ctr, cqe,
+					    byte_len);
+	else if (cqe64->op_own & MLX5_INLINE_SCATTER_64)
+		err = mlx5_copy_to_send_wqe(mqp, wqe_ctr, cqe - 1,
+					    byte_len);
+	else
+		err = 0;
+
+	wq->tail = wq->wqe_head[idx] + 1;
+	*wr_id = wq->wrid[idx];
+
+	return err;
+}
+
+inline int mlx5_poll_one_cqe_resp(struct mlx5_context *mctx,
+				  struct mlx5_resource **cur_rsc,
+				  struct mlx5_srq **cur_srq,
+				  struct mlx5_cqe64 *cqe64, int cqe_ver,
+				  uint32_t qpn, int *is_srq)
+	__attribute__((always_inline));
+inline int mlx5_poll_one_cqe_resp(struct mlx5_context *mctx,
+				  struct mlx5_resource **cur_rsc,
+				  struct mlx5_srq **cur_srq,
+				  struct mlx5_cqe64 *cqe64, int cqe_ver,
+				  uint32_t qpn, int *is_srq)
+{
+	uint32_t srqn_uidx = ntohl(cqe64->srqn_uidx) & 0xffffff;
+	int err;
+
+	if (cqe_ver) {
+		err = get_resp_cxt_v1(mctx, cur_rsc, cur_srq, srqn_uidx, is_srq);
+	} else {
+		if (srqn_uidx) {
+			err = get_srq_ctx(mctx, cur_srq, srqn_uidx);
+			*is_srq = 1;
+		} else {
+			err = get_resp_ctx(mctx, cur_rsc, qpn);
+		}
+	}
+
+	return err;
+}
+
+inline int mlx5_poll_one_cqe_err(struct mlx5_context *mctx,
+				 struct mlx5_resource **cur_rsc,
+				 struct mlx5_srq **cur_srq,
+				 struct mlx5_cqe64 *cqe64, int cqe_ver,
+				 uint32_t qpn, uint32_t *pwc_status,
+				 uint32_t *pwc_vendor_err,
+				 uint64_t *pwc_wr_id, uint8_t opcode)
+	__attribute__((always_inline));
+inline int mlx5_poll_one_cqe_err(struct mlx5_context *mctx,
+				 struct mlx5_resource **cur_rsc,
+				 struct mlx5_srq **cur_srq,
+				 struct mlx5_cqe64 *cqe64, int cqe_ver,
+				 uint32_t qpn, uint32_t *pwc_status,
+				 uint32_t *pwc_vendor_err,
+				 uint64_t *pwc_wr_id, uint8_t opcode)
+{
+	uint32_t srqn_uidx = ntohl(cqe64->srqn_uidx) & 0xffffff;
+	struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cqe64;
+	int err = CQ_OK;
+
+	mlx5_handle_error_cqe(ecqe, pwc_status, pwc_vendor_err);
+	if (unlikely(ecqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR &&
+		     ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR)) {
+		FILE *fp = mctx->dbg_fp;
+
+		fprintf(fp, PFX "%s: got completion with error:\n",
+			mctx->hostname);
+		dump_cqe(fp, ecqe);
+		if (mlx5_freeze_on_error_cqe) {
+			fprintf(fp, PFX "freezing at poll cq...");
+			while (1)
+				sleep(10);
+		}
+	}
+
+	if (opcode == MLX5_CQE_REQ_ERR) {
+		struct mlx5_qp *mqp = NULL;
+		struct mlx5_wq *wq;
+		uint16_t wqe_ctr;
+		int idx;
+
+		mqp = get_req_context(mctx, cur_rsc, (cqe_ver ? srqn_uidx : qpn), cqe_ver);
+		if (unlikely(!mqp))
+			return CQ_POLL_ERR;
+		wq = &mqp->sq;
+		wqe_ctr = ntohs(cqe64->wqe_counter);
+		idx = wqe_ctr & (wq->wqe_cnt - 1);
+		*pwc_wr_id = wq->wrid[idx];
+		wq->tail = wq->wqe_head[idx] + 1;
+	} else {
+		int is_srq = 0;
+
+		if (cqe_ver) {
+			err = get_resp_cxt_v1(mctx, cur_rsc, cur_srq, srqn_uidx, &is_srq);
+		} else {
+			if (srqn_uidx) {
+				err = get_srq_ctx(mctx, cur_srq, srqn_uidx);
+				is_srq = 1;
+			} else {
+				err = get_resp_ctx(mctx, cur_rsc, qpn);
+			}
+		}
+		if (unlikely(err))
+			return CQ_POLL_ERR;
+
+		if (is_srq) {
+			uint16_t wqe_ctr = ntohs(cqe64->wqe_counter);
+
+			*pwc_wr_id = (*cur_srq)->wrid[wqe_ctr];
+			mlx5_free_srq_wqe(*cur_srq, wqe_ctr);
+		} else {
+			struct mlx5_qp *mqp = rsc_to_mqp(*cur_rsc);
+			struct mlx5_wq *wq;
+
+			wq = &mqp->rq;
+			*pwc_wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+			++wq->tail;
+		}
+	}
+
+	return err;
+}
+
 static inline int mlx5_poll_one(struct mlx5_cq *cq,
 				struct mlx5_resource **cur_rsc,
 				struct mlx5_srq **cur_srq,
@@ -464,17 +806,10 @@ static inline int mlx5_poll_one(struct mlx5_cq *cq,
 				struct ibv_wc *wc, int cqe_ver)
 {
 	struct mlx5_cqe64 *cqe64;
-	struct mlx5_wq *wq;
-	uint16_t wqe_ctr;
 	void *cqe;
 	uint32_t qpn;
-	uint32_t srqn_uidx;
-	int idx;
 	uint8_t opcode;
-	struct mlx5_err_cqe *ecqe;
 	int err;
-	int is_srq = 0;
-	struct mlx5_qp *mqp = NULL;
 	struct mlx5_context *mctx = to_mctx(cq->ibv_cq.context);
 
 	cqe = next_cqe_sw(cq);
@@ -494,137 +829,165 @@ static inline int mlx5_poll_one(struct mlx5_cq *cq,
 	 */
 	rmb();
 
-#ifdef MLX5_DEBUG
-	if (mlx5_debug_mask & MLX5_DBG_CQ_CQE) {
-		FILE *fp = mctx->dbg_fp;
-
-		mlx5_dbg(fp, MLX5_DBG_CQ_CQE, "dump cqe for cqn 0x%x:\n", cq->cqn);
-		dump_cqe(fp, cqe64);
-	}
-#endif
+	dump_cqe_debug(mctx->dbg_fp, cq, cqe64);
 
 	qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;
 	wc->wc_flags = 0;
 	switch (opcode) {
 	case MLX5_CQE_REQ:
-		mqp = get_req_context(mctx, cur_rsc,
-				      (cqe_ver ? (ntohl(cqe64->srqn_uidx) & 0xffffff) : qpn),
-				      cqe_ver);
-		if (unlikely(!mqp))
-			return CQ_POLL_ERR;
-		wq = &mqp->sq;
-		wqe_ctr = ntohs(cqe64->wqe_counter);
-		idx = wqe_ctr & (wq->wqe_cnt - 1);
+		err = mlx5_poll_one_cqe_req(cq, cur_rsc, cqe, qpn, cqe_ver,
+					    &wc->wr_id);
 		handle_good_req(wc, cqe64);
-		if (cqe64->op_own & MLX5_INLINE_SCATTER_32)
-			err = mlx5_copy_to_send_wqe(mqp, wqe_ctr, cqe,
-						    wc->byte_len);
-		else if (cqe64->op_own & MLX5_INLINE_SCATTER_64)
-			err = mlx5_copy_to_send_wqe(mqp, wqe_ctr, cqe - 1,
-						    wc->byte_len);
-		else
-			err = 0;
-
-		wc->wr_id = wq->wrid[idx];
-		wq->tail = wq->wqe_head[idx] + 1;
 		wc->status = err;
 		break;
+
 	case MLX5_CQE_RESP_WR_IMM:
 	case MLX5_CQE_RESP_SEND:
 	case MLX5_CQE_RESP_SEND_IMM:
-	case MLX5_CQE_RESP_SEND_INV:
-		srqn_uidx = ntohl(cqe64->srqn_uidx) & 0xffffff;
-		if (cqe_ver) {
-			err = get_resp_cxt_v1(mctx, cur_rsc, cur_srq, srqn_uidx, &is_srq);
-		} else {
-			if (srqn_uidx) {
-				err = get_srq_ctx(mctx, cur_srq, srqn_uidx);
-				is_srq = 1;
-			} else {
-				err = get_resp_ctx(mctx, cur_rsc, qpn);
-			}
-		}
+	case MLX5_CQE_RESP_SEND_INV: {
+		int is_srq;
+
+		err = mlx5_poll_one_cqe_resp(mctx, cur_rsc, cur_srq, cqe64,
+					     cqe_ver, qpn, &is_srq);
 		if (unlikely(err))
 			return err;
 
 		wc->status = handle_responder(wc, cqe64, rsc_to_mqp(*cur_rsc),
 					      is_srq ? *cur_srq : NULL);
 		break;
+	}
 	case MLX5_CQE_RESIZE_CQ:
 		break;
 	case MLX5_CQE_REQ_ERR:
 	case MLX5_CQE_RESP_ERR:
-		srqn_uidx = ntohl(cqe64->srqn_uidx) & 0xffffff;
-		ecqe = (struct mlx5_err_cqe *)cqe64;
-		mlx5_handle_error_cqe(ecqe, wc);
-		if (unlikely(ecqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR &&
-			     ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR)) {
-			FILE *fp = mctx->dbg_fp;
-			fprintf(fp, PFX "%s: got completion with error:\n",
-				mctx->hostname);
-			dump_cqe(fp, ecqe);
-			if (mlx5_freeze_on_error_cqe) {
-				fprintf(fp, PFX "freezing at poll cq...");
-				while (1)
-					sleep(10);
-			}
-		}
+		err = mlx5_poll_one_cqe_err(mctx, cur_rsc, cur_srq, cqe64,
+					    cqe_ver, qpn, &wc->status,
+					    &wc->vendor_err, &wc->wr_id,
+					    opcode);
+		if (err != CQ_OK)
+			return err;
+		break;
+	}
 
-		if (opcode == MLX5_CQE_REQ_ERR) {
-			mqp = get_req_context(mctx, cur_rsc, (cqe_ver ? srqn_uidx : qpn), cqe_ver);
-			if (unlikely(!mqp))
-				return CQ_POLL_ERR;
-			wq = &mqp->sq;
-			wqe_ctr = ntohs(cqe64->wqe_counter);
-			idx = wqe_ctr & (wq->wqe_cnt - 1);
-			wc->wr_id = wq->wrid[idx];
-			wq->tail = wq->wqe_head[idx] + 1;
-		} else {
-			if (cqe_ver) {
-				err = get_resp_cxt_v1(mctx, cur_rsc, cur_srq, srqn_uidx, &is_srq);
-			} else {
-				if (srqn_uidx) {
-					err = get_srq_ctx(mctx, cur_srq, srqn_uidx);
-					is_srq = 1;
-				} else {
-					err = get_resp_ctx(mctx, cur_rsc, qpn);
-				}
-			}
-			if (unlikely(err))
-				return CQ_POLL_ERR;
+	wc->qp_num = qpn;
+	return CQ_OK;
+}
 
-			if (is_srq) {
-				wqe_ctr = ntohs(cqe64->wqe_counter);
-				wc->wr_id = (*cur_srq)->wrid[wqe_ctr];
-				mlx5_free_srq_wqe(*cur_srq, wqe_ctr);
-			} else {
-				mqp = rsc_to_mqp(*cur_rsc);
-				wq = &mqp->rq;
-				wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
-				++wq->tail;
-			}
-		}
+inline int mlx5_poll_one_ex(struct mlx5_cq *cq,
+			    struct mlx5_resource **cur_rsc,
+			    struct mlx5_srq **cur_srq,
+			    struct ibv_wc_ex **pwc_ex, uint64_t wc_flags,
+			    int cqe_ver)
+{
+	struct mlx5_cqe64 *cqe64;
+	void *cqe;
+	uint32_t qpn;
+	uint8_t opcode;
+	int err;
+	struct mlx5_context *mctx = to_mctx(cq->ibv_cq.context);
+	struct ibv_wc_ex *wc_ex = *pwc_ex;
+	union wc_buffer wc_buffer;
+
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return CQ_EMPTY;
+
+	cqe64 = (cq->cqe_sz == 64) ? cqe : cqe + 64;
+
+	opcode = cqe64->op_own >> 4;
+	++cq->cons_index;
+
+	VALGRIND_MAKE_MEM_DEFINED(cqe64, sizeof *cqe64);
+
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	dump_cqe_debug(mctx->dbg_fp, cq, cqe64);
+
+	qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;
+	wc_buffer.b64 = (uint64_t *)&wc_ex->buffer;
+	wc_ex->wc_flags = 0;
+	wc_ex->reserved = 0;
+
+	switch (opcode) {
+	case MLX5_CQE_REQ:
+		err = mlx5_poll_one_cqe_req(cq, cur_rsc, cqe, qpn, cqe_ver,
+					    &wc_ex->wr_id);
+		handle_good_req_ex(wc_ex, &wc_buffer, cqe64, wc_flags, qpn);
+		wc_ex->status = err;
+		if (wc_flags & IBV_WC_EX_WITH_SRC_QP)
+			wc_buffer.b32++;
+		if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX)
+			wc_buffer.b16++;
+		if (wc_flags & IBV_WC_EX_WITH_SLID)
+			wc_buffer.b16++;
+		if (wc_flags & IBV_WC_EX_WITH_SL)
+			wc_buffer.b8++;
+		if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
+			wc_buffer.b8++;
+		break;
+
+	case MLX5_CQE_RESP_WR_IMM:
+	case MLX5_CQE_RESP_SEND:
+	case MLX5_CQE_RESP_SEND_IMM:
+	case MLX5_CQE_RESP_SEND_INV: {
+		int is_srq;
+
+		err = mlx5_poll_one_cqe_resp(mctx, cur_rsc, cur_srq, cqe64,
+					     cqe_ver, qpn, &is_srq);
+		if (unlikely(err))
+			return err;
+
+		wc_ex->status = handle_responder_ex(wc_ex, &wc_buffer, cqe64,
+						    rsc_to_mqp(*cur_rsc),
+						    is_srq ? *cur_srq : NULL,
+						    wc_flags, qpn);
 		break;
 	}
+	case MLX5_CQE_REQ_ERR:
+	case MLX5_CQE_RESP_ERR:
+		err = mlx5_poll_one_cqe_err(mctx, cur_rsc, cur_srq, cqe64,
+					    cqe_ver, qpn, &wc_ex->status,
+					    &wc_ex->vendor_err, &wc_ex->wr_id,
+					    opcode);
+		if (err != CQ_OK)
+			return err;
 
-	wc->qp_num = qpn;
+	case MLX5_CQE_RESIZE_CQ:
+		if (wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+			wc_buffer.b32++;
+		if (wc_flags & IBV_WC_EX_WITH_IMM)
+			wc_buffer.b32++;
+		if (wc_flags & IBV_WC_EX_WITH_QP_NUM) {
+			*wc_buffer.b32++ = qpn;
+			wc_ex->wc_flags |= IBV_WC_EX_WITH_QP_NUM;
+		}
+		if (wc_flags & IBV_WC_EX_WITH_SRC_QP)
+			wc_buffer.b32++;
+		if (wc_flags & IBV_WC_EX_WITH_PKEY_INDEX)
+			wc_buffer.b16++;
+		if (wc_flags & IBV_WC_EX_WITH_SLID)
+			wc_buffer.b16++;
+		if (wc_flags & IBV_WC_EX_WITH_SL)
+			wc_buffer.b8++;
+		if (wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
+			wc_buffer.b8++;
+		break;
+	}
 
+	*pwc_ex = (struct ibv_wc_ex *)((uintptr_t)(wc_buffer.b8 + sizeof(uint64_t) - 1) &
+				       ~(sizeof(uint64_t) - 1));
 	return CQ_OK;
 }
 
-static inline int poll_cq(struct ibv_cq *ibcq, int ne,
-			  struct ibv_wc *wc, int cqe_ver)
-	__attribute__((always_inline));
-static inline int poll_cq(struct ibv_cq *ibcq, int ne,
-			  struct ibv_wc *wc, int cqe_ver)
+static inline void mlx5_poll_cq_stall_start(struct mlx5_cq *cq)
+__attribute__((always_inline));
+static inline void mlx5_poll_cq_stall_start(struct mlx5_cq *cq)
 {
-	struct mlx5_cq *cq = to_mcq(ibcq);
-	struct mlx5_resource *rsc = NULL;
-	struct mlx5_srq *srq = NULL;
-	int npolled;
-	int err = CQ_OK;
-
 	if (cq->stall_enable) {
 		if (cq->stall_adaptive_enable) {
 			if (cq->stall_last_count)
@@ -634,19 +997,13 @@ static inline int poll_cq(struct ibv_cq *ibcq, int ne,
 			mlx5_stall_poll_cq();
 		}
 	}
+}
 
-	mlx5_spin_lock(&cq->lock);
-
-	for (npolled = 0; npolled < ne; ++npolled) {
-		err = mlx5_poll_one(cq, &rsc, &srq, wc + npolled, cqe_ver);
-		if (err != CQ_OK)
-			break;
-	}
-
-	update_cons_index(cq);
-
-	mlx5_spin_unlock(&cq->lock);
-
+static inline void mlx5_poll_cq_stall_end(struct mlx5_cq *cq, int ne,
+					  int npolled, int err) __attribute__((always_inline));
+static inline void mlx5_poll_cq_stall_end(struct mlx5_cq *cq, int ne,
+					  int npolled, int err)
+{
 	if (cq->stall_enable) {
 		if (cq->stall_adaptive_enable) {
 			if (npolled == 0) {
@@ -666,6 +1023,34 @@ static inline int poll_cq(struct ibv_cq *ibcq, int ne,
 			cq->stall_next_poll = 1;
 		}
 	}
+}
+
+static inline int poll_cq(struct ibv_cq *ibcq, int ne,
+			  struct ibv_wc *wc, int cqe_ver)
+	__attribute__((always_inline));
+static inline int poll_cq(struct ibv_cq *ibcq, int ne,
+			  struct ibv_wc *wc, int cqe_ver)
+{
+	struct mlx5_cq *cq = to_mcq(ibcq);
+	struct mlx5_resource *rsc = NULL;
+	struct mlx5_srq *srq = NULL;
+	int npolled;
+	int err = CQ_OK;
+
+	mlx5_poll_cq_stall_start(cq);
+	mlx5_spin_lock(&cq->lock);
+
+	for (npolled = 0; npolled < ne; ++npolled) {
+		err = mlx5_poll_one(cq, &rsc, &srq, wc + npolled, cqe_ver);
+		if (err != CQ_OK)
+			break;
+	}
+
+	update_cons_index(cq);
+
+	mlx5_spin_unlock(&cq->lock);
+
+	mlx5_poll_cq_stall_end(cq, ne, npolled, err);
 
 	return err == CQ_POLL_ERR ? err : npolled;
 }
 
@@ -680,6 +1065,52 @@ int mlx5_poll_cq_v1(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
 	return poll_cq(ibcq, ne, wc, 1);
 }
 
+static inline int poll_cq_ex(struct ibv_cq *ibcq, struct ibv_wc_ex *wc,
+			     struct ibv_poll_cq_ex_attr *attr, int cqe_ver)
+{
+	struct mlx5_cq *cq = to_mcq(ibcq);
+	struct mlx5_resource *rsc = NULL;
+	struct mlx5_srq *srq = NULL;
+	int npolled;
+	int err = CQ_OK;
+	int (*poll_fn)(struct mlx5_cq *cq, struct mlx5_resource **rsc,
+		       struct mlx5_srq **cur_srq,
+		       struct ibv_wc_ex **pwc_ex, uint64_t wc_flags,
+		       int cqe_ver) =
+		cq->poll_one;
+	uint64_t wc_flags = cq->wc_flags;
+	unsigned int ne = attr->max_entries;
+
+	mlx5_poll_cq_stall_start(cq);
+	mlx5_spin_lock(&cq->lock);
+
+	for (npolled = 0; npolled < ne; ++npolled) {
+		err = poll_fn(cq, &rsc, &srq, &wc, wc_flags, cqe_ver);
+		if (err != CQ_OK)
+			break;
+	}
+
+	update_cons_index(cq);
+
+	mlx5_spin_unlock(&cq->lock);
+
+	mlx5_poll_cq_stall_end(cq, ne, npolled, err);
+
+	return err == CQ_POLL_ERR ? err : npolled;
+}
+
+int mlx5_poll_cq_ex(struct ibv_cq *ibcq, struct ibv_wc_ex *wc,
+		    struct ibv_poll_cq_ex_attr *attr)
+{
+	return poll_cq_ex(ibcq, wc, attr, 0);
+}
+
+int mlx5_poll_cq_v1_ex(struct ibv_cq *ibcq, struct ibv_wc_ex *wc,
+		       struct ibv_poll_cq_ex_attr *attr)
+{
+	return poll_cq_ex(ibcq, wc, attr, 1);
+}
+
 int mlx5_arm_cq(struct ibv_cq *ibvcq, int solicited)
 {
 	struct mlx5_cq *cq = to_mcq(ibvcq);
diff --git a/src/mlx5.c b/src/mlx5.c
index 5e9b61c..eac332b 100644
--- a/src/mlx5.c
+++ b/src/mlx5.c
@@ -664,6 +664,11 @@ static int mlx5_init_context(struct verbs_device *vdev,
 	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
 	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
 	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
+	if (context->cqe_version == 1)
+		verbs_set_ctx_op(v_ctx, poll_cq_ex, mlx5_poll_cq_v1_ex);
+	else
+		verbs_set_ctx_op(v_ctx, poll_cq_ex, mlx5_poll_cq_ex);
+
 	return 0;
diff --git a/src/mlx5.h b/src/mlx5.h
index 325e07b..e27e79c 100644
--- a/src/mlx5.h
+++ b/src/mlx5.h
@@ -349,6 +349,11 @@ enum {
 
 struct mlx5_cq {
 	struct ibv_cq			ibv_cq;
+	uint64_t			wc_flags;
+	int (*poll_one)(struct mlx5_cq *cq, struct mlx5_resource **cur_rsc,
+			struct mlx5_srq **cur_srq,
+			struct ibv_wc_ex **pwc_ex, uint64_t wc_flags,
+			int cqe_ver);
 	struct mlx5_buf			buf_a;
 	struct mlx5_buf			buf_b;
 	struct mlx5_buf		       *active_buf;
@@ -603,6 +608,15 @@ int mlx5_dereg_mr(struct ibv_mr *mr);
 struct ibv_cq *mlx5_create_cq(struct ibv_context *context, int cqe,
 			      struct ibv_comp_channel *channel,
 			      int comp_vector);
+int mlx5_poll_cq_ex(struct ibv_cq *ibcq, struct ibv_wc_ex *wc,
+		    struct ibv_poll_cq_ex_attr *attr);
+int mlx5_poll_cq_v1_ex(struct ibv_cq *ibcq, struct ibv_wc_ex *wc,
+		       struct ibv_poll_cq_ex_attr *attr);
+int mlx5_poll_one_ex(struct mlx5_cq *cq,
+		     struct mlx5_resource **cur_rsc,
+		     struct mlx5_srq **cur_srq,
+		     struct ibv_wc_ex **pwc_ex, uint64_t wc_flags,
+		     int cqe_ver);
 int mlx5_alloc_cq_buf(struct mlx5_context *mctx, struct mlx5_cq *cq,
 		      struct mlx5_buf *buf, int nent, int cqe_sz);
 int mlx5_free_cq_buf(struct mlx5_context *ctx, struct mlx5_buf *buf);