@@ -49,7 +49,8 @@
enum {
CQ_OK = 0,
CQ_EMPTY = -1,
- CQ_POLL_ERR = -2
+ CQ_POLL_ERR = -2,
+ CQ_POLL_NODATA = ENOENT
};
enum {
@@ -659,6 +660,12 @@ static int handle_tag_matching(struct mlx5_cq *cq,
return CQ_OK;
}
+static inline int is_odp_pfault_err(struct mlx5_err_cqe *ecqe)
+{
+ return ecqe->syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR &&
+ ecqe->vendor_err_synd == MLX5_CQE_VENDOR_SYNDROME_ODP_PFAULT;
+}
+
static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
struct mlx5_cqe64 *cqe64,
void *cqe,
@@ -682,10 +689,14 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
int idx;
uint8_t opcode;
struct mlx5_err_cqe *ecqe;
- int err = 0;
+ int err;
struct mlx5_qp *mqp;
struct mlx5_context *mctx;
- uint8_t is_srq = 0;
+ uint8_t is_srq;
+
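+ /* Parsing restarts from here when a CQE is consumed internally
+ * (e.g. an ODP page-fault completion on an SRQ WQE) and the next
+ * CQE should be handled in its place.
+ */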
+again:
+ is_srq = 0;
+ err = 0;
mctx = to_mctx(ibv_cq_ex_to_cq(&cq->ibv_cq)->context);
qpn = be32toh(cqe64->sop_drop_qpn) & 0xffffff;
@@ -811,7 +822,8 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
wc->vendor_err = ecqe->vendor_err_synd;
if (unlikely(ecqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR &&
- ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR)) {
+ ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR &&
+ !is_odp_pfault_err(ecqe))) {
FILE *fp = mctx->dbg_fp;
fprintf(fp, PFX "%s: got completion with error:\n",
mctx->hostname);
@@ -844,6 +856,17 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
if (is_srq) {
wqe_ctr = be16toh(cqe64->wqe_counter);
+ if (is_odp_pfault_err(ecqe)) {
+ mlx5_complete_odp_fault(*cur_srq, wqe_ctr);
+ err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
+ /* CQ_POLL_NODATA indicates that the CQ was not empty, but the polled CQE
+ * was handled internally and should not be processed by the caller.
+ */
+ if (err == CQ_EMPTY)
+ return CQ_POLL_NODATA;
+ goto again;
+ }
+
if (lazy)
cq->ibv_cq.wr_id = (*cur_srq)->wrid[wqe_ctr];
else
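The CQ_POLL_NODATA return above leans on its definition as ENOENT in the first hunk: presumably the extended-poll entry points can then hand it back to the application unchanged as the usual "no completion available" status, while the internal negative codes stay distinct. A minimal standalone sketch of that value relationship (illustrative only, not part of the patch):

#include <errno.h>
#include <assert.h>

enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
	CQ_POLL_NODATA = ENOENT
};

int main(void)
{
	/* CQ_POLL_NODATA aliases ENOENT, the value ibv_start_poll() already
	 * uses to report an empty CQ, so it can propagate to the caller
	 * as-is while remaining distinguishable from CQ_EMPTY/CQ_POLL_ERR. */
	assert(CQ_POLL_NODATA == ENOENT);
	assert(CQ_POLL_NODATA != CQ_EMPTY && CQ_POLL_NODATA != CQ_POLL_ERR);
	return 0;
}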
@@ -1060,7 +1083,7 @@ static inline int mlx5_start_poll(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_att
if (lock && err)
mlx5_spin_unlock(&cq->lock);
- if (stall && err) {
+ if (stall && err == CQ_POLL_ERR) {
if (stall == POLLING_MODE_STALL_ADAPTIVE) {
cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step,
mlx5_stall_cq_poll_min);
@@ -811,6 +811,7 @@ int mlx5_query_srq(struct ibv_srq *srq,
int mlx5_destroy_srq(struct ibv_srq *srq);
int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq,
uint32_t nwr);
+void mlx5_complete_odp_fault(struct mlx5_srq *srq, int ind);
void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind);
int mlx5_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
@@ -1030,4 +1031,6 @@ static inline bool srq_has_waitq(struct mlx5_srq *srq)
return srq->waitq_head >= 0;
}
+bool srq_cooldown_wqe(struct mlx5_srq *srq, int ind);
+
#endif /* MLX5_H */
@@ -512,6 +512,10 @@ enum {
};
enum {
+ MLX5_CQE_VENDOR_SYNDROME_ODP_PFAULT = 0x93,
+};
+
+enum {
MLX5_CQE_L2_OK = 1 << 0,
MLX5_CQE_L3_OK = 1 << 1,
MLX5_CQE_L4_OK = 1 << 2,
@@ -82,6 +82,95 @@ void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind)
mlx5_spin_unlock(&srq->lock);
}
+/* Take a WQE index and put it at the end of the wait queue */
+static void srq_put_in_waitq(struct mlx5_srq *srq, int ind)
+{
+ struct mlx5_wqe_srq_next_seg *waitq_tail;
+
+ waitq_tail = get_wqe(srq, srq->waitq_tail);
+ waitq_tail->next_wqe_index = htobe16(ind);
+ srq->waitq_tail = ind;
+}
+
+/* Take the first WQE from the wait queue and put it at the SRQ tail */
+static void srq_get_from_waitq(struct mlx5_srq *srq)
+{
+ struct mlx5_wqe_srq_next_seg *tail;
+ struct mlx5_wqe_srq_next_seg *waitq_head;
+
+ tail = get_wqe(srq, srq->tail);
+ waitq_head = get_wqe(srq, srq->waitq_head);
+
+ tail->next_wqe_index = htobe16(srq->waitq_head);
+ srq->tail = srq->waitq_head;
+ srq->waitq_head = be16toh(waitq_head->next_wqe_index);
+}
+
+/* Put the given WQE, which is in SW ownership, at the end of the wait queue.
+ * Take a WQE from the wait queue and move it into SW ownership instead.
+ */
+bool srq_cooldown_wqe(struct mlx5_srq *srq, int ind)
+{
+ if (!srq_has_waitq(srq))
+ return false;
+
+ srq_put_in_waitq(srq, ind);
+ srq_get_from_waitq(srq);
+ return true;
+}
+
+/* Post a WQE internally, based on a previous application post.
+ * Copy a given WQE's data segments to the SRQ head, advance the head
+ * and ring the HW doorbell.
+ */
+static void srq_repost(struct mlx5_srq *srq, int ind)
+{
+ struct mlx5_wqe_srq_next_seg *src, *dst;
+ struct mlx5_wqe_data_seg *src_scat, *dst_scat;
+ int i;
+
+ srq->wrid[srq->head] = srq->wrid[ind];
+
+ src = get_wqe(srq, ind);
+ dst = get_wqe(srq, srq->head);
+ src_scat = (struct mlx5_wqe_data_seg *)(src + 1);
+ dst_scat = (struct mlx5_wqe_data_seg *)(dst + 1);
+
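+ /* Copy the scatter list, stopping once the end-of-list sentinel
+ * entry (lkey == MLX5_INVALID_LKEY) has been copied as well.
+ */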
+ for (i = 0; i < srq->max_gs; ++i) {
+ dst_scat[i] = src_scat[i];
+
+ if (dst_scat[i].lkey == htobe32(MLX5_INVALID_LKEY))
+ break;
+ }
+
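+ /* Advance the SRQ head past the reused slot and publish the new
+ * producer counter through the doorbell record.
+ */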
+ srq->head = be16toh(dst->next_wqe_index);
+ srq->counter++;
+ /* Flush descriptors */
+ udma_to_device_barrier();
+ *srq->db = htobe32(srq->counter);
+}
+
+void mlx5_complete_odp_fault(struct mlx5_srq *srq, int ind)
+{
+ mlx5_spin_lock(&srq->lock);
+
+ if (!srq_cooldown_wqe(srq, ind)) {
+ struct mlx5_wqe_srq_next_seg *tail = get_wqe(srq, srq->tail);
+
+ /* Without a wait queue, put the page-faulted WQE
+ * back at the SRQ tail. The repost is still possible, but
+ * the risk of overwriting the page-faulted WQE with a future
+ * post_srq_recv() is now higher.
+ */
+ tail->next_wqe_index = htobe16(ind);
+ srq->tail = ind;
+ }
+
+ srq_repost(srq, ind);
+
+ mlx5_spin_unlock(&srq->lock);
+}
+
int mlx5_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr)
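From the application's point of view this patch is transparent: an ODP page fault on an SRQ receive WQE no longer surfaces as an error completion, the faulted WQE is reposted internally, and an ordinary poll loop keeps working unmodified. A hypothetical caller-side illustration (standard libibverbs API; drain_cq and budget are names invented for this sketch):

#include <stdio.h>
#include <infiniband/verbs.h>

/* Drain up to 'budget' completions from a CQ. With this patch, ODP
 * page-fault CQEs for SRQ WQEs are consumed inside the provider and
 * therefore never appear in 'wc' here. */
static int drain_cq(struct ibv_cq *cq, int budget)
{
	struct ibv_wc wc;
	int n = 0, total = 0;

	while (total < budget && (n = ibv_poll_cq(cq, 1, &wc)) > 0) {
		if (wc.status != IBV_WC_SUCCESS)
			fprintf(stderr, "completion error %d (vendor 0x%x)\n",
				wc.status, wc.vendor_err);
		total += n;
	}
	return n < 0 ? n : total;
}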