@@ -49,7 +49,8 @@
enum {
CQ_OK = 0,
CQ_EMPTY = -1,
- CQ_POLL_ERR = -2
+ CQ_POLL_ERR = -2,
+ CQ_POLL_NODATA = ENOENT
};
enum {
@@ -659,6 +660,12 @@ static int handle_tag_matching(struct mlx5_cq *cq,
return CQ_OK;
}
+static inline int is_odp_pfault_err(struct mlx5_err_cqe *ecqe)
+{
+ return ecqe->syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR &&
+ ecqe->vendor_err_synd == MLX5_CQE_VENDOR_SYNDROME_ODP_PFAULT;
+}
+
static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
struct mlx5_cqe64 *cqe64,
void *cqe,
@@ -682,10 +689,14 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
int idx;
uint8_t opcode;
struct mlx5_err_cqe *ecqe;
- int err = 0;
+ int err;
struct mlx5_qp *mqp;
struct mlx5_context *mctx;
- uint8_t is_srq = 0;
+ uint8_t is_srq;
+
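+ /* Parsing restarts from here when a CQE is consumed internally
+ * (e.g. an ODP page-fault completion on an SRQ WQE) and the next
+ * CQE should be handled in its place.
+ */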
+again:
+ is_srq = 0;
+ err = 0;
mctx = to_mctx(ibv_cq_ex_to_cq(&cq->ibv_cq)->context);
qpn = be32toh(cqe64->sop_drop_qpn) & 0xffffff;
@@ -811,7 +822,8 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
wc->vendor_err = ecqe->vendor_err_synd;
if (unlikely(ecqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR &&
- ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR)) {
+ ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR &&
+ !is_odp_pfault_err(ecqe))) {
FILE *fp = mctx->dbg_fp;
fprintf(fp, PFX "%s: got completion with error:\n",
mctx->hostname);
@@ -844,6 +856,17 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
if (is_srq) {
wqe_ctr = be16toh(cqe64->wqe_counter);
+ if (is_odp_pfault_err(ecqe)) {
+ mlx5_complete_odp_fault(*cur_srq, wqe_ctr);
+ err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
+ /* CQ_POLL_NODATA indicates that the CQ was not empty, but the polled CQE
+ * was handled internally and should not be processed by the caller.
+ */
+ if (err == CQ_EMPTY)
+ return CQ_POLL_NODATA;
+ goto again;
+ }
+
if (lazy)
cq->ibv_cq.wr_id = (*cur_srq)->wrid[wqe_ctr];
else
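The CQ_POLL_NODATA return above leans on its definition as ENOENT in the first hunk: presumably the extended-poll entry points can then hand it back to the application unchanged as the usual "no completion available" status, while the internal negative codes stay distinct. A minimal standalone sketch of that value relationship (illustrative only, not part of the patch):

#include <errno.h>
#include <assert.h>

enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
	CQ_POLL_NODATA = ENOENT
};

int main(void)
{
	/* CQ_POLL_NODATA aliases ENOENT, the value ibv_start_poll() already
	 * uses to report an empty CQ, so it can propagate to the caller
	 * as-is while remaining distinguishable from CQ_EMPTY/CQ_POLL_ERR. */
	assert(CQ_POLL_NODATA == ENOENT);
	assert(CQ_POLL_NODATA != CQ_EMPTY && CQ_POLL_NODATA != CQ_POLL_ERR);
	return 0;
}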
@@ -1060,7 +1083,7 @@ static inline int mlx5_start_poll(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_att
if (lock && err)
mlx5_spin_unlock(&cq->lock);
- if (stall && err) {
+ if (stall && err == CQ_POLL_ERR) {
if (stall == POLLING_MODE_STALL_ADAPTIVE) {
cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step,
mlx5_stall_cq_poll_min);
@@ -811,6 +811,7 @@ int mlx5_query_srq(struct ibv_srq *srq,
int mlx5_destroy_srq(struct ibv_srq *srq);
int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq,
uint32_t nwr);
+void mlx5_complete_odp_fault(struct mlx5_srq *srq, int ind);
void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind);
int mlx5_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
@@ -1030,4 +1031,6 @@ static inline bool srq_has_waitq(struct mlx5_srq *srq)
return srq->waitq_head >= 0;
}
+bool srq_cooldown_wqe(struct mlx5_srq *srq, int ind);
+
#endif /* MLX5_H */
@@ -512,6 +512,10 @@ enum {
};
enum {
+ MLX5_CQE_VENDOR_SYNDROME_ODP_PFAULT = 0x93,
+};
+
+enum {
MLX5_CQE_L2_OK = 1 << 0,
MLX5_CQE_L3_OK = 1 << 1,
MLX5_CQE_L4_OK = 1 << 2,
@@ -82,6 +82,95 @@ void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind)
mlx5_spin_unlock(&srq->lock);
}
+/* Take a WQE index and put it at the end of the wait queue */
+static void srq_put_in_waitq(struct mlx5_srq *srq, int ind)
+{
+ struct mlx5_wqe_srq_next_seg *waitq_tail;
+
+ waitq_tail = get_wqe(srq, srq->waitq_tail);
+ waitq_tail->next_wqe_index = htobe16(ind);
+ srq->waitq_tail = ind;
+}
+
+/* Take the first WQE from the wait queue and put it at the SRQ tail */
+static void srq_get_from_waitq(struct mlx5_srq *srq)
+{
+ struct mlx5_wqe_srq_next_seg *tail;
+ struct mlx5_wqe_srq_next_seg *waitq_head;
+
+ tail = get_wqe(srq, srq->tail);
+ waitq_head = get_wqe(srq, srq->waitq_head);
+
+ tail->next_wqe_index = htobe16(srq->waitq_head);
+ srq->tail = srq->waitq_head;
+ srq->waitq_head = be16toh(waitq_head->next_wqe_index);
+}
+
+/* Put the given WQE, which is in SW ownership, at the end of the wait queue.
+ * Take a WQE from the wait queue and move it into SW ownership instead.
+ */
+bool srq_cooldown_wqe(struct mlx5_srq *srq, int ind)
+{
+ if (!srq_has_waitq(srq))
+ return false;
+
+ srq_put_in_waitq(srq, ind);
+ srq_get_from_waitq(srq);
+ return true;
+}
+
+/* Post a WQE internally, based on a previous application post.
+ * Copy a given WQE's data segments to the SRQ head, advance the head
+ * and ring the HW doorbell.
+ */
+static void srq_repost(struct mlx5_srq *srq, int ind)
+{
+ struct mlx5_wqe_srq_next_seg *src, *dst;
+ struct mlx5_wqe_data_seg *src_scat, *dst_scat;
+ int i;
+
+ srq->wrid[srq->head] = srq->wrid[ind];
+
+ src = get_wqe(srq, ind);
+ dst = get_wqe(srq, srq->head);
+ src_scat = (struct mlx5_wqe_data_seg *)(src + 1);
+ dst_scat = (struct mlx5_wqe_data_seg *)(dst + 1);
+
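+ /* Copy the scatter list, stopping once the end-of-list sentinel
+ * entry (lkey == MLX5_INVALID_LKEY) has been copied as well.
+ */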
+ for (i = 0; i < srq->max_gs; ++i) {
+ dst_scat[i] = src_scat[i];
+
+ if (dst_scat[i].lkey == htobe32(MLX5_INVALID_LKEY))
+ break;
+ }
+
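+ /* Advance the SRQ head past the reused slot and publish the new
+ * producer counter through the doorbell record.
+ */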
+ srq->head = be16toh(dst->next_wqe_index);
+ srq->counter++;
+ /* Flush descriptors */
+ udma_to_device_barrier();
+ *srq->db = htobe32(srq->counter);
+}
+
+void mlx5_complete_odp_fault(struct mlx5_srq *srq, int ind)
+{
+ mlx5_spin_lock(&srq->lock);
+
+ if (!srq_cooldown_wqe(srq, ind)) {
+ struct mlx5_wqe_srq_next_seg *tail = get_wqe(srq, srq->tail);
+
+ /* Without a wait queue, put the page-faulted WQE
+ * back at the SRQ tail. The repost is still possible, but
+ * the risk of overwriting the page-faulted WQE with a future
+ * post_srq_recv() is now higher.
+ */
+ tail->next_wqe_index = htobe16(ind);
+ srq->tail = ind;
+ }
+
+ srq_repost(srq, ind);
+
+ mlx5_spin_unlock(&srq->lock);
+}
+
int mlx5_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr)
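From the application's point of view this patch is transparent: an ODP page fault on an SRQ receive WQE no longer surfaces as an error completion, the faulted WQE is reposted internally, and an ordinary poll loop keeps working unmodified. A hypothetical caller-side illustration (standard libibverbs API; drain_cq and budget are names invented for this sketch):

#include <stdio.h>
#include <infiniband/verbs.h>

/* Drain up to 'budget' completions from a CQ. With this patch, ODP
 * page-fault CQEs for SRQ WQEs are consumed inside the provider and
 * therefore never appear in 'wc' here. */
static int drain_cq(struct ibv_cq *cq, int budget)
{
	struct ibv_wc wc;
	int n = 0, total = 0;

	while (total < budget && (n = ibv_poll_cq(cq, 1, &wc)) > 0) {
		if (wc.status != IBV_WC_SUCCESS)
			fprintf(stderr, "completion error %d (vendor 0x%x)\n",
				wc.status, wc.vendor_err);
		total += n;
	}
	return n < 0 ? n : total;
}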