[WIP,rdma-core] cxgb4: Add srq support for Chelsio Adapters

Message ID 20180424172522.21028-1-rajur@chelsio.com (mailing list archive)
State Not Applicable

Commit Message

Raju Rangoju April 24, 2018, 5:25 p.m. UTC
References for the corresponding kernel-mode SRQ submission:

https://www.spinics.net/lists/linux-rdma/msg63695.html
https://www.spinics.net/lists/linux-rdma/msg63696.html
https://www.spinics.net/lists/linux-rdma/msg63697.html

This patch adds the necessary changes to support the SRQ feature on Chelsio
adapters.

Signed-off-by: Raju Rangoju <rajur@chelsio.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
---
 kernel-headers/rdma/cxgb4-abi.h |  28 ++++-
 providers/cxgb4/cq.c            | 168 +++++++++++++++++++++++++++--
 providers/cxgb4/cxgb4-abi.h     |   5 +-
 providers/cxgb4/dev.c           |   2 +
 providers/cxgb4/libcxgb4.h      |  22 ++++
 providers/cxgb4/qp.c            | 228 ++++++++++++++++++++++++++++++---------
 providers/cxgb4/t4.h            | 164 +++++++++++++++++++++++++++-
 providers/cxgb4/t4_regs.h       |   4 +
 providers/cxgb4/t4fw_api.h      |   2 +
 providers/cxgb4/t4fw_ri_api.h   |  20 ++++
 providers/cxgb4/verbs.c         | 232 ++++++++++++++++++++++++++++++++--------
 11 files changed, 761 insertions(+), 114 deletions(-)

Comments

Jason Gunthorpe April 30, 2018, 7:57 p.m. UTC | #1
On Tue, Apr 24, 2018 at 10:55:22PM +0530, Raju Rangoju wrote:
> Reference for the corresponding kernel mode srq submission
> 
> https://www.spinics.net/lists/linux-rdma/msg63695.html
> https://www.spinics.net/lists/linux-rdma/msg63696.html
> https://www.spinics.net/lists/linux-rdma/msg63697.html
> 
> This patch adds necessary changes for supporting srq feature to chelsio
> adapters.
> 
> Signed-off-by: Raju Rangoju <rajur@chelsio.com>
> Reviewed-by: Steve Wise <swise@opengridcomputing.com>
>  kernel-headers/rdma/cxgb4-abi.h |  28 ++++-
>  providers/cxgb4/cq.c            | 168 +++++++++++++++++++++++++++--
>  providers/cxgb4/cxgb4-abi.h     |   5 +-
>  providers/cxgb4/dev.c           |   2 +
>  providers/cxgb4/libcxgb4.h      |  22 ++++
>  providers/cxgb4/qp.c            | 228 ++++++++++++++++++++++++++++++---------
>  providers/cxgb4/t4.h            | 164 +++++++++++++++++++++++++++-
>  providers/cxgb4/t4_regs.h       |   4 +
>  providers/cxgb4/t4fw_api.h      |   2 +
>  providers/cxgb4/t4fw_ri_api.h   |  20 ++++
>  providers/cxgb4/verbs.c         | 232 ++++++++++++++++++++++++++++++++--------
>  11 files changed, 761 insertions(+), 114 deletions(-)
> 
> diff --git a/kernel-headers/rdma/cxgb4-abi.h b/kernel-headers/rdma/cxgb4-abi.h
> index 1fefd014..55959158 100644
> +++ b/kernel-headers/rdma/cxgb4-abi.h
> @@ -44,6 +44,16 @@
>   * In particular do not use pointer types -- pass pointers in __aligned_u64
>   * instead.
>   */
> +
> +enum {
> +	C4IW_64B_CQE = (1 << 0)
> +};
> +
> +struct c4iw_create_cq {
> +	__u32 flags;
> +	__u32 reserved;
> +};
> +
>  struct c4iw_create_cq_resp {
>  	__aligned_u64 key;
>  	__aligned_u64 gts_key;
> @@ -51,7 +61,7 @@ struct c4iw_create_cq_resp {
>  	__u32 cqid;
>  	__u32 size;
>  	__u32 qid_mask;
> -	__u32 reserved; /* explicit padding (optional for i386) */
> +	__u32 flags;
>  };
>  
>  enum {
> @@ -84,4 +94,20 @@ struct c4iw_alloc_pd_resp {
>  	__u32 pdid;
>  };
>  
> +struct c4iw_create_srq_resp {
> +	//struct ibv_create_srq_resp ibv_resp;
> +	__u64 srq_key;
> +	__u64 srq_db_gts_key;
> +	__u64 srq_memsize;

These are supposed to be __aligned_u64 in the kernel patches
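
For illustration, a sketch of how the response struct could look once the
kernel side is fixed and re-synced: 64-bit members carried as
__aligned_u64, with explicit padding in the style of c4iw_create_cq_resp
(not the authoritative definition, which comes from the header sync):

struct c4iw_create_srq_resp {
	__aligned_u64 srq_key;
	__aligned_u64 srq_db_gts_key;
	__aligned_u64 srq_memsize;
	__u32 srqid;
	__u32 srq_size;
	__u32 rqt_abs_idx;
	__u32 qid_mask;
	__u32 flags;
	__u32 reserved; /* explicit padding, keeps the struct 8-byte sized */
};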


> +	__u32 srqid;
> +	__u32 srq_size;
> +	__u32 rqt_abs_idx;
> +	__u32 qid_mask;
> +	__u32 flags;
> +};
> +
> +enum {
> +	T4_SRQ_LIMIT_SUPPORT = (1<<0), /* HW supports SRQ_LIMIT_REACHED event */

() is unneeded

The above need to be fixed in the kernel patches.

Changes to kernel-headers/rdma/* need to be done via the
kernel-headers/update script, which creates a dedicated patch for the
kernel header sync.

> diff --git a/providers/cxgb4/cxgb4-abi.h b/providers/cxgb4/cxgb4-abi.h
> index 14fe8feb..63945719 100644
> +++ b/providers/cxgb4/cxgb4-abi.h
> @@ -58,12 +58,13 @@ struct c4iw_create_qp_resp_v0 _c4iw_create_qp_resp_v0;
>  DECLARE_DRV_CMD(uc4iw_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD,
>  		empty, c4iw_alloc_pd_resp);
>  DECLARE_DRV_CMD(uc4iw_create_cq, IB_USER_VERBS_CMD_CREATE_CQ,
> -		empty, c4iw_create_cq_resp);
> +		c4iw_create_cq, c4iw_create_cq_resp);

How are you handling compatibility here?

>  DECLARE_DRV_CMD(uc4iw_create_qp, IB_USER_VERBS_CMD_CREATE_QP,
>  		empty, c4iw_create_qp_resp);
>  DECLARE_DRV_CMD(uc4iw_create_qp_v0, IB_USER_VERBS_CMD_CREATE_QP,
>  		empty, c4iw_create_qp_resp_v0);
>  DECLARE_DRV_CMD(uc4iw_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT,
>  		empty, c4iw_alloc_ucontext_resp);
> -
> +DECLARE_DRV_CMD(uc4iw_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ,
> +		empty, c4iw_create_srq_resp);

This list is sorted by IB_USER_VERBS_CMD_xxx
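
Going by the existing entries, the list appears to be sorted alphabetically
on the IB_USER_VERBS_CMD_xxx token, so the new declaration would slot in
between the CREATE_QP and GET_CONTEXT entries, roughly (sketch only):

DECLARE_DRV_CMD(uc4iw_create_qp_v0, IB_USER_VERBS_CMD_CREATE_QP,
		empty, c4iw_create_qp_resp_v0);
DECLARE_DRV_CMD(uc4iw_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ,
		empty, c4iw_create_srq_resp);
DECLARE_DRV_CMD(uc4iw_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT,
		empty, c4iw_alloc_ucontext_resp);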

> +struct t4_srq {
> +	union  t4_recv_wr *queue;
> +	struct t4_swrqe *sw_rq;
> +	volatile u32 *udb;

use of volatile is probably wrong, what is this?

> +static inline void t4_ring_srq_db(struct t4_srq *srq, u16 inc, u8 len16,
> +				  union t4_recv_wr *wqe)
> +{
> +	mmio_wc_start();
> +	if (inc == 1 && srq->wc_reg_available) {
> +		PDBG("%s: WC srq->pidx = %d; len16=%d\n",
> +				__func__, srq->pidx, len16);
> +		copy_wqe_to_udb(srq->udb + 14, wqe);
> +	} else {
> +		PDBG("%s: DB srq->pidx = %d; len16=%d\n",
> +				__func__, srq->pidx, len16);
> +		writel(QID_V(srq->bar2_qid) | PIDX_T5_V(inc), srq->udb);
> +	}
> +	mmio_flush_writes();
> +	return;
> +}

Are you sure that shouldn't be in a lock?
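
As posted, both call sites do appear to run under srq->lock:
c4iw_post_srq_recv takes it before ringing the doorbell, and the poll path
reaches post_pending_srq_wrs() with it held via c4iw_poll_cq_one(). If that
invariant is intended, a wrapper along these lines would make it explicit
(c4iw_ring_srq_db_locked is a hypothetical helper, sketch only):

/*
 * Sketch: document the locking requirement instead of relying on every
 * caller to remember it.  Takes the c4iw_srq lock around the doorbell ring.
 */
static inline void c4iw_ring_srq_db_locked(struct c4iw_srq *srq, u16 inc,
					   u8 len16, union t4_recv_wr *wqe)
{
	pthread_spin_lock(&srq->lock);
	t4_ring_srq_db(&srq->wq, inc, len16, wqe);
	pthread_spin_unlock(&srq->lock);
}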

> diff --git a/providers/cxgb4/verbs.c b/providers/cxgb4/verbs.c
> index 3c493697..435bb238 100644
> +++ b/providers/cxgb4/verbs.c
> @@ -168,6 +168,7 @@ int c4iw_dereg_mr(struct ibv_mr *mr)
>  struct ibv_cq *c4iw_create_cq(struct ibv_context *context, int cqe,
>  			      struct ibv_comp_channel *channel, int comp_vector)
>  {
> +	struct uc4iw_create_cq cmd;
>  	struct uc4iw_create_cq_resp resp;
>  	struct c4iw_cq *chp;
>  	struct c4iw_dev *dev = to_c4iw_dev(context->device);
> @@ -178,16 +179,22 @@ struct ibv_cq *c4iw_create_cq(struct ibv_context *context, int cqe,
>  		return NULL;
>  	}
>  
> -	resp.reserved = 0;
> +	resp.flags = 0;
> +	memset(&cmd, 0, sizeof cmd);
> +	cmd.flags = C4IW_64B_CQE;
> +
>  	ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector,
> -				&chp->ibv_cq, NULL, 0,
> +				&chp->ibv_cq, &cmd.ibv_cmd, sizeof(cmd),
>  				&resp.ibv_resp, sizeof resp);
>  	if (ret)
>  		goto err1;
>  
> -	if (resp.reserved)
> -		PDBG("%s c4iw_create_cq_resp reserved field modified by kernel\n",
> -		     __FUNCTION__);
> +	if (!resp.flags) {
> +		fprintf(stderr, "libcxgb4 FATAL ERROR: downlevel iw_cxgb4 "
> +			"module. Cannot support RDMA with this driver/lib "
> +			"combination. Update your drivers!\n");

Oh, no, we don't do this. You have to support old kernels too.
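
For illustration, one backward-compatible shape keeps the legacy CQE layout
when the kernel never reports C4IW_64B_CQE back, instead of failing the CQ
creation (sketch only: cqe_size is a hypothetical field, and this assumes an
older iw_cxgb4 simply ignores the extra command payload and leaves
resp.flags at 0):

	ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector,
				&chp->ibv_cq, &cmd.ibv_cmd, sizeof(cmd),
				&resp.ibv_resp, sizeof(resp));
	if (ret)
		goto err1;

	/*
	 * An old kernel never sets C4IW_64B_CQE in resp.flags, so fall
	 * back to the 32B CQE layout rather than erroring out.
	 * chp->cqe_size is a hypothetical field used for illustration.
	 */
	chp->cqe_size = (resp.flags & C4IW_64B_CQE) ? 64 : 32;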

Jason

Patch

diff --git a/kernel-headers/rdma/cxgb4-abi.h b/kernel-headers/rdma/cxgb4-abi.h
index 1fefd014..55959158 100644
--- a/kernel-headers/rdma/cxgb4-abi.h
+++ b/kernel-headers/rdma/cxgb4-abi.h
@@ -44,6 +44,16 @@ 
  * In particular do not use pointer types -- pass pointers in __aligned_u64
  * instead.
  */
+
+enum {
+	C4IW_64B_CQE = (1 << 0)
+};
+
+struct c4iw_create_cq {
+	__u32 flags;
+	__u32 reserved;
+};
+
 struct c4iw_create_cq_resp {
 	__aligned_u64 key;
 	__aligned_u64 gts_key;
@@ -51,7 +61,7 @@  struct c4iw_create_cq_resp {
 	__u32 cqid;
 	__u32 size;
 	__u32 qid_mask;
-	__u32 reserved; /* explicit padding (optional for i386) */
+	__u32 flags;
 };
 
 enum {
@@ -84,4 +94,20 @@  struct c4iw_alloc_pd_resp {
 	__u32 pdid;
 };
 
+struct c4iw_create_srq_resp {
+	//struct ibv_create_srq_resp ibv_resp;
+	__u64 srq_key;
+	__u64 srq_db_gts_key;
+	__u64 srq_memsize;
+	__u32 srqid;
+	__u32 srq_size;
+	__u32 rqt_abs_idx;
+	__u32 qid_mask;
+	__u32 flags;
+};
+
+enum {
+	T4_SRQ_LIMIT_SUPPORT = (1<<0), /* HW supports SRQ_LIMIT_REACHED event */
+};
+
 #endif /* CXGB4_ABI_USER_H */
diff --git a/providers/cxgb4/cq.c b/providers/cxgb4/cq.c
index be6cf2f2..0cdc1c09 100644
--- a/providers/cxgb4/cq.c
+++ b/providers/cxgb4/cq.c
@@ -40,7 +40,7 @@ 
 #include "libcxgb4.h"
 #include "cxgb4-abi.h"
 
-static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
+static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq, u32 srqidx)
 {
 	struct t4_cqe cqe;
 
@@ -53,6 +53,8 @@  static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
 				 V_CQE_SWCQE(1) |
 				 V_CQE_QPID(wq->sq.qid));
 	cqe.bits_type_ts = htobe64(V_CQE_GENBIT((u64)cq->gen));
+	if (srqidx)
+		cqe.u.srcqe.abs_rqe_idx = htobe32(srqidx);
 	cq->sw_queue[cq->sw_pidx] = cqe;
 	t4_swcq_produce(cq);
 }
@@ -66,7 +68,7 @@  int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count)
 	PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__,
 	     wq, cq, wq->rq.in_use, count);
 	while (in_use--) {
-		insert_recv_cqe(wq, cq);
+		insert_recv_cqe(wq, cq, 0);
 		flushed++;
 	}
 	return flushed;
@@ -327,6 +329,100 @@  static void dump_cqe(void *arg)
 	       (long long)be64toh(p[3]));
 }
 
+static void post_pending_srq_wrs(struct t4_srq *srq)
+{
+	struct t4_srq_pending_wr *pwr;
+	u16 idx = 0;
+
+	while (srq->pending_in_use) {
+
+		assert(!srq->sw_rq[srq->pidx].valid);
+
+		pwr = &srq->pending_wrs[srq->pending_cidx];
+		srq->sw_rq[srq->pidx].wr_id = pwr->wr_id;
+		srq->sw_rq[srq->pidx].valid = 1;
+
+		PDBG("%s posting pending cidx %u pidx %u wq_pidx %u "
+				"in_use %u rq_size %u wr_id %llx\n", __func__,
+				srq->cidx, srq->pidx,
+				srq->wq_pidx, srq->in_use, srq->size,
+				(unsigned long long)pwr->wr_id);
+
+		c4iw_copy_wr_to_srq(srq, &pwr->wqe, pwr->len16);
+		t4_srq_consume_pending_wr(srq);
+		t4_srq_produce(srq, pwr->len16);
+		idx += DIV_ROUND_UP(pwr->len16*16, T4_EQ_ENTRY_SIZE);
+	}
+
+	if (idx) {
+		t4_ring_srq_db(srq, idx, pwr->len16, &pwr->wqe);
+		srq->queue[srq->size].status.host_wq_pidx =
+			srq->wq_pidx;
+	}
+}
+
+struct t4_srq *find_srq(struct t4_cqe *hw_cqe, struct t4_cq *cq);
+struct t4_srq *find_srq(struct t4_cqe *hw_cqe, struct t4_cq *cq)
+{
+	struct c4iw_cq *chp = container_of(cq, struct c4iw_cq, cq);
+	struct c4iw_dev *dev = chp->rhp;
+	struct c4iw_srq *srq = NULL;
+	struct t4_srq *wq = NULL;
+
+	pthread_spin_lock(&dev->lock);
+	list_for_each(&dev->srq_list, srq, list) {
+		if ((CQE_ABS_RQE_IDX(hw_cqe) >= srq->wq.rqt_abs_idx) &&
+				(CQE_ABS_RQE_IDX(hw_cqe) <= srq->wq.rqt_abs_idx +
+				 srq->wq.size - 1)) {
+			wq = &srq->wq;
+			PDBG("%s found t4_srq\n", __func__);
+			break;
+		}
+	}
+	pthread_spin_unlock(&dev->lock);
+	return wq;
+}
+
+static u64 reap_srq_cqe(struct t4_cqe *hw_cqe, struct t4_srq *srq)
+{
+	int rel_idx = CQE_ABS_RQE_IDX(hw_cqe) - srq->rqt_abs_idx;
+	u64 wr_id;
+
+	BUG_ON(rel_idx >= srq->size);
+
+	assert(srq->sw_rq[rel_idx].valid);
+	srq->sw_rq[rel_idx].valid = 0;
+	wr_id = srq->sw_rq[rel_idx].wr_id;
+
+	if (rel_idx == srq->cidx) {
+		PDBG("%s in order cqe rel_idx %u cidx %u pidx %u wq_pidx %u "
+				"in_use %u rq_size %u wr_id %llx\n", __func__,
+				rel_idx, srq->cidx, srq->pidx,
+				srq->wq_pidx, srq->in_use, srq->size,
+				(unsigned long long)srq->sw_rq[rel_idx].wr_id);
+		t4_srq_consume(srq);
+		while (srq->ooo_count && !srq->sw_rq[srq->cidx].valid) {
+			PDBG("%s eat ooo cidx %u pidx %u wq_pidx %u "
+					"in_use %u rq_size %u ooo_count %u wr_id %llx\n", __func__,
+					srq->cidx, srq->pidx,
+					srq->wq_pidx, srq->in_use, srq->size, srq->ooo_count,
+					(unsigned long long)srq->sw_rq[srq->cidx].wr_id);
+			t4_srq_consume_ooo(srq);
+		}
+		if (srq->ooo_count == 0 && srq->pending_in_use)
+			post_pending_srq_wrs(srq);
+	} else {
+		BUG_ON(srq->in_use == 0);
+		PDBG("%s ooo cqe rel_idx %u cidx %u pidx %u wq_pidx %u "
+				"in_use %u rq_size %u ooo_count %u wr_id %llx\n", __func__,
+				rel_idx, srq->cidx, srq->pidx,
+				srq->wq_pidx, srq->in_use, srq->size, srq->ooo_count,
+				(unsigned long long)srq->sw_rq[rel_idx].wr_id);
+		t4_srq_produce_ooo(srq);
+	}
+	return wr_id;
+}
+
 /*
  * poll_cq
  *
@@ -344,7 +440,7 @@  static void dump_cqe(void *arg)
  *    -EOVERFLOW    CQ overflow detected.
  */
 static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
-	           u8 *cqe_flushed, u64 *cookie, u32 *credit)
+	           u8 *cqe_flushed, u64 *cookie, u32 *credit, struct t4_srq *srq)
 {
 	int ret = 0;
 	struct t4_cqe *hw_cqe, read_cqe;
@@ -367,6 +463,13 @@  static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
 	 * skip cqe's not affiliated with a QP.
 	 */
 	if (wq == NULL) {
+#if 0		/* If this is an SRQ CQE then update the srq state. */
+		if (CQE_IS_SRQ(hw_cqe) && (srq = find_srq(hw_cqe, cq))) {
+			PDBG("%s found srq, reaping it, hw_cqe %p srq %p\n",
+					__func__,hw_cqe, srq);
+			(void)reap_srq_cqe(hw_cqe, srq);
+		}
+#endif
 		ret = -EAGAIN;
 		goto skip_cqe;
 	}
@@ -454,11 +557,15 @@  static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
 		 * error.
 		 */
 
-		if (t4_rq_empty(wq)) {
+		//BUG_ON(srq ? t4_srq_empty(srq) : t4_rq_empty(wq));
+		if (srq) {
+			t4_srq_empty(srq);
+		} else if (t4_rq_empty(wq)) {
 			t4_set_wq_in_error(wq);
 			ret = -EAGAIN;
 			goto skip_cqe;
 		}
+
 		if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
 			t4_set_wq_in_error(wq);
 			hw_cqe->header |= htobe32(V_CQE_STATUS(T4_ERR_MSN));
@@ -522,11 +629,15 @@  proc_cqe:
 		*cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id;
 		t4_sq_consume(wq);
 	} else {
-		PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx);
-		BUG_ON(wq->rq.cidx >= wq->rq.size);
-		*cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
-		BUG_ON(t4_rq_empty(wq));
-		t4_rq_consume(wq);
+		if (!srq) {
+			PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx);
+			BUG_ON(wq->rq.cidx >= wq->rq.size);
+			*cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
+			BUG_ON(t4_rq_empty(wq));
+			t4_rq_consume(wq);
+		} else
+			*cookie = reap_srq_cqe(hw_cqe, srq);
+		wq->rq.msn++;
 		goto skip_cqe;
 	}
 
@@ -549,6 +660,18 @@  skip_cqe:
 	return ret;
 }
 
+static void generate_srq_limit_event(struct c4iw_srq *srq)
+{
+	struct ibv_modify_srq cmd;
+	struct ibv_srq_attr attr = {0};
+	int ret;
+
+	srq->armed = 0;
+	ret = ibv_cmd_modify_srq(&srq->ibv_srq, &attr, 0, &cmd, sizeof cmd);
+	if (ret)
+		fprintf(stderr, "Failure to send srq_limit event - ret %d errno %d\n", ret, errno);
+}
+
 /*
  * Get one cq entry from c4iw and map it to openib.
  *
@@ -561,6 +684,7 @@  skip_cqe:
 static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
 {
 	struct c4iw_qp *qhp = NULL;
+	struct c4iw_srq *srq = NULL;
 	struct t4_cqe uninitialized_var(cqe), *rd_cqe;
 	struct t4_wq *wq;
 	u32 credit = 0;
@@ -595,8 +719,12 @@  static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
 	else {
 		pthread_spin_lock(&qhp->lock);
 		wq = &(qhp->wq);
+		srq = qhp->srq;
+		if (srq)
+			pthread_spin_lock(&srq->lock);
 	}
-	ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit);
+	ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit,
+		      srq ? &srq->wq : NULL);
 	if (ret)
 		goto out;
 
@@ -606,6 +734,13 @@  static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
 	wc->vendor_err = CQE_STATUS(&cqe);
 	wc->wc_flags = 0;
 
+	/*
+	 * Simulate a SRQ_LIMIT_REACHED HW notification if required.
+	 */
+	if (srq && !(srq->flags & T4_SRQ_LIMIT_SUPPORT) && srq->armed &&
+			srq->wq.in_use < srq->srq_limit)
+		generate_srq_limit_event(srq);
+
 	PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x "
 	     "lo 0x%x cookie 0x%llx\n", __func__,
 	     CQE_QPID(&cqe), CQE_TYPE(&cqe),
@@ -704,8 +839,11 @@  static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
 			chp->cq.cqid, CQE_QPID(&cqe), CQE_TYPE(&cqe),
 			CQE_OPCODE(&cqe), CQE_STATUS(&cqe));
 out:
-	if (wq)
+	if (wq) {
 		pthread_spin_unlock(&qhp->lock);
+		if (srq)
+			pthread_spin_unlock(&srq->lock);
+	}
 	return ret;
 }
 
@@ -749,3 +887,11 @@  int c4iw_arm_cq(struct ibv_cq *ibcq, int solicited)
 	pthread_spin_unlock(&chp->lock);
 	return ret;
 }
+
+void c4iw_flush_srqidx(struct c4iw_qp *qhp, u32 srqidx)
+{
+	struct c4iw_cq * rchp = to_c4iw_cq(qhp->ibv_qp.recv_cq);
+
+	/* create a SRQ RECV CQE for srqidx */
+	insert_recv_cqe(&qhp->wq, &rchp->cq, srqidx);
+}
diff --git a/providers/cxgb4/cxgb4-abi.h b/providers/cxgb4/cxgb4-abi.h
index 14fe8feb..63945719 100644
--- a/providers/cxgb4/cxgb4-abi.h
+++ b/providers/cxgb4/cxgb4-abi.h
@@ -58,12 +58,13 @@  struct c4iw_create_qp_resp_v0 _c4iw_create_qp_resp_v0;
 DECLARE_DRV_CMD(uc4iw_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD,
 		empty, c4iw_alloc_pd_resp);
 DECLARE_DRV_CMD(uc4iw_create_cq, IB_USER_VERBS_CMD_CREATE_CQ,
-		empty, c4iw_create_cq_resp);
+		c4iw_create_cq, c4iw_create_cq_resp);
 DECLARE_DRV_CMD(uc4iw_create_qp, IB_USER_VERBS_CMD_CREATE_QP,
 		empty, c4iw_create_qp_resp);
 DECLARE_DRV_CMD(uc4iw_create_qp_v0, IB_USER_VERBS_CMD_CREATE_QP,
 		empty, c4iw_create_qp_resp_v0);
 DECLARE_DRV_CMD(uc4iw_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT,
 		empty, c4iw_alloc_ucontext_resp);
-
+DECLARE_DRV_CMD(uc4iw_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ,
+		empty, c4iw_create_srq_resp);
 #endif				/* IWCH_ABI_H */
diff --git a/providers/cxgb4/dev.c b/providers/cxgb4/dev.c
index b1870219..3479e561 100644
--- a/providers/cxgb4/dev.c
+++ b/providers/cxgb4/dev.c
@@ -84,6 +84,7 @@  static const struct verbs_context_ops  c4iw_ctx_common_ops = {
 	.create_srq = c4iw_create_srq,
 	.modify_srq = c4iw_modify_srq,
 	.destroy_srq = c4iw_destroy_srq,
+	.query_srq = c4iw_query_srq,
 	.create_qp = c4iw_create_qp,
 	.modify_qp = c4iw_modify_qp,
 	.destroy_qp = c4iw_destroy_qp,
@@ -456,6 +457,7 @@  static struct verbs_device *c4iw_device_alloc(struct verbs_sysfs_dev *sysfs_dev)
 	dev->abi_version = sysfs_dev->abi_ver;
 	list_node_init(&dev->list);
 
+	list_head_init(&dev->srq_list);
 	PDBG("%s device claimed\n", __FUNCTION__);
 	list_add_tail(&devices, &dev->list);
 #ifdef STALL_DETECTION
diff --git a/providers/cxgb4/libcxgb4.h b/providers/cxgb4/libcxgb4.h
index 893bd85d..d1b96791 100644
--- a/providers/cxgb4/libcxgb4.h
+++ b/providers/cxgb4/libcxgb4.h
@@ -59,6 +59,7 @@  struct c4iw_dev {
 	struct c4iw_qp **qpid2ptr;
 	int max_cq;
 	struct c4iw_cq **cqid2ptr;
+	struct list_head srq_list;
 	pthread_spinlock_t lock;
 	struct list_node list;
 	int abi_version;
@@ -117,11 +118,29 @@  struct c4iw_qp {
 	struct t4_wq wq;
 	pthread_spinlock_t lock;
 	int sq_sig_all;
+	struct c4iw_srq *srq;
 };
 
 #define to_c4iw_xxx(xxx, type)						\
 	container_of(ib##xxx, struct c4iw_##type, ibv_##xxx)
 
+struct c4iw_srq {
+	struct ibv_srq ibv_srq;
+	int type;                       /* must be 2nd in this struct */
+	struct c4iw_dev *rhp;
+	struct t4_srq wq;
+	struct list_node list;
+	pthread_spinlock_t lock;
+	uint32_t srq_limit;
+	int armed;
+	__u32 flags;
+};
+
+static inline struct c4iw_srq *to_c4iw_srq(struct ibv_srq *ibsrq)
+{
+	return to_c4iw_xxx(srq, srq);
+}
+
 static inline struct c4iw_dev *to_c4iw_dev(struct ibv_device *ibdev)
 {
 	return container_of(ibdev, struct c4iw_dev, ibv_dev.device);
@@ -201,6 +220,7 @@  int c4iw_destroy_srq(struct ibv_srq *srq);
 int c4iw_post_srq_recv(struct ibv_srq *ibsrq,
 			      struct ibv_recv_wr *wr,
 			      struct ibv_recv_wr **bad_wr);
+int c4iw_query_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr);
 
 struct ibv_qp *c4iw_create_qp(struct ibv_pd *pd,
 				     struct ibv_qp_init_attr *attr);
@@ -229,6 +249,8 @@  void c4iw_flush_hw_cq(struct c4iw_cq *chp);
 int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count);
 void c4iw_flush_sq(struct c4iw_qp *qhp);
 void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count);
+void c4iw_copy_wr_to_srq(struct t4_srq *srq, union t4_recv_wr *wqe, u8 len16);
+void c4iw_flush_srqidx(struct c4iw_qp *qhp, u32 srqidx);
 
 #define FW_MAJ 0
 #define FW_MIN 0
diff --git a/providers/cxgb4/qp.c b/providers/cxgb4/qp.c
index af04e3a1..cb4ea785 100644
--- a/providers/cxgb4/qp.c
+++ b/providers/cxgb4/qp.c
@@ -92,6 +92,23 @@  static void copy_wr_to_rq(struct t4_wq *wq, union t4_recv_wr *wqe, u8 len16)
 	}
 }
 
+void c4iw_copy_wr_to_srq(struct t4_srq *srq, union t4_recv_wr *wqe, u8 len16)
+{
+	u64 *src, *dst;
+
+	src = (u64 *)wqe;
+	dst = (u64 *)((u8 *)srq->queue + srq->wq_pidx * T4_EQ_ENTRY_SIZE);
+	while (len16) {
+		*dst++ = *src++;
+		if (dst >= (u64 *)&srq->queue[srq->size])
+			dst = (u64 *)srq->queue;
+		*dst++ = *src++;
+		if (dst >= (u64 *)&srq->queue[srq->size])
+			dst = (u64 *)srq->queue;
+		len16--;
+	}
+}
+
 static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp,
 		      struct ibv_send_wr *wr, int max, u32 *plenp)
 {
@@ -277,6 +294,20 @@  static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe,
 	return 0;
 }
 
+static int build_srq_recv(union t4_recv_wr *wqe, struct ibv_recv_wr *wr,
+		u8 *len16)
+{
+	int ret;
+
+	ret = build_isgl(&wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL);
+	if (ret)
+		return ret;
+	*len16 = DIV_ROUND_UP(sizeof wqe->recv +
+			wr->num_sge * sizeof(struct fw_ri_sge), 16);
+	return 0;
+}
+
+
 static void ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 idx)
 {
 	struct ibv_modify_qp cmd = {};
@@ -299,7 +330,7 @@  static void ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 idx)
 }
 
 int c4iw_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
-	           struct ibv_send_wr **bad_wr)
+		   struct ibv_send_wr **bad_wr)
 {
 	int err = 0;
 	u8 uninitialized_var(len16);
@@ -339,37 +370,37 @@  int c4iw_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 			fw_flags |= FW_RI_COMPLETION_FLAG;
 		swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
 		switch (wr->opcode) {
-		case IBV_WR_SEND:
-			INC_STAT(send);
-			if (wr->send_flags & IBV_SEND_FENCE)
-				fw_flags |= FW_RI_READ_FENCE_FLAG;
-			fw_opcode = FW_RI_SEND_WR;
-			swsqe->opcode = FW_RI_SEND;
-			err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16);
-			break;
-		case IBV_WR_RDMA_WRITE:
-			INC_STAT(write);
-			fw_opcode = FW_RI_RDMA_WRITE_WR;
-			swsqe->opcode = FW_RI_RDMA_WRITE;
-			err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16);
-			break;
-		case IBV_WR_RDMA_READ:
-			INC_STAT(read);
-			fw_opcode = FW_RI_RDMA_READ_WR;
-			swsqe->opcode = FW_RI_READ_REQ;
-			fw_flags = 0;
-			err = build_rdma_read(wqe, wr, &len16);
-			if (err)
+			case IBV_WR_SEND:
+				INC_STAT(send);
+				if (wr->send_flags & IBV_SEND_FENCE)
+					fw_flags |= FW_RI_READ_FENCE_FLAG;
+				fw_opcode = FW_RI_SEND_WR;
+				swsqe->opcode = FW_RI_SEND;
+				err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16);
 				break;
-			swsqe->read_len = wr->sg_list ? wr->sg_list[0].length :
-					  0;
-			if (!qhp->wq.sq.oldest_read)
-				qhp->wq.sq.oldest_read = swsqe;
-			break;
-		default:
-			PDBG("%s post of type=%d TBD!\n", __func__,
-			     wr->opcode);
-			err = -EINVAL;
+			case IBV_WR_RDMA_WRITE:
+				INC_STAT(write);
+				fw_opcode = FW_RI_RDMA_WRITE_WR;
+				swsqe->opcode = FW_RI_RDMA_WRITE;
+				err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16);
+				break;
+			case IBV_WR_RDMA_READ:
+				INC_STAT(read);
+				fw_opcode = FW_RI_RDMA_READ_WR;
+				swsqe->opcode = FW_RI_READ_REQ;
+				fw_flags = 0;
+				err = build_rdma_read(wqe, wr, &len16);
+				if (err)
+					break;
+				swsqe->read_len = wr->sg_list ? wr->sg_list[0].length :
+					0;
+				if (!qhp->wq.sq.oldest_read)
+					qhp->wq.sq.oldest_read = swsqe;
+				break;
+			default:
+				PDBG("%s post of type=%d TBD!\n", __func__,
+						wr->opcode);
+				err = -EINVAL;
 		}
 		if (err) {
 			*bad_wr = wr;
@@ -474,6 +505,89 @@  int c4iw_post_receive(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 	return err;
 }
 
+static void defer_srq_wr(struct t4_srq *srq, union t4_recv_wr *wqe, uint64_t wr_id, u8 len16)
+{
+	struct t4_srq_pending_wr *pwr = &srq->pending_wrs[srq->pending_pidx];
+
+	PDBG("%s cidx %u pidx %u wq_pidx %u in_use %u ooo_count %u wr_id 0x%llx "
+		"pending_cidx %u pending_pidx %u pending_in_use %u\n",
+		__func__, srq->cidx, srq->pidx, srq->wq_pidx,
+		srq->in_use, srq->ooo_count, (unsigned long long)wr_id, srq->pending_cidx,
+		srq->pending_pidx, srq->pending_in_use);
+	pwr->wr_id = wr_id;
+	pwr->len16 = len16;
+	memcpy(&pwr->wqe, wqe, len16*16);
+	t4_srq_produce_pending_wr(srq);
+}
+
+int c4iw_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr,
+		struct ibv_recv_wr **bad_wr)
+{
+	int err = 0;
+	struct c4iw_srq *srq;
+	union t4_recv_wr *wqe, lwqe;
+	u32 num_wrs;
+	u8 len16 = 0;
+	u16 idx = 0;
+
+	srq = to_c4iw_srq(ibsrq);
+	pthread_spin_lock(&srq->lock);
+	INC_STAT(srq_recv);
+	num_wrs = t4_srq_avail(&srq->wq);
+	if (num_wrs == 0) {
+		pthread_spin_unlock(&srq->lock);
+		return -ENOMEM;
+	}
+	while (wr) {
+		if (wr->num_sge > T4_MAX_RECV_SGE) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+		wqe = &lwqe;
+		if (num_wrs)
+			err = build_srq_recv(wqe, wr, &len16);
+		else
+			err = -ENOMEM;
+		if (err) {
+			*bad_wr = wr;
+			break;
+		}
+
+		wqe->recv.opcode = FW_RI_RECV_WR;
+		wqe->recv.r1 = 0;
+		wqe->recv.wrid = srq->wq.pidx;
+		wqe->recv.r2[0] = 0;
+		wqe->recv.r2[1] = 0;
+		wqe->recv.r2[2] = 0;
+		wqe->recv.len16 = len16;
+
+		if (srq->wq.ooo_count || srq->wq.pending_in_use || srq->wq.sw_rq[srq->wq.pidx].valid)
+			defer_srq_wr(&srq->wq, wqe, wr->wr_id, len16);
+		else {
+			srq->wq.sw_rq[srq->wq.pidx].wr_id = wr->wr_id;
+			srq->wq.sw_rq[srq->wq.pidx].valid = 1;
+			c4iw_copy_wr_to_srq(&srq->wq, wqe, len16);
+			PDBG("%s cidx %u pidx %u wq_pidx %u in_use %u "
+					"wr_id 0x%llx \n", __func__, srq->wq.cidx,
+					srq->wq.pidx, srq->wq.wq_pidx, srq->wq.in_use,
+					(unsigned long long)wr->wr_id);
+			t4_srq_produce(&srq->wq, len16);
+			idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+		}
+		wr = wr->next;
+		num_wrs--;
+	}
+
+	if (idx) {
+		t4_ring_srq_db(&srq->wq, idx, len16, wqe);
+		srq->wq.queue[srq->wq.size].status.host_wq_pidx =
+			srq->wq.wq_pidx;
+	}
+	pthread_spin_unlock(&srq->lock);
+	return err;
+}
+
 static void update_qp_state(struct c4iw_qp *qhp)
 {
 	struct ibv_query_qp cmd;
@@ -488,44 +602,56 @@  static void update_qp_state(struct c4iw_qp *qhp)
 		qhp->ibv_qp.state = attr.qp_state;
 }
 
-/*
- * Assumes qhp lock is held.
- */
 void c4iw_flush_qp(struct c4iw_qp *qhp)
 {
 	struct c4iw_cq *rchp, *schp;
 	int count;
-
-	if (qhp->wq.flushed)
-		return;
-
-	update_qp_state(qhp);
+	u32 srqidx = t4_wq_srqidx(&qhp->wq);
 
 	rchp = to_c4iw_cq(qhp->ibv_qp.recv_cq);
 	schp = to_c4iw_cq(qhp->ibv_qp.send_cq);
 
 	PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp);
-	qhp->wq.flushed = 1;
-	pthread_spin_unlock(&qhp->lock);
 
 	/* locking heirarchy: cq lock first, then qp lock. */
 	pthread_spin_lock(&rchp->lock);
+	if (schp != rchp)
+		pthread_spin_lock(&schp->lock);
 	pthread_spin_lock(&qhp->lock);
-	c4iw_flush_hw_cq(rchp);
-	c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
-	c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
-	pthread_spin_unlock(&qhp->lock);
-	pthread_spin_unlock(&rchp->lock);
 
-	/* locking heirarchy: cq lock first, then qp lock. */
-	pthread_spin_lock(&schp->lock);
-	pthread_spin_lock(&qhp->lock);
+	if (qhp->wq.flushed) {
+		pthread_spin_unlock(&qhp->lock);
+		if (schp != rchp)
+			pthread_spin_unlock(&schp->lock);
+		pthread_spin_unlock(&rchp->lock);
+		return;
+	}
+
+	qhp->wq.flushed = 1;
+	t4_set_wq_in_error(&qhp->wq);
+
+	if (qhp->srq)
+		pthread_spin_lock(&qhp->srq->lock);
+
+	if (srqidx)
+		c4iw_flush_srqidx(qhp, srqidx);
+
+	update_qp_state(qhp);
+	c4iw_flush_hw_cq(rchp);
+	if (!qhp->srq) {
+		c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
+		c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
+	}
 	if (schp != rchp)
 		c4iw_flush_hw_cq(schp);
 	c4iw_flush_sq(qhp);
+
+	if (qhp->srq)
+		pthread_spin_unlock(&qhp->srq->lock);
 	pthread_spin_unlock(&qhp->lock);
-	pthread_spin_unlock(&schp->lock);
-	pthread_spin_lock(&qhp->lock);
+	if (schp != rchp)
+		pthread_spin_unlock(&schp->lock);
+	pthread_spin_unlock(&rchp->lock);
 }
 
 void c4iw_flush_qps(struct c4iw_dev *dev)
diff --git a/providers/cxgb4/t4.h b/providers/cxgb4/t4.h
index fb10002b..613b462a 100644
--- a/providers/cxgb4/t4.h
+++ b/providers/cxgb4/t4.h
@@ -67,7 +67,7 @@ 
 
 #ifdef DEBUG
 #define DBGLOG(s)
-#define PDBG(fmt, args...) do {syslog(LOG_DEBUG, fmt, ##args); } while (0)
+#define PDBG(fmt, args...) do {syslog(LOG_ALERT, fmt, ##args); } while (0)
 #else
 #define DBGLOG(s)
 #define PDBG(fmt, args...) do {} while (0)
@@ -100,10 +100,12 @@  struct t4_status_page {
 	__be16 pidx;
 	u8 qp_err;	/* flit 1 - sw owns */
 	u8 db_off;
-	u8 pad;
+	u8 pad[2];
 	u16 host_wq_pidx;
 	u16 host_cidx;
 	u16 host_pidx;
+	u16 pad2;
+	u32 srqidx;
 };
 
 #define T4_EQ_ENTRY_SIZE 64
@@ -212,8 +214,14 @@  struct t4_cqe {
 			__be32 wrid_hi;
 			__be32 wrid_low;
 		} gen;
+		struct {
+			__be32 stag;
+			__be32 msn;
+			__be32 reserved;
+			__be32 abs_rqe_idx;
+		} srcqe;
 	} u;
-	__be64 reserved;
+	__be64 reserved[4];
 	__be64 bits_type_ts;
 };
 
@@ -263,6 +271,7 @@  struct t4_cqe {
 /* used for RQ completion processing */
 #define CQE_WRID_STAG(x)  (be32toh((x)->u.rcqe.stag))
 #define CQE_WRID_MSN(x)   (be32toh((x)->u.rcqe.msn))
+#define CQE_ABS_RQE_IDX(x) (be32toh((x)->u.srcqe.abs_rqe_idx))
 
 /* used for SQ completion processing */
 #define CQE_WRID_SQ_IDX(x)	(x)->u.scqe.cidx
@@ -291,6 +300,7 @@  struct t4_cqe {
 #define CQE_OVFBIT(x)	((unsigned)G_CQE_OVFBIT(be64toh((x)->bits_type_ts)))
 #define CQE_GENBIT(x)	((unsigned)G_CQE_GENBIT(be64toh((x)->bits_type_ts)))
 #define CQE_TS(x)	(G_CQE_TS(be64toh((x)->bits_type_ts)))
+//#define CQE_IS_SRQ(x)   ((x)->rss.opcode == CPL_RDMA_CQE_SRQ)
 
 struct t4_swsqe {
 	u64			wr_id;
@@ -331,6 +341,7 @@  struct t4_sq {
 
 struct t4_swrqe {
 	u64 wr_id;
+	int valid;
 };
 
 struct t4_rq {
@@ -359,6 +370,8 @@  struct t4_wq {
 	int error;
 	int flushed;
 	u8 *db_offp;
+	u8 *qp_errp;
+	u32 *srqidxp;
 };
 
 static inline int t4_rqes_posted(struct t4_wq *wq)
@@ -396,7 +409,6 @@  static inline void t4_rq_produce(struct t4_wq *wq, u8 len16)
 static inline void t4_rq_consume(struct t4_wq *wq)
 {
 	wq->rq.in_use--;
-	wq->rq.msn++;
 	if (++wq->rq.cidx == wq->rq.size)
 		wq->rq.cidx = 0;
 	assert((wq->rq.cidx != wq->rq.pidx) || wq->rq.in_use == 0);
@@ -404,6 +416,122 @@  static inline void t4_rq_consume(struct t4_wq *wq)
 		wq->rq.queue[wq->rq.size].status.host_cidx = wq->rq.cidx;
 }
 
+struct t4_srq_pending_wr {
+	u64 wr_id;
+	union t4_recv_wr wqe;
+	u8 len16;
+};
+
+struct t4_srq {
+	union  t4_recv_wr *queue;
+	struct t4_swrqe *sw_rq;
+	volatile u32 *udb;
+	size_t memsize;
+	u32 qid;
+	u32 bar2_qid;
+	u32 msn;
+	u32 rqt_hwaddr;
+	u32 rqt_abs_idx;
+	u16 in_use;
+	u16 size;
+	u16 cidx;
+	u16 pidx;
+	u16 wq_pidx;
+	int wc_reg_available;
+	struct t4_srq_pending_wr *pending_wrs;
+	u16 pending_cidx;
+	u16 pending_pidx;
+	u16 pending_in_use;
+	u16 ooo_count;
+};
+
+static inline u32 t4_srq_avail(struct t4_srq *srq)
+{
+	return srq->size - 1 - srq->in_use;
+}
+
+static inline int t4_srq_empty(struct t4_srq *srq)
+{
+	return srq->in_use == 0;
+}
+
+static inline int t4_srq_cidx_at_end(struct t4_srq *srq)
+{
+	assert(srq->cidx != srq->pidx);
+	if (srq->cidx < srq->pidx)
+		return srq->cidx == (srq->pidx - 1);
+	else
+		return srq->cidx == (srq->size - 1) && srq->pidx == 0;
+}
+
+static inline int t4_srq_wrs_pending(struct t4_srq *srq)
+{
+	return srq->pending_cidx != srq->pending_pidx;
+}
+
+static inline void t4_srq_produce(struct t4_srq *srq, u8 len16)
+{
+	srq->in_use++;
+	assert(srq->in_use < srq->size);
+	if (++srq->pidx == srq->size)
+		srq->pidx = 0;
+	assert(srq->cidx != srq->pidx); /* overflow */
+	srq->wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+	if (srq->wq_pidx >= srq->size * T4_RQ_NUM_SLOTS)
+		srq->wq_pidx %= srq->size * T4_RQ_NUM_SLOTS;
+	srq->queue[srq->size].status.host_pidx = srq->pidx;
+}
+
+static inline void t4_srq_produce_pending_wr(struct t4_srq *srq)
+{
+	srq->pending_in_use++;
+	srq->in_use++;
+	assert(srq->pending_in_use < srq->size);
+	assert(srq->in_use < srq->size);
+	assert(srq->pending_pidx < srq->size);
+	if (++srq->pending_pidx == srq->size)
+		srq->pending_pidx = 0;
+}
+
+static inline void t4_srq_consume_pending_wr(struct t4_srq *srq)
+{
+	assert(srq->pending_in_use > 0);
+	srq->pending_in_use--;
+	assert(srq->in_use > 0);
+	srq->in_use--;
+	if (++srq->pending_cidx == srq->size)
+		srq->pending_cidx = 0;
+	assert((srq->pending_cidx != srq->pending_pidx) || srq->pending_in_use == 0);
+}
+
+static inline void t4_srq_produce_ooo(struct t4_srq *srq)
+{
+	assert(srq->in_use > 0);
+	srq->in_use--;
+	srq->ooo_count++;
+	assert(srq->ooo_count < srq->size);
+}
+
+static inline void t4_srq_consume_ooo(struct t4_srq *srq)
+{
+	srq->cidx++;
+	if (srq->cidx == srq->size)
+		srq->cidx  = 0;
+	srq->queue[srq->size].status.host_cidx = srq->cidx;
+	assert(srq->ooo_count > 0);
+	srq->ooo_count--;
+}
+
+static inline void t4_srq_consume(struct t4_srq *srq)
+{
+	assert(srq->in_use > 0);
+	srq->in_use--;
+	if (++srq->cidx == srq->size)
+		srq->cidx = 0;
+	assert((srq->cidx != srq->pidx) || srq->in_use == 0);
+	srq->queue[srq->size].status.host_cidx = srq->cidx;
+}
+
 static inline int t4_sq_empty(struct t4_wq *wq)
 {
 	return wq->sq.in_use == 0;
@@ -471,6 +599,23 @@  static void copy_wqe_to_udb(volatile u32 *udb_offset, void *wqe)
 	}
 }
 
+static inline void t4_ring_srq_db(struct t4_srq *srq, u16 inc, u8 len16,
+				  union t4_recv_wr *wqe)
+{
+	mmio_wc_start();
+	if (inc == 1 && srq->wc_reg_available) {
+		PDBG("%s: WC srq->pidx = %d; len16=%d\n",
+				__func__, srq->pidx, len16);
+		copy_wqe_to_udb(srq->udb + 14, wqe);
+	} else {
+		PDBG("%s: DB srq->pidx = %d; len16=%d\n",
+				__func__, srq->pidx, len16);
+		writel(QID_V(srq->bar2_qid) | PIDX_T5_V(inc), srq->udb);
+	}
+	mmio_flush_writes();
+	return;
+}
+
 extern int ma_wr;
 extern int t5_en_wc;
 
@@ -552,6 +697,17 @@  static inline int t4_wq_in_error(struct t4_wq *wq)
 	return wq->error || wq->rq.queue[wq->rq.size].status.qp_err;
 }
 
+static inline u32 t4_wq_srqidx(struct t4_wq *wq)
+{
+	u32 srqidx;
+
+	if (!wq->srqidxp)
+		return 0;
+	srqidx = *wq->srqidxp;
+	wq->srqidxp = 0;
+	return srqidx;
+}
+
 static inline void t4_set_wq_in_error(struct t4_wq *wq)
 {
 	wq->rq.queue[wq->rq.size].status.qp_err = 1;
diff --git a/providers/cxgb4/t4_regs.h b/providers/cxgb4/t4_regs.h
index 9fea255c..c0627378 100644
--- a/providers/cxgb4/t4_regs.h
+++ b/providers/cxgb4/t4_regs.h
@@ -1437,6 +1437,10 @@ 
 #define TP_MIB_DATA_A	0x7e54
 #define TP_INT_CAUSE_A	0x7e74
 
+#define SRQTABLEPERR_S    1
+#define SRQTABLEPERR_V(x) ((x) << SRQTABLEPERR_S)
+#define SRQTABLEPERR_F    SRQTABLEPERR_V(1U)
+
 #define FLMTXFLSTEMPTY_S    30
 #define FLMTXFLSTEMPTY_V(x) ((x) << FLMTXFLSTEMPTY_S)
 #define FLMTXFLSTEMPTY_F    FLMTXFLSTEMPTY_V(1U)
diff --git a/providers/cxgb4/t4fw_api.h b/providers/cxgb4/t4fw_api.h
index 49bbca18..fabb16c7 100644
--- a/providers/cxgb4/t4fw_api.h
+++ b/providers/cxgb4/t4fw_api.h
@@ -1152,6 +1152,8 @@  enum fw_params_param_pfvf {
 	FW_PARAMS_PARAM_PFVF_SQRQ_END	= 0x16,
 	FW_PARAMS_PARAM_PFVF_CQ_START	= 0x17,
 	FW_PARAMS_PARAM_PFVF_CQ_END	= 0x18,
+	FW_PARAMS_PARAM_PFVF_SRQ_START  = 0x19,
+	FW_PARAMS_PARAM_PFVF_SRQ_END    = 0x1A,
 	FW_PARAMS_PARAM_PFVF_SCHEDCLASS_ETH = 0x20,
 	FW_PARAMS_PARAM_PFVF_VIID       = 0x24,
 	FW_PARAMS_PARAM_PFVF_CPMASK     = 0x25,
diff --git a/providers/cxgb4/t4fw_ri_api.h b/providers/cxgb4/t4fw_ri_api.h
index 1e266697..667e4096 100644
--- a/providers/cxgb4/t4fw_ri_api.h
+++ b/providers/cxgb4/t4fw_ri_api.h
@@ -263,6 +263,7 @@  enum fw_ri_res_type {
 	FW_RI_RES_TYPE_SQ,
 	FW_RI_RES_TYPE_RQ,
 	FW_RI_RES_TYPE_CQ,
+	FW_RI_RES_TYPE_SRQ,
 };
 
 enum fw_ri_res_op {
@@ -296,6 +297,21 @@  struct fw_ri_res {
 			__be32 r6_lo;
 			__be64 r7;
 		} cq;
+		struct fw_ri_res_srq {
+			__u8   restype;
+			__u8   op;
+			__be16 r3;
+			__be32 eqid;
+			__be32 r4[2];
+			__be32 fetchszm_to_iqid;
+			__be32 dcaen_to_eqsize;
+			__be64 eqaddr;
+			__be32 srqid;
+			__be32 pdid;
+			__be32 hwsrqsize;
+			__be32 hwsrqaddr;
+		} srq;
+
 	} u;
 };
 
@@ -695,6 +711,10 @@  enum fw_ri_init_p2ptype {
 	FW_RI_INIT_P2PTYPE_DISABLED		= 0xf,
 };
 
+enum fw_ri_init_rqeqid_srq {
+	FW_RI_INIT_RQEQID_SRQ                   = 1 << 31,
+};
+
 struct fw_ri_wr {
 	__be32 op_compl;
 	__be32 flowid_len16;
diff --git a/providers/cxgb4/verbs.c b/providers/cxgb4/verbs.c
index 3c493697..435bb238 100644
--- a/providers/cxgb4/verbs.c
+++ b/providers/cxgb4/verbs.c
@@ -168,6 +168,7 @@  int c4iw_dereg_mr(struct ibv_mr *mr)
 struct ibv_cq *c4iw_create_cq(struct ibv_context *context, int cqe,
 			      struct ibv_comp_channel *channel, int comp_vector)
 {
+	struct uc4iw_create_cq cmd;
 	struct uc4iw_create_cq_resp resp;
 	struct c4iw_cq *chp;
 	struct c4iw_dev *dev = to_c4iw_dev(context->device);
@@ -178,16 +179,22 @@  struct ibv_cq *c4iw_create_cq(struct ibv_context *context, int cqe,
 		return NULL;
 	}
 
-	resp.reserved = 0;
+	resp.flags = 0;
+	memset(&cmd, 0, sizeof cmd);
+	cmd.flags = C4IW_64B_CQE;
+
 	ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector,
-				&chp->ibv_cq, NULL, 0,
+				&chp->ibv_cq, &cmd.ibv_cmd, sizeof(cmd),
 				&resp.ibv_resp, sizeof resp);
 	if (ret)
 		goto err1;
 
-	if (resp.reserved)
-		PDBG("%s c4iw_create_cq_resp reserved field modified by kernel\n",
-		     __FUNCTION__);
+	if (!resp.flags) {
+		fprintf(stderr, "libcxgb4 FATAL ERROR: downlevel iw_cxgb4 "
+			"module. Cannot support RDMA with this driver/lib "
+			"combination. Update your drivers!\n");
+		return NULL;
+	}
 
 	pthread_spin_init(&chp->lock, PTHREAD_PROCESS_PRIVATE);
 #ifdef STALL_DETECTION
@@ -279,24 +286,139 @@  int c4iw_destroy_cq(struct ibv_cq *ibcq)
 struct ibv_srq *c4iw_create_srq(struct ibv_pd *pd,
 				struct ibv_srq_init_attr *attr)
 {
+	struct ibv_create_srq cmd;
+	struct uc4iw_create_srq_resp resp;
+	struct c4iw_srq *srq;
+	struct c4iw_dev *dev = to_c4iw_dev(pd->context->device);
+	int ret;
+	void *dbva;
+	unsigned long segment_offset;
+
+	PDBG("%s enter\n", __func__);
+	srq = calloc(1, sizeof *srq);
+	if (!srq)
+		goto err;
+
+	ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd,
+			sizeof cmd, &resp.ibv_resp, sizeof resp);
+	if (ret)
+		goto err_free_srq_mem;
+
+	PDBG("%s srq id 0x%x srq key %" PRIx64 " srq db/gts key %" PRIx64
+			" qid_mask 0x%x\n", __func__,
+			resp.srqid, resp.srq_key, resp.srq_db_gts_key,
+			resp.qid_mask);
+
+	srq->rhp = dev;
+	srq->wq.qid = resp.srqid;
+	srq->wq.size = resp.srq_size;
+	srq->wq.memsize = resp.srq_memsize;
+	srq->wq.rqt_abs_idx = resp.rqt_abs_idx;
+	srq->flags = resp.flags;
+	pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE);
+
+	dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED,
+			pd->context->cmd_fd, resp.srq_db_gts_key);
+	if (dbva == MAP_FAILED)
+		goto err_destroy_srq;
+	srq->wq.udb = dbva;
+
+	segment_offset = 128 * (srq->wq.qid & resp.qid_mask);
+	if (segment_offset < c4iw_page_size) {
+		srq->wq.udb += segment_offset / 4;
+		srq->wq.wc_reg_available = 1;
+	} else
+		srq->wq.bar2_qid = srq->wq.qid & resp.qid_mask;
+	srq->wq.udb += 2;
+
+	srq->wq.queue = mmap(NULL, srq->wq.memsize,
+			PROT_WRITE, MAP_SHARED,
+			pd->context->cmd_fd, resp.srq_key);
+	if (srq->wq.queue == MAP_FAILED)
+		goto err_unmap_udb;
+
+	srq->wq.sw_rq = calloc(srq->wq.size, sizeof (struct t4_swrqe));
+	if (!srq->wq.sw_rq)
+		goto err_unmap_queue;
+	srq->wq.pending_wrs = calloc(srq->wq.size, sizeof *srq->wq.pending_wrs);
+	if (!srq->wq.pending_wrs)
+		goto err_free_sw_rq;
+
+	pthread_spin_lock(&dev->lock);
+	list_add_tail(&dev->srq_list, &srq->list);
+	pthread_spin_unlock(&dev->lock);
+
+	PDBG("%s srq dbva %p srq qva %p srq depth %u srq memsize %lu\n",
+			__func__, srq->wq.udb, srq->wq.queue,
+			srq->wq.size, srq->wq.memsize);
+
+	INC_STAT(srq);
+	return &srq->ibv_srq;
+err_free_sw_rq:
+	free(srq->wq.sw_rq);
+err_unmap_queue:
+	munmap((void *)srq->wq.queue, srq->wq.memsize);
+err_unmap_udb:
+	munmap(MASKED(srq->wq.udb), c4iw_page_size);
+err_destroy_srq:
+	(void)ibv_cmd_destroy_srq(&srq->ibv_srq);
+err_free_srq_mem:
+	free(srq);
+err:
 	return NULL;
 }
 
-int c4iw_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr,
+int c4iw_modify_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr,
 		    int attr_mask)
 {
-	return ENOSYS;
+	struct c4iw_srq *srq = to_c4iw_srq(ibsrq);
+	struct ibv_modify_srq cmd;
+	int ret;
+
+	/* XXX no support for this yet */
+	if (attr_mask & IBV_SRQ_MAX_WR)
+		return ENOSYS;
+
+	ret = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, &cmd, sizeof cmd);
+	if (!ret) {
+		if (attr_mask & IBV_SRQ_LIMIT) {
+			srq->armed = 1;
+			srq->srq_limit = attr->srq_limit;
+		}
+	}
+	return ret;
 }
 
-int c4iw_destroy_srq(struct ibv_srq *srq)
+int c4iw_destroy_srq(struct ibv_srq *ibsrq)
 {
-	return ENOSYS;
+	int ret;
+	struct c4iw_srq *srq = to_c4iw_srq(ibsrq);
+
+	PDBG("%s enter qp %p\n", __func__, ibsrq);
+
+	ret = ibv_cmd_destroy_srq(ibsrq);
+	if (ret) {
+		return ret;
+	}
+
+	pthread_spin_lock(&srq->rhp->lock);
+	list_del(&srq->list);
+	pthread_spin_unlock(&srq->rhp->lock);
+
+	munmap(MASKED(srq->wq.udb), c4iw_page_size);
+	munmap(srq->wq.queue, srq->wq.memsize);
+
+	free(srq->wq.pending_wrs);
+	free(srq->wq.sw_rq);
+	free(srq);
+	return 0;
 }
 
-int c4iw_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr,
-		       struct ibv_recv_wr **bad_wr)
+int c4iw_query_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr)
 {
-	return ENOSYS;
+	struct ibv_query_srq cmd;
+
+	return ibv_cmd_query_srq(ibsrq, attr, &cmd, sizeof cmd);
 }
 
 static struct ibv_qp *create_qp_v0(struct ibv_pd *pd,
@@ -438,9 +560,12 @@  static struct ibv_qp *create_qp(struct ibv_pd *pd,
 	qhp->wq.sq.flags = resp.flags & C4IW_QPF_ONCHIP ? T4_SQ_ONCHIP : 0;
 	qhp->wq.sq.flush_cidx = -1;
 	qhp->wq.rq.msn = 1;
-	qhp->wq.rq.qid = resp.rqid;
-	qhp->wq.rq.size = resp.rq_size;
-	qhp->wq.rq.memsize = resp.rq_memsize;
+	qhp->srq = to_c4iw_srq(attr->srq);
+	if (!attr->srq) {
+		qhp->wq.rq.qid = resp.rqid;
+		qhp->wq.rq.size = resp.rq_size;
+		qhp->wq.rq.memsize = resp.rq_memsize;
+	}
 	if (ma_wr && resp.sq_memsize < (resp.sq_size + 1) *
 	    sizeof *qhp->wq.sq.queue + 16*sizeof(__be64) ) {
 		ma_wr = 0;
@@ -472,35 +597,39 @@  static struct ibv_qp *create_qp(struct ibv_pd *pd,
 	if (qhp->wq.sq.queue == MAP_FAILED)
 		goto err4;
 
-	dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED,
-		    pd->context->cmd_fd, resp.rq_db_gts_key);
-	if (dbva == MAP_FAILED)
-		goto err5;
-	qhp->wq.rq.udb = dbva;
-	if (!dev_is_t4(qhp->rhp)) {
-		unsigned long segment_offset = 128 * (qhp->wq.rq.qid &
-						      qhp->wq.qid_mask);
-
-		if (segment_offset < c4iw_page_size) {
-			qhp->wq.rq.udb += segment_offset / 4;
-			qhp->wq.rq.wc_reg_available = 1;
-		} else
-			qhp->wq.rq.bar2_qid = qhp->wq.rq.qid & qhp->wq.qid_mask;
-		qhp->wq.rq.udb += 2;
+	if (!attr->srq) {
+		dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED,
+				pd->context->cmd_fd, resp.rq_db_gts_key);
+		if (dbva == MAP_FAILED)
+			goto err5;
+		qhp->wq.rq.udb = dbva;
+		if (!dev_is_t4(qhp->rhp)) {
+			unsigned long segment_offset = 128 * (qhp->wq.rq.qid &
+					qhp->wq.qid_mask);
+
+			if (segment_offset < c4iw_page_size) {
+				qhp->wq.rq.udb += segment_offset / 4;
+				qhp->wq.rq.wc_reg_available = 1;
+			} else
+				qhp->wq.rq.bar2_qid = qhp->wq.rq.qid & qhp->wq.qid_mask;
+			qhp->wq.rq.udb += 2;
+		}
+		qhp->wq.rq.queue = mmap(NULL, qhp->wq.rq.memsize,
+				PROT_WRITE, MAP_SHARED,
+				pd->context->cmd_fd, resp.rq_key);
+		if (qhp->wq.rq.queue == MAP_FAILED)
+			goto err6;
 	}
-	qhp->wq.rq.queue = mmap(NULL, qhp->wq.rq.memsize,
-			    PROT_WRITE, MAP_SHARED,
-			    pd->context->cmd_fd, resp.rq_key);
-	if (qhp->wq.rq.queue == MAP_FAILED)
-		goto err6;
 
 	qhp->wq.sq.sw_sq = calloc(qhp->wq.sq.size, sizeof (struct t4_swsqe));
 	if (!qhp->wq.sq.sw_sq)
 		goto err7;
 
-	qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof (uint64_t));
-	if (!qhp->wq.rq.sw_rq)
-		goto err8;
+	if (!attr->srq) {
+		qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof (uint64_t));
+		if (!qhp->wq.rq.sw_rq)
+			goto err8;
+	}
 
 	if (t4_sq_onchip(&qhp->wq)) {
 		qhp->wq.sq.ma_sync = mmap(NULL, c4iw_page_size, PROT_WRITE,
@@ -513,11 +642,18 @@  static struct ibv_qp *create_qp(struct ibv_pd *pd,
 
 	if (ctx->status_page_size) {
 		qhp->wq.db_offp = &ctx->status_page->db_off;
-	} else {
+	} else if (!attr->srq) {
 		qhp->wq.db_offp = 
 			&qhp->wq.rq.queue[qhp->wq.rq.size].status.db_off;
 	}
 
+	if (!attr->srq)
+		qhp->wq.qp_errp = &qhp->wq.rq.queue[qhp->wq.rq.size].status.qp_err;
+	else {
+		qhp->wq.qp_errp = &qhp->wq.sq.queue[qhp->wq.sq.size].status.qp_err;
+		qhp->wq.srqidxp = &qhp->wq.sq.queue[qhp->wq.sq.size].status.srqidx;
+	}
+
 	PDBG("%s sq dbva %p sq qva %p sq depth %u sq memsize %lu "
 	       " rq dbva %p rq qva %p rq depth %u rq memsize %lu\n",
 	     __func__,
@@ -534,13 +670,16 @@  static struct ibv_qp *create_qp(struct ibv_pd *pd,
 	INC_STAT(qp);
 	return &qhp->ibv_qp;
 err9:
-	free(qhp->wq.rq.sw_rq);
+	if (!attr->srq)
+		free(qhp->wq.rq.sw_rq);
 err8:
 	free(qhp->wq.sq.sw_sq);
 err7:
-	munmap((void *)qhp->wq.rq.queue, qhp->wq.rq.memsize);
+	if (!attr->srq)
+		munmap((void *)qhp->wq.rq.queue, qhp->wq.rq.memsize);
 err6:
-	munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size);
+	if (!attr->srq)
+		munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size);
 err5:
 	munmap((void *)qhp->wq.sq.queue, qhp->wq.sq.memsize);
 err4:
@@ -614,15 +753,18 @@  int c4iw_destroy_qp(struct ibv_qp *ibqp)
 		munmap((void *)qhp->wq.sq.ma_sync, c4iw_page_size);
 	}
 	munmap(MASKED(qhp->wq.sq.udb), c4iw_page_size);
-	munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size);
 	munmap(qhp->wq.sq.queue, qhp->wq.sq.memsize);
-	munmap(qhp->wq.rq.queue, qhp->wq.rq.memsize);
+	if (!qhp->srq) {
+		munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size);
+		munmap(qhp->wq.rq.queue, qhp->wq.rq.memsize);
+	}
 
 	pthread_spin_lock(&dev->lock);
 	dev->qpid2ptr[qhp->wq.sq.qid] = NULL;
 	pthread_spin_unlock(&dev->lock);
 
-	free(qhp->wq.rq.sw_rq);
+	if (!qhp->srq)
+		free(qhp->wq.rq.sw_rq);
 	free(qhp->wq.sq.sw_sq);
 	free(qhp);
 	return 0;