@@ -57,6 +57,8 @@
#define PVRDMA_UAR_CQ_ARM_SOL BIT(29) /* Arm solicited bit. */
#define PVRDMA_UAR_CQ_ARM BIT(30) /* Arm bit. */
#define PVRDMA_UAR_CQ_POLL BIT(31) /* Poll bit. */
+#define PVRDMA_UAR_SRQ_OFFSET 8 /* SRQ doorbell. */
+#define PVRDMA_UAR_SRQ_RECV BIT(30) /* Recv bit. */
enum pvrdma_wr_opcode {
PVRDMA_WR_RDMA_WRITE,
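
For context, a receive doorbell is the recv bit OR'd with the SRQ number, stored as a 32-bit little-endian word at byte offset 8 of the UAR page. A minimal sketch, assuming the defines above are in scope (it mirrors the pvrdma_write_uar_srq() helper added further down in this patch):

#include <stdint.h>
#include <endian.h>

/* Sketch: ring the receive doorbell for the SRQ numbered srqn. */
static inline void ring_srq_doorbell(void *uar, uint32_t srqn)
{
	uint32_t val = PVRDMA_UAR_SRQ_RECV | srqn;	/* recv bit | SRQ number */

	/* Little-endian 32-bit store at the SRQ slot of the UAR page. */
	*(volatile uint32_t *)((char *)uar + PVRDMA_UAR_SRQ_OFFSET) =
		htole32(val);
}
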
@@ -157,6 +159,8 @@ struct pvrdma_resize_cq {
struct pvrdma_create_srq {
__u64 buf_addr;
+ __u32 buf_size; /* Size of the SRQ ring buffer. */
+ __u32 reserved; /* Pad the ABI struct to an 8-byte multiple. */
};
struct pvrdma_create_srq_resp {
@@ -69,6 +69,16 @@ struct user_pvrdma_create_cq_resp {
struct pvrdma_create_cq_resp udata;
};
+struct user_pvrdma_create_srq {
+ struct ibv_create_srq ibv_cmd;
+ struct pvrdma_create_srq udata;
+};
+
+struct user_pvrdma_create_srq_resp {
+ struct ibv_create_srq_resp ibv_resp;
+ struct pvrdma_create_srq_resp udata;
+};
+
struct user_pvrdma_create_qp {
struct ibv_create_qp ibv_cmd;
struct pvrdma_create_qp udata;
@@ -135,6 +135,21 @@ struct pvrdma_cq {
uint32_t cqn;
};
+struct pvrdma_srq {
+ struct ibv_srq ibv_srq;
+ struct pvrdma_buf buf;
+ pthread_spinlock_t lock;
+ uint64_t *wrid;
+ uint32_t srqn;
+ int wqe_cnt;
+ int wqe_size;
+ int max_gs;
+ int wqe_shift;
+ struct pvrdma_ring_state *ring_state;
+ uint16_t counter;
+ int offset;
+};
+
struct pvrdma_wq {
uint64_t *wrid;
pthread_spinlock_t lock;
@@ -156,6 +171,7 @@ struct pvrdma_qp {
int sq_spare_wqes;
struct pvrdma_wq sq;
struct pvrdma_wq rq;
+ int is_srq;
};
struct pvrdma_ah {
@@ -198,6 +214,11 @@ static inline struct pvrdma_cq *to_vcq(struct ibv_cq *ibcq)
return container_of(ibcq, struct pvrdma_cq, ibv_cq);
}
+static inline struct pvrdma_srq *to_vsrq(struct ibv_srq *ibsrq)
+{
+ return container_of(ibsrq, struct pvrdma_srq, ibv_srq);
+}
+
static inline struct pvrdma_qp *to_vqp(struct ibv_qp *ibqp)
{
return container_of(ibqp, struct pvrdma_qp, ibv_qp);
@@ -218,6 +239,11 @@ static inline void pvrdma_write_uar_cq(void *uar, unsigned value)
*(__le32 *)(uar + PVRDMA_UAR_CQ_OFFSET) = htole32(value);
}
+static inline void pvrdma_write_uar_srq(void *uar, unsigned int value)
+{
+ *(__le32 *)(uar + PVRDMA_UAR_SRQ_OFFSET) = htole32(value);
+}
+
static inline int ibv_send_flags_to_pvrdma(int flags)
{
return flags;
@@ -301,6 +327,21 @@ int pvrdma_store_qp(struct pvrdma_context *ctx, uint32_t qpn,
struct pvrdma_qp *qp);
void pvrdma_clear_qp(struct pvrdma_context *ctx, uint32_t qpn);
+struct ibv_srq *pvrdma_create_srq(struct ibv_pd *pd,
+ struct ibv_srq_init_attr *attr);
+int pvrdma_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr,
+ int attr_mask);
+int pvrdma_query_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr);
+int pvrdma_destroy_srq(struct ibv_srq *srq);
+int pvrdma_alloc_srq_buf(struct pvrdma_device *dev,
+ struct ibv_srq_attr *attr,
+ struct pvrdma_srq *srq);
+int pvrdma_post_srq_recv(struct ibv_srq *ibsrq,
+ struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr);
+void pvrdma_init_srq_queue(struct pvrdma_srq *srq);
+
struct ibv_ah *pvrdma_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
int pvrdma_destroy_ah(struct ibv_ah *ah);
@@ -69,6 +69,12 @@ static struct ibv_context_ops pvrdma_ctx_ops = {
.modify_qp = pvrdma_modify_qp,
.destroy_qp = pvrdma_destroy_qp,
+ .create_srq = pvrdma_create_srq,
+ .modify_srq = pvrdma_modify_srq,
+ .query_srq = pvrdma_query_srq,
+ .destroy_srq = pvrdma_destroy_srq,
+ .post_srq_recv = pvrdma_post_srq_recv,
+
.post_send = pvrdma_post_send,
.post_recv = pvrdma_post_recv,
.create_ah = pvrdma_create_ah,
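
With the SRQ verbs wired into the ops table, applications reach this code through the standard libibverbs entry points. A hypothetical usage sketch (pd, cq, mr, and buf are assumed to be set up elsewhere; error handling trimmed to the minimum):

#include <stdint.h>
#include <infiniband/verbs.h>

/* Hypothetical example: create an SRQ, attach an RC QP, post one buffer. */
static int srq_example(struct ibv_pd *pd, struct ibv_cq *cq,
		       struct ibv_mr *mr, void *buf, uint32_t len)
{
	struct ibv_srq_init_attr srq_attr = {
		.attr = { .max_wr = 64, .max_sge = 1 },
	};
	struct ibv_srq *srq = ibv_create_srq(pd, &srq_attr);
	if (!srq)
		return -1;

	struct ibv_qp_init_attr qp_attr = {
		.send_cq = cq,
		.recv_cq = cq,
		.srq	 = srq,	/* takes the is_srq path in pvrdma_create_qp() */
		.qp_type = IBV_QPT_RC,
		.cap	 = { .max_send_wr = 64, .max_send_sge = 1 },
	};
	struct ibv_qp *qp = ibv_create_qp(pd, &qp_attr);
	if (!qp)
		return -1;

	struct ibv_sge sge = {
		.addr = (uintptr_t)buf, .length = len, .lkey = mr->lkey,
	};
	struct ibv_recv_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1 };
	struct ibv_recv_wr *bad_wr;

	/* Lands in pvrdma_post_srq_recv() and rings the SRQ doorbell. */
	return ibv_post_srq_recv(srq, &wr, &bad_wr);
}
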
@@ -47,7 +47,6 @@
#ifndef __PVRDMA_RING_H__
#define __PVRDMA_RING_H__
-#include <stdint.h>
#include <linux/types.h>
#define PVRDMA_INVALID_IDX -1 /* Invalid index. */
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012-2016 VMware, Inc. All rights reserved.
+ * Copyright (c) 2012-2017 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of EITHER the GNU General Public License
@@ -54,37 +54,145 @@ int pvrdma_alloc_qp_buf(struct pvrdma_device *dev, struct ibv_qp_cap *cap,
if (!qp->sq.wrid)
return -1;
- qp->rq.wrid = calloc(qp->rq.wqe_cnt, sizeof(uint64_t));
- if (!qp->rq.wrid) {
- free(qp->sq.wrid);
- return -1;
- }
-
- /* Align page size for [rq][sq] */
- qp->rbuf.length = align(qp->rq.offset +
- qp->rq.wqe_cnt * qp->rq.wqe_size,
- dev->page_size);
+ /* Align the sq buffer length to the page size. */
qp->sbuf.length = align(qp->sq.offset +
qp->sq.wqe_cnt * qp->sq.wqe_size,
dev->page_size);
- qp->buf_size = qp->rbuf.length + qp->sbuf.length;
-
- if (pvrdma_alloc_buf(&qp->rbuf, qp->rbuf.length, dev->page_size)) {
- free(qp->sq.wrid);
- free(qp->rq.wrid);
- return -1;
- }
if (pvrdma_alloc_buf(&qp->sbuf, qp->sbuf.length, dev->page_size)) {
free(qp->sq.wrid);
- free(qp->rq.wrid);
- pvrdma_free_buf(&qp->rbuf);
return -1;
}
- memset(qp->rbuf.buf, 0, qp->rbuf.length);
memset(qp->sbuf.buf, 0, qp->sbuf.length);
+ if (!qp->is_srq) {
+ qp->rq.wrid = calloc(qp->rq.wqe_cnt, sizeof(uint64_t));
+ if (!qp->rq.wrid) {
+ pvrdma_free_buf(&qp->sbuf);
+ free(qp->sq.wrid);
+ return -1;
+ }
+
+ /* Align the rq buffer length to the page size. */
+ qp->rbuf.length = align(qp->rq.offset +
+ qp->rq.wqe_cnt * qp->rq.wqe_size,
+ dev->page_size);
+
+ if (pvrdma_alloc_buf(&qp->rbuf, qp->rbuf.length,
+ dev->page_size)) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ pvrdma_free_buf(&qp->sbuf);
+ return -1;
+ }
+ memset(qp->rbuf.buf, 0, qp->rbuf.length);
+ } else {
+ qp->rbuf.buf = NULL;
+ qp->rbuf.length = 0;
+ }
+
+ qp->buf_size = qp->rbuf.length + qp->sbuf.length;
+
+ return 0;
+}
+
+void pvrdma_init_srq_queue(struct pvrdma_srq *srq)
+{
+ srq->ring_state->rx.cons_head = 0;
+ srq->ring_state->rx.prod_tail = 0;
+}
+
+struct ibv_srq *pvrdma_create_srq(struct ibv_pd *pd,
+ struct ibv_srq_init_attr *attr)
+{
+ struct pvrdma_device *dev = to_vdev(pd->context->device);
+ struct user_pvrdma_create_srq cmd;
+ struct ibv_create_srq_resp resp;
+ struct pvrdma_srq *srq;
+ int ret;
+
+ attr->attr.max_wr = align_next_power2(max_t(uint32_t, 1U,
+ attr->attr.max_wr));
+ attr->attr.max_sge = align_next_power2(max_t(uint32_t, 1U,
+ attr->attr.max_sge));
+
+ srq = malloc(sizeof(*srq));
+ if (!srq)
+ return NULL;
+
+ if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+ goto err;
+
+ srq->wqe_cnt = attr->attr.max_wr;
+ srq->max_gs = attr->attr.max_sge;
+ srq->wqe_size = align_next_power2(sizeof(struct pvrdma_rq_wqe_hdr) +
+ sizeof(struct ibv_sge) *
+ srq->max_gs);
+ /* The first page is reserved for the shared ring state. */
+ srq->offset = dev->page_size;
+
+ if (pvrdma_alloc_srq_buf(dev, &attr->attr, srq))
+ goto err_spinlock;
+
+ srq->ring_state = srq->buf.buf;
+ pvrdma_init_srq_queue(srq);
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.udata.buf_addr = (uintptr_t) srq->buf.buf;
+ cmd.udata.buf_size = srq->buf.length;
+
+ ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr,
+ &cmd.ibv_cmd, sizeof(cmd),
+ &resp, sizeof(resp));
+
+ if (ret)
+ goto err_free;
+
+ srq->srqn = resp.srqn;
+
+ return &srq->ibv_srq;
+
+err_free:
+ free(srq->wrid);
+ pvrdma_free_buf(&srq->buf);
+err_spinlock:
+ pthread_spin_destroy(&srq->lock);
+err:
+ free(srq);
+
+ return NULL;
+}
+
+int pvrdma_modify_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr,
+ int attr_mask)
+{
+ struct ibv_modify_srq cmd;
+
+ return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof(cmd));
+}
+
+int pvrdma_query_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr)
+{
+ struct ibv_query_srq cmd;
+
+ return ibv_cmd_query_srq(srq, attr, &cmd, sizeof(cmd));
+}
+
+int pvrdma_destroy_srq(struct ibv_srq *ibsrq)
+{
+ struct pvrdma_srq *srq = to_vsrq(ibsrq);
+ int ret;
+
+ ret = ibv_cmd_destroy_srq(ibsrq);
+ if (ret)
+ return ret;
+
+ pthread_spin_destroy(&srq->lock);
+ pvrdma_free_buf(&srq->buf);
+ free(srq->wrid);
+ free(srq);
+
return 0;
}
@@ -92,8 +200,10 @@ static void pvrdma_init_qp_queue(struct pvrdma_qp *qp)
{
qp->sq.ring_state->cons_head = 0;
qp->sq.ring_state->prod_tail = 0;
- qp->rq.ring_state->cons_head = 0;
- qp->rq.ring_state->prod_tail = 0;
+ if (qp->rq.ring_state) {
+ qp->rq.ring_state->cons_head = 0;
+ qp->rq.ring_state->prod_tail = 0;
+ }
}
struct ibv_qp *pvrdma_create_qp(struct ibv_pd *pd,
@@ -104,26 +214,28 @@ struct ibv_qp *pvrdma_create_qp(struct ibv_pd *pd,
struct ibv_create_qp_resp resp;
struct pvrdma_qp *qp;
int ret;
+ int is_srq = !!(attr->srq);
- attr->cap.max_recv_sge =
- align_next_power2(max_t(uint32_t, 1U, attr->cap.max_recv_sge));
- attr->cap.max_recv_wr =
- align_next_power2(max_t(uint32_t, 1U, attr->cap.max_recv_wr));
attr->cap.max_send_sge =
align_next_power2(max_t(uint32_t, 1U, attr->cap.max_send_sge));
attr->cap.max_send_wr =
align_next_power2(max_t(uint32_t, 1U, attr->cap.max_send_wr));
+ if (!is_srq) {
+ attr->cap.max_recv_sge =
+ align_next_power2(max_t(uint32_t, 1U, attr->cap.max_recv_sge));
+ attr->cap.max_recv_wr =
+ align_next_power2(max_t(uint32_t, 1U, attr->cap.max_recv_wr));
+ } else {
+ attr->cap.max_recv_sge = 0;
+ attr->cap.max_recv_wr = 0;
+ }
+
qp = calloc(1, sizeof(*qp));
if (!qp)
return NULL;
- qp->rq.max_gs = attr->cap.max_recv_sge;
- qp->rq.wqe_cnt = attr->cap.max_recv_wr;
- qp->rq.offset = 0;
- qp->rq.wqe_size = align_next_power2(sizeof(struct pvrdma_rq_wqe_hdr) +
- sizeof(struct ibv_sge) *
- qp->rq.max_gs);
+ qp->is_srq = is_srq;
qp->sq.max_gs = attr->cap.max_send_sge;
qp->sq.wqe_cnt = attr->cap.max_send_wr;
@@ -133,10 +245,18 @@ struct ibv_qp *pvrdma_create_qp(struct ibv_pd *pd,
sizeof(struct ibv_sge) *
qp->sq.max_gs);
- /* Reset attr.cap, no srq for now */
- if (attr->srq) {
- attr->cap.max_recv_wr = 0;
+ if (!is_srq) {
+ qp->rq.max_gs = attr->cap.max_recv_sge;
+ qp->rq.wqe_cnt = attr->cap.max_recv_wr;
+ qp->rq.offset = 0;
+ qp->rq.wqe_size =
+ align_next_power2(sizeof(struct pvrdma_rq_wqe_hdr) +
+ sizeof(struct ibv_sge) * qp->rq.max_gs);
+ } else {
+ qp->rq.max_gs = 0;
qp->rq.wqe_cnt = 0;
+ qp->rq.offset = 0;
+ qp->rq.wqe_size = 0;
}
/* Allocate [rq][sq] memory */
@@ -144,18 +264,24 @@ struct ibv_qp *pvrdma_create_qp(struct ibv_pd *pd,
goto err;
qp->sq.ring_state = qp->sbuf.buf;
- qp->rq.ring_state = (struct pvrdma_ring *)&qp->sq.ring_state[1];
- pvrdma_init_qp_queue(qp);
-
- if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) ||
- pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
+ if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE))
goto err_free;
+ if (!is_srq) {
+ qp->rq.ring_state = (struct pvrdma_ring *)&qp->sq.ring_state[1];
+ if (pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
+ goto err_free;
+ } else {
+ qp->rq.ring_state = NULL;
+ }
+
+ pvrdma_init_qp_queue(qp);
+
memset(&cmd, 0, sizeof(cmd));
- cmd.udata.rbuf_addr = (uintptr_t)qp->rbuf.buf;
- cmd.udata.rbuf_size = qp->rbuf.length;
cmd.udata.sbuf_addr = (uintptr_t)qp->sbuf.buf;
cmd.udata.sbuf_size = qp->sbuf.length;
+ cmd.udata.rbuf_addr = (uintptr_t)qp->rbuf.buf;
+ cmd.udata.rbuf_size = qp->rbuf.length;
cmd.udata.qp_addr = (uintptr_t) qp;
ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr,
@@ -240,9 +366,9 @@ static void pvrdma_lock_cqs(struct ibv_qp *qp)
struct pvrdma_cq *send_cq = to_vcq(qp->send_cq);
struct pvrdma_cq *recv_cq = to_vcq(qp->recv_cq);
- if (send_cq == recv_cq)
+ if (send_cq == recv_cq) {
pthread_spin_lock(&send_cq->lock);
- else if (send_cq->cqn < recv_cq->cqn) {
+ } else if (send_cq->cqn < recv_cq->cqn) {
pthread_spin_lock(&send_cq->lock);
pthread_spin_lock(&recv_cq->lock);
} else {
@@ -256,9 +382,9 @@ static void pvrdma_unlock_cqs(struct ibv_qp *qp)
struct pvrdma_cq *send_cq = to_vcq(qp->send_cq);
struct pvrdma_cq *recv_cq = to_vcq(qp->recv_cq);
- if (send_cq == recv_cq)
+ if (send_cq == recv_cq) {
pthread_spin_unlock(&send_cq->lock);
- else if (send_cq->cqn < recv_cq->cqn) {
+ } else if (send_cq->cqn < recv_cq->cqn) {
pthread_spin_unlock(&recv_cq->lock);
pthread_spin_unlock(&send_cq->lock);
} else {
@@ -296,6 +422,11 @@ int pvrdma_destroy_qp(struct ibv_qp *ibqp)
return 0;
}
+static void *get_srq_wqe(struct pvrdma_srq *srq, int n)
+{
+ return srq->buf.buf + srq->offset + (n * srq->wqe_size);
+}
+
static void *get_rq_wqe(struct pvrdma_qp *qp, int n)
{
return qp->rbuf.buf + qp->rq.offset + (n * qp->rq.wqe_size);
@@ -438,6 +569,9 @@ int pvrdma_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
int i;
int ret = 0;
+ if (qp->is_srq)
+ return EINVAL;
+
if (!wr || !bad_wr)
return EINVAL;
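
This guard makes the split explicit: once a QP is created with attr->srq set, its receive side lives entirely on the shared queue, so ibv_post_recv() on that QP now fails. A hypothetical routing helper an application might use:

#include <infiniband/verbs.h>

/* Hypothetical helper: route a receive to the right queue. With an SRQ
 * attached, pvrdma_post_recv() above rejects the QP path with EINVAL. */
static int post_rx(struct ibv_qp *qp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	if (qp->srq)	/* QP was created with attr->srq set */
		return ibv_post_srq_recv(qp->srq, wr, bad_wr);
	return ibv_post_recv(qp, wr, bad_wr);
}
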
@@ -503,3 +637,94 @@ out:
pthread_spin_unlock(&qp->rq.lock);
return ret;
}
+
+int pvrdma_post_srq_recv(struct ibv_srq *ibsrq,
+ struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr)
+{
+ struct pvrdma_context *ctx = to_vctx(ibsrq->context);
+ struct pvrdma_srq *srq = to_vsrq(ibsrq);
+ struct pvrdma_rq_wqe_hdr *wqe_hdr;
+ struct ibv_sge *sge;
+ int nreq;
+ int ind;
+ int i;
+ int ret = 0;
+
+ if (!wr || !bad_wr)
+ return EINVAL;
+
+ pthread_spin_lock(&srq->lock);
+
+ ind = pvrdma_idx(&(srq->ring_state->rx.prod_tail), srq->wqe_cnt);
+ if (ind < 0) {
+ pthread_spin_unlock(&srq->lock);
+ *bad_wr = wr;
+ return EINVAL;
+ }
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ unsigned int tail;
+
+ if (pvrdma_idx_ring_has_space(&srq->ring_state->rx,
+ srq->wqe_cnt, &tail) <= 0) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (wr->num_sge > srq->max_gs) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ /* Fill the WQE header and scatter list at the current slot. */
+ wqe_hdr = (struct pvrdma_rq_wqe_hdr *)get_srq_wqe(srq, ind);
+ wqe_hdr->wr_id = wr->wr_id;
+ wqe_hdr->num_sge = wr->num_sge;
+
+ sge = (struct ibv_sge *)(wqe_hdr + 1);
+ for (i = 0; i < wr->num_sge; ++i) {
+ sge->addr = (uint64_t)wr->sg_list[i].addr;
+ sge->length = wr->sg_list[i].length;
+ sge->lkey = wr->sg_list[i].lkey;
+ sge++;
+ }
+
+ pvrdma_idx_ring_inc(&srq->ring_state->rx.prod_tail,
+ srq->wqe_cnt);
+
+ srq->wrid[ind] = wr->wr_id;
+ ind = (ind + 1) & (srq->wqe_cnt - 1);
+ }
+
+ if (nreq)
+ pvrdma_write_uar_srq(ctx->uar,
+ PVRDMA_UAR_SRQ_RECV | srq->srqn);
+
+ pthread_spin_unlock(&srq->lock);
+
+ return ret;
+}
+
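
The index handling above relies on wqe_cnt being a power of two, which align_next_power2() in pvrdma_create_srq() guarantees. A sketch of the convention, assuming pvrdma's scheme of letting both counters run over [0, 2 * wqe_cnt) so a full ring stays distinguishable from an empty one (the authoritative versions are pvrdma_idx() and pvrdma_idx_ring_has_space() in pvrdma_ring.h):

#include <stdint.h>

/* Sketch only; wqe_cnt must be a power of two. */
static inline uint32_t ring_slot(uint32_t counter, uint32_t wqe_cnt)
{
	return counter & (wqe_cnt - 1);		/* counter -> WQE slot */
}

static inline uint32_t ring_used(uint32_t prod_tail, uint32_t cons_head,
				 uint32_t wqe_cnt)
{
	/* Entries posted but not yet consumed. */
	return (prod_tail - cons_head) & ((wqe_cnt << 1) - 1);
}

static inline int ring_has_space(uint32_t prod_tail, uint32_t cons_head,
				 uint32_t wqe_cnt)
{
	return ring_used(prod_tail, cons_head, wqe_cnt) < wqe_cnt;
}
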
+int pvrdma_alloc_srq_buf(struct pvrdma_device *dev,
+ struct ibv_srq_attr *attr,
+ struct pvrdma_srq *srq)
+{
+ srq->wrid = calloc(srq->wqe_cnt, sizeof(uint64_t));
+ if (!srq->wrid)
+ return -1;
+
+ srq->buf.length = align(srq->offset, dev->page_size);
+ srq->buf.length += 2 * align(srq->wqe_cnt * srq->wqe_size,
+ dev->page_size);
+
+ if (pvrdma_alloc_buf(&srq->buf, srq->buf.length, dev->page_size)) {
+ free(srq->wrid);
+ return -1;
+ }
+
+ memset(srq->buf.buf, 0, srq->buf.length);
+
+ return 0;
+}
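
For reference, the buffer layout this produces, combining the offset chosen in pvrdma_create_srq() with the arithmetic in get_srq_wqe() (a sketch; exact sizes depend on the device page size and the rounded wqe_size):

/*
 * SRQ buffer layout (sketch):
 *
 *   buf.buf + 0                        shared ring state (first page,
 *                                      mapped as srq->ring_state)
 *   buf.buf + offset                   WQE 0
 *   buf.buf + offset + n * wqe_size    WQE n, for n < wqe_cnt
 *
 * offset == dev->page_size, and wqe_size is rounded up to a power of two
 * large enough for a pvrdma_rq_wqe_hdr plus max_gs ibv_sge entries.
 */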