@@ -6,6 +6,56 @@
#include "rv.h"
#include "trace.h"
+/*
+ * Select the next sconn to post to, claiming a WQE by incrementing
+ * outstand_send_write.  If all sconn SQs are full, NULL is returned
+ * and conn->next ends up back where it started.
+ */
+static struct rv_sconn *rv_conn_next_sconn_to_post(struct rv_conn *conn)
+{
+ unsigned long flags;
+ struct rv_sconn *sconn;
+ u8 i;
+ u32 qp_depth = conn->jdev->qp_depth;
+
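+	/*
+	 * The WQE claim taken by the atomic_inc below is released either by
+	 * rv_rdma_write_done() when the send CQE arrives, or by the error
+	 * paths in doit_post_rdma_write() if the post never happens.
+	 */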
+ spin_lock_irqsave(&conn->next_lock, flags);
+ for (i = 0; i < conn->num_conn; i++) {
+ sconn = &conn->sconn_arr[conn->next];
+ conn->next = (conn->next + 1) % conn->num_conn;
+ if (atomic_read(&sconn->stats.outstand_send_write) < qp_depth) {
+ atomic_inc(&sconn->stats.outstand_send_write);
+ goto unlock;
+ }
+ }
+ sconn = NULL;
+unlock:
+ spin_unlock_irqrestore(&conn->next_lock, flags);
+ return sconn;
+}
+
+static int rv_drv_post_write_immed(struct rv_pend_write *pend_wr)
+{
+ struct ib_rdma_wr wr;
+ const struct ib_send_wr *bad_wr;
+ struct ib_sge list;
+ struct rv_mr_cached *mrc = pend_wr->mrc;
+
+	/* translate the user-space loc_addr to an IOVA appropriate for the MR */
+ list.addr = mrc->mr.ib_mr->iova + (pend_wr->loc_addr - mrc->addr);
+ list.length = pend_wr->length;
+ list.lkey = mrc->mr.ib_mr->lkey;
+
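+	/* the WR can live on the stack: ib_post_send() copies it before returning */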
+ wr.wr.next = NULL;
+ wr.wr.wr_cqe = &pend_wr->cqe;
+ wr.wr.sg_list = &list;
+ wr.wr.num_sge = 1;
+ wr.wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM;
+ wr.wr.send_flags = IB_SEND_SIGNALED;
+ wr.wr.ex.imm_data = cpu_to_be32(pend_wr->immed);
+ wr.remote_addr = pend_wr->rem_addr;
+ wr.rkey = pend_wr->rkey;
+ return ib_post_send(pend_wr->sconn->qp, &wr.wr, &bad_wr);
+}
+
/*
* This is called in Soft IRQs for CQE handling.
* We just report errors here, let the QP Async Event deal with
@@ -22,6 +72,246 @@ void rv_report_cqe_error(struct ib_cq *cq, struct ib_wc *wc,
wc->wr_cqe);
}
+static void rv_user_ring_post_event(struct rv_user_ring *ring,
+ struct rv_event *ev)
+{
+ unsigned long flags;
+ struct rv_ring_header *hdr = ring->hdr;
+ int next;
+
+ trace_rv_user_ring_post_event(ring->rv_inx, ring->num_entries,
+ ring->hdr->head, ring->hdr->tail);
+ trace_rv_event_post(ev->event_type, ev->wc.status, ev->wc.imm_data,
+ ev->wc.wr_id, ev->wc.conn_handle,
+ ev->wc.byte_len);
+ spin_lock_irqsave(&ring->lock, flags);
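+	/* ring is full when advancing tail would land on head; one slot stays unused */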
+ next = hdr->tail + 1;
+ if (next == ring->num_entries)
+ next = 0;
+ if (next == hdr->head) {
+ hdr->overflow_cnt++;
+ rv_err(ring->rv_inx, "event ring full: head %u tail %u\n",
+ hdr->head, hdr->tail);
+ goto unlock;
+ }
+
+	smp_mb(); /* order the read of hdr->head above before writing the entry */
+ hdr->entries[hdr->tail] = *ev;
+ smp_wmb(); /* ensure ev written before advance tail */
+
+ hdr->tail = next;
+ if (ev->wc.status) {
+ ring->stats.cqe_fail[ev->event_type]++;
+ } else {
+ ring->stats.cqe[ev->event_type]++;
+ ring->stats.bytes[ev->event_type] += ev->wc.byte_len;
+ }
+unlock:
+ spin_unlock_irqrestore(&ring->lock, flags);
+}
+
+static void rv_post_user_event_by_index(struct rv_job_dev *jdev, u16 index,
+ struct rv_event *ev)
+{
+ unsigned long flags;
+ struct rv_user *rv;
+
+ spin_lock_irqsave(&jdev->user_array_lock, flags);
+ if (index >= jdev->max_users)
+ goto unlock;
+ rv = jdev->user_array[index];
+ if (rv && rv->cqr)
+ rv_user_ring_post_event(rv->cqr, ev);
+unlock:
+ spin_unlock_irqrestore(&jdev->user_array_lock, flags);
+}
+
+/*
+ * We hold an rv_conn reference for the pend_wr.
+ * Pass all failures to PSM to deal with; we can't retry the write in rv
+ * since it might have succeeded on the remote end (e.g. only the ack was
+ * lost), and the remote end may already be using the buffer for
+ * something else.
+ */
+static void rv_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rv_pend_write *pend_wr = container_of(wc->wr_cqe,
+ struct rv_pend_write, cqe);
+ struct rv_sconn *sconn = pend_wr->sconn;
+ struct rv_event ev = { 0 };
+
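+	/* release the WQE claim taken in rv_conn_next_sconn_to_post() */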
+ atomic_dec(&sconn->stats.outstand_send_write);
+ trace_rv_wc_write_done(pend_wr->wr_id, wc->status, wc->opcode,
+ wc->byte_len, be32_to_cpu(wc->ex.imm_data));
+ trace_rv_pend_write_done(pend_wr->user_index, pend_wr->sconn, pend_wr,
+ pend_wr->loc_addr, pend_wr->rkey,
+ pend_wr->rem_addr, pend_wr->length,
+ pend_wr->immed, pend_wr->wr_id);
+
+ if (unlikely(wc->status != IB_WC_SUCCESS))
+ rv_report_cqe_error(cq, wc, pend_wr->sconn, "RDMA Write");
+ else if (wc->qp != sconn->qp)
+ rv_report_cqe_error(cq, wc, pend_wr->sconn, "Stale RDMA Write");
+
+ ev.event_type = RV_WC_RDMA_WRITE;
+ ev.wc.status = wc->status;
+ ev.wc.wr_id = pend_wr->wr_id;
+ ev.wc.conn_handle = (u64)pend_wr->sconn->parent;
+ ev.wc.byte_len = pend_wr->length;
+ trace_rv_event_write_done(ev.event_type, ev.wc.status, ev.wc.imm_data,
+ ev.wc.wr_id, ev.wc.conn_handle,
+ ev.wc.byte_len);
+
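+	/* drop the MR cache reference taken in doit_post_rdma_write() */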
+ rv_mr_cache_put(&pend_wr->umrs->cache, pend_wr->mrc);
+
+ rv_post_user_event_by_index(pend_wr->sconn->parent->jdev,
+ pend_wr->user_index, &ev);
+
+ if (wc->status)
+ atomic64_inc(&sconn->stats.send_write_cqe_fail);
+ else
+ atomic64_inc(&sconn->stats.send_write_cqe);
+
+ /* our rv_conn ref prevents user_mrs_put from triggering job cleanup */
+ rv_user_mrs_put(pend_wr->umrs);
+
+ /* rv_conn_put can put rv_job_dev and trigger whole job cleanup */
+ rv_conn_put(sconn->parent);
+
+ kfree(pend_wr);
+}
+
+/*
+ * We do not need a queue of unposted writes inside rv; if this post
+ * fails, PSM will try to repost later.
+ * We use loc_addr/length/access to look up the MR in the cache and then
+ * verify the RDMA range is consistent with loc_addr and length.
+ */
+int doit_post_rdma_write(struct rv_user *rv, unsigned long arg)
+{
+ struct rv_post_write_params pparams;
+ struct rv_conn *conn;
+ struct rv_sconn *sconn;
+ struct rv_mr_cached *mrc;
+ struct rv_pend_write *pend_wr;
+ int ret;
+
+ if (copy_from_user(&pparams.in, (void __user *)arg,
+ sizeof(pparams.in)))
+ return -EFAULT;
+
+ mutex_lock(&rv->mutex);
+
+ conn = user_conn_find(rv, pparams.in.handle);
+ if (!conn) {
+ rv_err(rv->inx, "post_write: No connection found\n");
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ sconn = rv_conn_next_sconn_to_post(conn);
+ if (unlikely(!sconn)) {
+ ret = -ENOMEM;
+ goto bail_unlock;
+ }
+
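+	/* look up the cached MR covering [loc_mr_addr, loc_mr_addr + loc_mr_length) */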
+ mrc = rv_mr_cache_search_get(&rv->umrs->cache, pparams.in.loc_mr_addr,
+ pparams.in.loc_mr_length,
+ pparams.in.loc_mr_access, false);
+ if (!mrc) {
+ rv_err(rv->inx, "post_write: bad loc_mr\n");
+ ret = -EINVAL;
+ goto bail_dec;
+ }
+
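+	/* the RDMA source range must lie entirely within the cached MR */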
+ if (mrc->addr > (u64)pparams.in.loc_addr ||
+ mrc->addr + mrc->len <
+ (u64)pparams.in.loc_addr + pparams.in.length) {
+ rv_err(rv->inx, "post_write: addr inconsistent with loc_mr\n");
+ ret = -EINVAL;
+ goto bail_put_mr;
+ }
+ if (!(mrc->access & IBV_ACCESS_KERNEL)) {
+ rv_err(rv->inx, "post_write: loc_mr not a kernel MR\n");
+ ret = -EINVAL;
+ goto bail_put_mr;
+ }
+
+ pend_wr = kzalloc(sizeof(*pend_wr), GFP_KERNEL);
+ if (!pend_wr) {
+ ret = -ENOMEM;
+ goto bail_put_mr;
+ }
+ pend_wr->cqe.done = rv_rdma_write_done;
+ pend_wr->user_index = rv->index;
+
+ rv_user_mrs_get(rv->umrs);
+ pend_wr->umrs = rv->umrs;
+
+ rv_conn_get(sconn->parent);
+ pend_wr->sconn = sconn;
+
+ pend_wr->mrc = mrc;
+ pend_wr->loc_addr = (u64)pparams.in.loc_addr;
+ pend_wr->rem_addr = pparams.in.rem_addr;
+ pend_wr->rkey = pparams.in.rkey;
+ pend_wr->length = pparams.in.length;
+ pend_wr->immed = pparams.in.immed;
+ pend_wr->wr_id = pparams.in.wr_id;
+
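+	/*
+	 * Hold sconn->mutex across the state check and the post so the QP
+	 * cannot change state in between.
+	 */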
+ mutex_lock(&sconn->mutex);
+ if (sconn->state != RV_CONNECTED) {
+ if (sconn->state == RV_ERROR)
+ ret = -EIO;
+ else if (test_bit(RV_SCONN_WAS_CONNECTED, &sconn->flags))
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ mutex_unlock(&sconn->mutex);
+ goto bail_free_pend;
+ }
+
+ trace_rv_pend_write_post(pend_wr->user_index, pend_wr->sconn, pend_wr,
+ pend_wr->loc_addr, pend_wr->rkey,
+ pend_wr->rem_addr, pend_wr->length,
+ pend_wr->immed, pend_wr->wr_id);
+ ret = rv_drv_post_write_immed(pend_wr);
+ if (ret) {
+ sconn->stats.post_write_fail++;
+ } else {
+ sconn->stats.post_write++;
+ sconn->stats.post_write_bytes += pparams.in.length;
+ }
+
+ pparams.out.sconn_index = sconn->index;
+ pparams.out.conn_count = sconn->stats.conn_recovery + 1;
+
+ mutex_unlock(&sconn->mutex);
+ if (ret) {
+ rv_err(rv->inx, "post_write: failed: %d\n", ret);
+ goto bail_free_pend;
+ }
+
+ if (copy_to_user((void __user *)arg, &pparams.out, sizeof(pparams.out)))
+ ret = -EFAULT;
+
+ mutex_unlock(&rv->mutex);
+
+	return ret;
+
+bail_free_pend:
+ rv_conn_put(pend_wr->sconn->parent);
+ rv_user_mrs_put(pend_wr->umrs);
+ kfree(pend_wr);
+
+bail_put_mr:
+ rv_mr_cache_put(&rv->umrs->cache, mrc);
+bail_dec:
+ atomic_dec(&sconn->stats.outstand_send_write);
+bail_unlock:
+ mutex_unlock(&rv->mutex);
+ return ret;
+}
+
static int rv_drv_post_recv(struct rv_sconn *sconn)
{
struct ib_recv_wr wr;
@@ -54,6 +344,27 @@ int rv_drv_prepost_recv(struct rv_sconn *sconn)
return 0;
}
+static void rv_recv_rdma_write(struct rv_sconn *sconn, struct ib_wc *wc)
+{
+ struct rv_job_dev *jdev = sconn->parent->jdev;
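+	/* the high index_bits of the immediate data select the target rv_user */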
+ u32 index = be32_to_cpu(wc->ex.imm_data) >> (32 - jdev->index_bits);
+ struct rv_event ev = { 0 };
+
+ ev.event_type = RV_WC_RECV_RDMA_WITH_IMM;
+ ev.wc.status = wc->status;
+ ev.wc.resv1 = 0;
+ ev.wc.imm_data = be32_to_cpu(wc->ex.imm_data);
+ ev.wc.wr_id = 0; /* N/A */
+ ev.wc.conn_handle = (u64)sconn->parent;
+ ev.wc.byte_len = wc->byte_len;
+ ev.wc.resv2 = 0;
+ trace_rv_event_recv_write(ev.event_type, ev.wc.status, ev.wc.imm_data,
+ ev.wc.wr_id, ev.wc.conn_handle,
+ ev.wc.byte_len);
+
+ rv_post_user_event_by_index(jdev, index, &ev);
+}
+
/* drain_lock makes sure no recv WQEs get reposted after a drain WQE */
void rv_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
@@ -93,6 +404,8 @@ void rv_recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (unlikely(wc->opcode != IB_WC_RECV_RDMA_WITH_IMM))
rv_report_cqe_error(cq, wc, sconn, "Recv bad opcode");
+ else
+ rv_recv_rdma_write(sconn, wc);
repost:
spin_lock_irqsave(&sconn->drain_lock, flags);
if (likely(!test_bit(RV_SCONN_DRAINING, &sconn->flags)))
@@ -11,9 +11,74 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rv_rdma
+#define RV_PEND_WRITE_PRN "user_inx %d sconn %p pend_wr %p loc_addr 0x%llx" \
+ " rkey 0x%x rem_addr 0x%llx len 0x%llx immed 0x%x" \
+ " wr_id 0x%llx"
+
#define RV_SCONN_RECV_PRN "sconn %p index %u qp 0x%x conn %p flags 0x%x " \
" state %u immed 0x%x"
+#define RV_EVENT_PRN "type 0x%x status 0x%x immed 0x%x wr_id 0x%llx " \
+ "conn_handle 0x%llx len 0x%x"
+
+DECLARE_EVENT_CLASS(/* pend_write */
+ rv_pend_write_template,
+ TP_PROTO(int user_inx, void *sconn, void *pend_wr, u64 loc_addr,
+ u32 rkey, u64 rem_addr, u64 len, u32 immed, u64 wr_id),
+ TP_ARGS(user_inx, sconn, pend_wr, loc_addr, rkey, rem_addr, len, immed,
+ wr_id),
+ TP_STRUCT__entry(/* entry */
+ __field(int, user_inx)
+ __field(void *, sconn)
+ __field(void *, pend_wr)
+ __field(u64, loc_addr)
+ __field(u32, rkey)
+ __field(u64, rem_addr)
+ __field(u64, len)
+ __field(u32, immed)
+ __field(u64, wr_id)
+ ),
+ TP_fast_assign(/* assign */
+ __entry->user_inx = user_inx;
+ __entry->sconn = sconn;
+ __entry->pend_wr = pend_wr;
+ __entry->loc_addr = loc_addr;
+ __entry->rkey = rkey;
+ __entry->rem_addr = rem_addr;
+ __entry->len = len;
+ __entry->immed = immed;
+ __entry->wr_id = wr_id;
+ ),
+ TP_printk(/* print */
+ RV_PEND_WRITE_PRN,
+ __entry->user_inx,
+ __entry->sconn,
+ __entry->pend_wr,
+ __entry->loc_addr,
+ __entry->rkey,
+ __entry->rem_addr,
+ __entry->len,
+ __entry->immed,
+ __entry->wr_id
+ )
+);
+
+DEFINE_EVENT(/* event */
+ rv_pend_write_template, rv_pend_write_post,
+ TP_PROTO(int user_inx, void *sconn, void *pend_wr, u64 loc_addr,
+ u32 rkey, u64 rem_addr, u64 len, u32 immed, u64 wr_id),
+ TP_ARGS(user_inx, sconn, pend_wr, loc_addr, rkey, rem_addr, len, immed,
+ wr_id)
+);
+
+DEFINE_EVENT(/* event */
+ rv_pend_write_template, rv_pend_write_done,
+ TP_PROTO(int user_inx, void *sconn, void *pend_wr, u64 loc_addr,
+ u32 rkey, u64 rem_addr, u64 len, u32 immed, u64 wr_id),
+ TP_ARGS(user_inx, sconn, pend_wr, loc_addr, rkey, rem_addr, len, immed,
+ wr_id)
+);
+
DECLARE_EVENT_CLASS(/* recv */
rv_sconn_recv_template,
TP_PROTO(void *ptr, u8 index, u32 qp_num, void *conn, u32 flags,
@@ -77,6 +142,59 @@ DEFINE_EVENT(/* event */
TP_ARGS(ptr, index, qp_num, conn, flags, state, immed)
);
+DECLARE_EVENT_CLASS(/* event */
+ rv_event_template,
+ TP_PROTO(u8 type, u8 status, u32 immed, u64 wr_id, u64 conn_handle,
+ u32 len),
+ TP_ARGS(type, status, immed, wr_id, conn_handle, len),
+ TP_STRUCT__entry(/* entry */
+ __field(u8, type)
+ __field(u8, status)
+ __field(u32, immed)
+ __field(u64, wr_id)
+ __field(u64, conn_handle)
+ __field(u32, len)
+ ),
+ TP_fast_assign(/* assign */
+ __entry->type = type;
+ __entry->status = status;
+ __entry->immed = immed;
+ __entry->wr_id = wr_id;
+ __entry->conn_handle = conn_handle;
+ __entry->len = len;
+ ),
+ TP_printk(/* print */
+ RV_EVENT_PRN,
+ __entry->type,
+ __entry->status,
+ __entry->immed,
+ __entry->wr_id,
+ __entry->conn_handle,
+ __entry->len
+ )
+);
+
+DEFINE_EVENT(/* event */
+ rv_event_template, rv_event_write_done,
+ TP_PROTO(u8 type, u8 status, u32 immed, u64 wr_id, u64 conn_handle,
+ u32 len),
+ TP_ARGS(type, status, immed, wr_id, conn_handle, len)
+);
+
+DEFINE_EVENT(/* event */
+ rv_event_template, rv_event_post,
+ TP_PROTO(u8 type, u8 status, u32 immed, u64 wr_id, u64 conn_handle,
+ u32 len),
+ TP_ARGS(type, status, immed, wr_id, conn_handle, len)
+);
+
+DEFINE_EVENT(/* event */
+ rv_event_template, rv_event_recv_write,
+ TP_PROTO(u8 type, u8 status, u32 immed, u64 wr_id, u64 conn_handle,
+ u32 len),
+ TP_ARGS(type, status, immed, wr_id, conn_handle, len)
+);
+
DECLARE_EVENT_CLASS(/* wc */
rv_wc_template,
TP_PROTO(u64 wr_id, u32 status, u32 opcode, u32 byte_len,
@@ -113,6 +231,13 @@ DEFINE_EVENT(/* event */
TP_ARGS(wr_id, status, opcode, byte_len, imm_data)
);
+DEFINE_EVENT(/* event */
+ rv_wc_template, rv_wc_write_done,
+ TP_PROTO(u64 wr_id, u32 status, u32 opcode, u32 byte_len,
+ u32 imm_data),
+ TP_ARGS(wr_id, status, opcode, byte_len, imm_data)
+);
+
DEFINE_EVENT(/* event */
rv_wc_template, rv_wc_hb_done,
TP_PROTO(u64 wr_id, u32 status, u32 opcode, u32 byte_len,
@@ -112,6 +112,37 @@ DEFINE_EVENT(/* event */
TP_ARGS(rv_inx, jdev, total_size, max_size, refcount)
);
+DECLARE_EVENT_CLASS(/* user_ring */
+ rv_user_ring_template,
+ TP_PROTO(int rv_inx, u32 count, u32 hd, u32 tail),
+ TP_ARGS(rv_inx, count, hd, tail),
+ TP_STRUCT__entry(/* entry */
+ __field(int, rv_inx)
+ __field(u32, count)
+ __field(u32, head)
+ __field(u32, tail)
+ ),
+ TP_fast_assign(/* assign */
+ __entry->rv_inx = rv_inx;
+ __entry->count = count;
+ __entry->head = hd;
+ __entry->tail = tail;
+ ),
+ TP_printk(/* print */
+ "rv_inx %d entries %u head %u tail %u",
+ __entry->rv_inx,
+ __entry->count,
+ __entry->head,
+ __entry->tail
+ )
+);
+
+DEFINE_EVENT(/* event */
+ rv_user_ring_template, rv_user_ring_post_event,
+ TP_PROTO(int rv_inx, u32 count, u32 hd, u32 tail),
+ TP_ARGS(rv_inx, count, hd, tail)
+);
+
#endif /* __RV_TRACE_USER_H */
#undef TRACE_INCLUDE_PATH