@@ -47,9 +47,12 @@
*/
#include "hfi.h"
+#include "qp.h"
+#include "rc.h"
#include "verbs.h"
#include "tid_rdma.h"
#include "user_exp_rcv.h"
+#include "trace.h"
/**
* DOC: TID RDMA READ protocol
@@ -98,7 +101,7 @@
#define GENERATION_MASK 0xFFFFF
-static inline u32 mask_generation(u32 a)
+static u32 mask_generation(u32 a)
{
return a & GENERATION_MASK;
}
@@ -316,15 +319,255 @@ static void tid_rdma_trigger_resume(struct work_struct *work)
{
}
+static int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp)
+{
+ return 0;
+}
+
void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
{
}
+static int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+ struct rvt_sge_state *ss, bool *last)
+{
+ return 0;
+}
+
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+ __must_hold(&req->qp->s_lock)
+{
+ return 0;
+}
+
void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
__must_hold(&req->qp->s_lock)
{
}
+/*
+ * Validate and accept the TID RDMA READ request parameters.
+ * Return 0 if the request is accepted successfully;
+ * Return 1 otherwise.
+ */
+static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
+ struct rvt_ack_entry *e,
+ struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ u32 bth0, u32 psn, u64 vaddr, u32 len)
+{
+ int ret = 0;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 flow_psn, i, tidlen = 0, pktlen, tlen;
+
+ req = ack_to_tid_req(e);
+
+ /* Validate the payload first */
+ flow = &req->flows[req->setup_head];
+
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ memcpy(flow->fstate->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->fstate->tid_entry);
+
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment. Also calculate the number of required packets.
+ */
+ flow->npkts = rvt_div_round_up_mtu(qp, len);
+ for (i = 0; i < flow->tidcnt; i++) {
+ tlen = EXP_TID_GET(flow->fstate->tid_entry[i], LEN);
+ if (!tlen) {
+ ret = 1;
+ goto done;
+ }
+ /*
+ * For tid pair (tidctr == 3), the buffer size of the pair
+ * should be the sum of the buffer size described by each
+ * tid entry. However, only the first entry needs to be
+ * specified in the request (see WFR HAS Section 8.5.7.1).
+ */
+ tidlen += tlen;
+ }
+ if (tidlen * PAGE_SIZE < len) {
+ ret = 1;
+ goto done;
+ }
+
+ /* Empty the flow array */
+ req->clear_tail = req->setup_head;
+ flow->req = req;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->length = len;
+
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->flow_state.ib_spsn = psn;
+ flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
+
+ /* Set the initial flow index to the current flow. */
+ req->flow_idx = req->setup_head;
+
+ /* advance circular buffer head */
+ req->setup_head = (req->setup_head + 1) & (req->n_max_flows - 1);
+
+ /*
+ * Compute last PSN for request.
+ */
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = psn + flow->npkts - 1;
+ e->sent = 0;
+
+ req->n_flows = qpriv->tid_rdma.local.max_read;
+ req->state = TID_REQUEST_ACTIVE;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = 1;
+ req->r_flow_psn = e->psn;
+done:
+ return ret;
+}
+
+static int tid_rdma_rcv_error(struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ struct rvt_qp *qp, u32 psn, int diff)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ unsigned long flags;
+ u8 prev;
+ bool old_req;
+
+ if (diff > 0) {
+ /* sequence error */
+ if (!qp->r_nak_state) {
+ ibp->rvp.n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+ rc_defered_ack(rcd, qp);
+ }
+ goto done;
+ }
+
+ ibp->rvp.n_rc_dupreq++;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
+ if (!e || (e->opcode != TID_OP(WRITE_REQ) &&
+ e->opcode != TID_OP(READ_REQ)))
+ goto unlock;
+
+ req = ack_to_tid_req(e);
+ req->r_flow_psn = psn;
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ } else {
+ struct ib_reth *reth;
+ u32 offset;
+ u32 len;
+ u32 rkey;
+ u64 vaddr;
+ int ok;
+ u32 bth0;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ /*
+ * The requester always restarts from the start of the original
+ * request.
+ */
+ offset = delta_psn(psn, e->psn) * qp->pmtu;
+ len = be32_to_cpu(reth->length);
+ if (psn != e->psn || len != req->total_len)
+ goto unlock;
+
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ rkey = be32_to_cpu(reth->rkey);
+ vaddr = get_ib_reth_vaddr(reth);
+
+ qp->r_len = len;
+ ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock;
+
+ /*
+ * If all the response packets for the current request have
+ * been sent out and this request is complete (old_request
+ * == false) and the TID flow may be unusable (the
+ * req->clear_tail is advanced). However, when an earlier
+ * request is received, this request will not be complete any
+ * more (qp->s_tail_ack_queue is moved back, see below).
+ * Consequently, we need to update the TID flow info everytime
+ * a duplicate request is received.
+ */
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
+ vaddr, len))
+ goto unlock;
+
+ /*
+ * If all the response packets for the current request have
+ * been sent out and this request is complete (old_request
+ * == false) and the TID flow may be unusable (the
+ * req->clear_tail is advanced). However, when an earlier
+ * request is received, this request will not be complete any
+ * more (qp->s_tail_ack_queue is moved back, see below).
+ * Consequently, we need to update the TID flow info everytime
+ * a duplicate request is received.
+ */
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
+ vaddr, len))
+ goto unlock;
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue);
+ */
+ if (old_req)
+ goto unlock;
+ }
+ /* Re-process old requests.*/
+ qp->s_tail_ack_queue = prev;
+ /*
+ * Since the qp->s_tail_ack_queue is modified, the
+ * qp->s_ack_state must be changed to re-initialize
+ * qp->s_ack_rdma_sge; Otherwise, we will end up in
+ * wrong memory region.
+ */
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+}
+
void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
{
}
@@ -339,10 +582,253 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
{
+ /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
+
+ /*
+ * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
+ * (see hfi1_rc_rcv())
+ * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Initialize struct tid_rdma_flow info;
+ * - Copy TID entries;
+ * 3. Set the qp->s_ack_state as state diagram in design doc.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 bth0, psn, len, rkey;
+ bool is_fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+ u8 nack_state = IB_NAK_INVALID_REQUEST;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ is_fecn = process_ecn(qp, packet, false);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
+ goto nack_inv;
+
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+ return;
+ goto send_ack;
+ }
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent) {
+ nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+ goto nack_inv_unlock;
+ }
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ)))
+ goto nack_acc;
+
+ /* Accept the request parameters */
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
+ len))
+ goto nack_inv_unlock;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn += e->lpsn - e->psn + 1;
+
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = nack_state;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+static inline struct tid_rdma_request *
+find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
+ __must_hold(&qp->s_lock)
+{
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req = NULL;
+ u32 i, end;
+
+ end = qp->s_cur + 1;
+ if (end == qp->s_size)
+ end = 0;
+ for (i = qp->s_acked; i != end;) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (cmp_psn(psn, wqe->psn) >= 0 &&
+ cmp_psn(psn, wqe->lpsn) <= 0) {
+ if (wqe->wr.opcode == opcode)
+ req = wqe_to_tid_req(wqe);
+ break;
+ }
+ if (++i == qp->s_size)
+ i = 0;
+ }
+
+ return req;
}
void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
{
+ /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that the entire segment has been read.
+ * 3. Remove RVT_S_WAIT_TID_RESP from s_flags.
+ * 4. Free the TID flow resources.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 opcode, aeth;
+ bool is_fecn;
+ unsigned long flags;
+ u32 kpsn, ipsn;
+
+ is_fecn = process_ecn(qp, packet, false);
+ kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+ req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
+ if (unlikely(!req))
+ goto ack_op_err;
+
+ flow = &req->flows[req->clear_tail];
+ /* When header suppression is disabled */
+ if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
+ goto ack_done;
+ req->ack_pending--;
+ priv->pending_tid_r_segs--;
+ qp->s_num_rd_atomic--;
+ if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+ !qp->s_num_rd_atomic) {
+ qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+ RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+ if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+ qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+
+ trace_hfi1_ack(qp, ipsn);
+
+ if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
+ goto ack_done;
+
+ /* Release the tid resources */
+ hfi1_kern_exp_rcv_clear(req);
+
+ /* If not done yet, build next read request */
+ if (++req->comp_seg >= req->total_segs) {
+ priv->tid_r_comp++;
+ req->state = TID_REQUEST_COMPLETE;
+ }
+
+ /*
+ * Clear the hw flow under two conditions:
+ * 1. This request is a sync point and it is complete;
+ * 2. Current request is completed and there are no more requests.
+ */
+ if ((req->state == TID_REQUEST_SYNC &&
+ req->comp_seg == req->cur_seg) ||
+ priv->tid_r_comp == priv->tid_r_reqs) {
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ if (req->state == TID_REQUEST_SYNC)
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ hfi1_schedule_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ /*
+ * The test indicates that the send engine has finished its cleanup
+ * after sending the request and it's now safe to put the QP into error
+ * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
+ * == qp->s_head), it would be unsafe to complete the wqe pointed by
+ * qp->s_acked here. Putting the qp into error state will safely flush
+ * all remaining requests.
+ */
+ if (qp->s_last == qp->s_acked)
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ hfi1_send_rc_ack(packet, is_fecn);
}
void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
@@ -547,23 +1033,245 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
}
u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
- struct ib_other_headers *ohdr,
- u32 *bth1, u32 *bth2, u32 *len)
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
{
- return 0;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_swqe_priv *wpriv = wqe->priv;
+ struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
+ struct tid_rdma_params *remote;
+ u32 req_len = 0;
+ void *req_addr = NULL;
+
+ /* This is the IB psn used to send the request */
+ *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
+
+ /* TID Entries for TID RDMA READ payload */
+ req_addr = &flow->fstate->tid_entry[flow->tid_idx];
+ req_len = sizeof(*flow->fstate->tid_entry) *
+ (flow->tidcnt - flow->tid_idx);
+
+ memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
+ wpriv->ss.sge.vaddr = req_addr;
+ wpriv->ss.sge.sge_length = req_len;
+ wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ wpriv->ss.sge.mr = NULL;
+ wpriv->ss.sge.m = 0;
+ wpriv->ss.sge.n = 0;
+
+ wpriv->ss.sg_list = NULL;
+ wpriv->ss.total_len = wpriv->ss.sge.sge_length;
+ wpriv->ss.num_sge = 1;
+
+ /* Construct the TID RDMA READ REQ packet header */
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(rreq->kdeth0, KVER, 0x1);
+ KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
+ rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
+ req->cur_seg * req->seg_len + flow->sent);
+ rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+ rreq->reth.length = cpu_to_be32(*len);
+ rreq->tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ ((flow->flow_state.spsn + flow->pkt) &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ rreq->tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ *bth2 |= IB_BTH_REQ_ACK;
+ rcu_read_unlock();
+
+ /* We are done with this segment */
+ flow->sent += *len;
+ req->cur_seg++;
+ qp->s_state = TID_OP(READ_REQ);
+ req->ack_pending++;
+ req->flow_idx = (req->flow_idx + 1) & (req->n_max_flows - 1);
+ qpriv->pending_tid_r_segs++;
+ qp->s_num_rd_atomic++;
+
+ /* Set the TID RDMA READ request payload size */
+ *len = req_len;
+
+ return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
}
+/*
+ * @len: contains the data length to read upon entry and the read request
+ * payload length upon exit.
+ */
u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
struct ib_other_headers *ohdr, u32 *bth1,
u32 *bth2, u32 *len)
__must_hold(&qp->s_lock)
{
- return 0;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = NULL;
+ u32 hdwords = 0;
+ bool last;
+ bool retry = true;
+ u32 npkts = rvt_div_round_up_mtu(qp, *len);
+
+ /*
+ * Check sync conditions. Make sure that there are no pending
+ * segments before freeing the flow.
+ */
+sync_check:
+ if (req->state == TID_REQUEST_SYNC) {
+ if (qpriv->pending_tid_r_segs)
+ goto done;
+
+ hfi1_kern_clear_hw_flow(req->rcd, qp);
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * If the request for this segment is resent, the tid resources should
+ * have been allocated before. In this case, req->flow_idx should
+ * fall behind req->setup_head.
+ */
+ if (req->flow_idx == req->setup_head) {
+ retry = false;
+ if (req->state == TID_REQUEST_RESEND) {
+ /*
+ * This is the first new segment for a request whose
+ * earlier segments have been re-sent. We need to
+ * set up the sge pointer correctly.
+ */
+ restart_sge(&qp->s_sge, wqe, req->s_next_psn,
+ qp->pmtu);
+ req->isge = 0;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * Check sync. The last PSN of each generation is reserved for
+ * RESYNC.
+ */
+ if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
+ req->state = TID_REQUEST_SYNC;
+ goto sync_check;
+ }
+
+ /* Allocate the flow if not yet */
+ if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
+ goto done;
+
+ /*
+ * The following call will advance req->setup_head after
+ * allocating the tid entries.
+ */
+ if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
+ req->state = TID_REQUEST_QUEUED;
+
+ /*
+ * We don't have resources for this segment. The QP has
+ * already been queued.
+ */
+ goto done;
+ }
+ }
+
+ /* req->flow_idx should only be one slot behind req->setup_head */
+ flow = &req->flows[req->flow_idx];
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->sent = 0;
+ if (!retry) {
+ /* Set the first and last IB PSN for the flow in use.*/
+ flow->flow_state.ib_spsn = req->s_next_psn;
+ flow->flow_state.ib_lpsn =
+ flow->flow_state.ib_spsn + flow->npkts - 1;
+ }
+
+ /* Calculate the next segment start psn.*/
+ req->s_next_psn += flow->npkts;
+
+ /* Build the packet header */
+ hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
+done:
+ return hdwords;
}
u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
struct ib_other_headers *ohdr, u32 *bth0,
u32 *bth1, u32 *bth2, u32 *len, bool *last)
{
- return 0;
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ u32 tidentry = flow->fstate->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+ u32 hdwords = 0;
+ struct tid_rdma_params *remote;
+
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
+ last_pkt = (flow->sent >= flow->length);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ if (!remote) {
+ rcu_read_unlock();
+ goto done;
+ }
+ KDETH_RESET(resp->kdeth0, KVER, 0x1);
+ KDETH_SET(resp->kdeth0, SH, !last_pkt);
+ KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
+ resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ resp->aeth = rvt_compute_aeth(qp);
+ resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
+ flow->pkt));
+
+ *bth0 = TID_OP(READ_RESP) << 24;
+ *bth1 = flow->tid_qpn;
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ *last = last_pkt;
+ if (last_pkt)
+ /* Advance to next flow */
+ req->clear_tail = (req->clear_tail + 1) &
+ (req->n_max_flows - 1);
+
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+
+ hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
+
+done:
+ return hdwords;
}
@@ -259,6 +259,7 @@ struct trdma_flow_state {
void hfi1_compute_tid_rdma_flow_wt(void);
void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req);
void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req);
void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp);
@@ -173,6 +173,9 @@ struct hfi1_qp_priv {
unsigned long tid_timer_timeout_jiffies;
unsigned long tid_retry_timeout_jiffies;
u8 s_retry;
+ /* For TID RDMA READ */
+ u32 tid_r_reqs; /* Num of tid reads requested */
+ u32 tid_r_comp; /* Num of tid reads completed */
u32 pending_tid_r_segs; /* Num of pending tid read segments */
u16 pkts_ps; /* packets per segment */
u8 timeout_shift; /* account for number of packets per segment */