diff mbox series

[for-next,18/23] IB/hfi1: Add the dual leg code

Message ID 20190124055122.10736.72042.stgit@scvm10.sc.intel.com (mailing list archive)
State Accepted
Delegated to: Doug Ledford
Headers show
Series IB/hfi1: Add TID RDMA Write | expand

Commit Message

Dennis Dalessandro Jan. 24, 2019, 5:51 a.m. UTC
From: Kaike Wan <kaike.wan@intel.com>

The "Second Leg" of the TID RDMA WRITE protocol deals with
the transfer of data and ack packets, which are in the KDETH
PSN space, as opposed to the IB PSN space.

Therefore, the Second Leg could be considered as a separate
state machine. As such, it is handled by a different work
queue item which is scheduled along with the normal IB state
machine work item.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
---
 drivers/infiniband/hw/hfi1/iowait.h   |   12 +++
 drivers/infiniband/hw/hfi1/qp.c       |   34 +++++++-
 drivers/infiniband/hw/hfi1/qp.h       |    1 
 drivers/infiniband/hw/hfi1/ruc.c      |   32 ++++++-
 drivers/infiniband/hw/hfi1/tid_rdma.c |  141 +++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/tid_rdma.h |    4 +
 drivers/infiniband/hw/hfi1/verbs.h    |    3 +
 7 files changed, 217 insertions(+), 10 deletions(-)
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 23a58ac..bd91370 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -186,6 +186,18 @@  static inline bool iowait_schedule(struct iowait *wait,
 }
 
 /**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+				       struct workqueue_struct *wq, int cpu)
+{
+	return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
+}
+
+/**
  * iowait_sdma_drain() - wait for DMAs to drain
  *
  * @wait: iowait structure
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 31b4b60..96632c7 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -431,6 +431,11 @@  static void hfi1_qp_schedule(struct rvt_qp *qp)
 		if (ret)
 			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
 	}
+	if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+		ret = hfi1_schedule_tid_send(qp);
+		if (ret)
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+	}
 }
 
 void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -450,8 +455,27 @@  void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
 
 void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
 {
-	if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
 		qp->s_flags &= ~RVT_S_BUSY;
+		/*
+		 * If we are sending a first-leg packet from the second leg,
+		 * we need to clear the busy flag from priv->s_flags to
+		 * avoid a race condition when the qp wakes up before
+		 * the call to hfi1_verbs_send() returns to the second
+		 * leg. In that case, the second leg will terminate without
+		 * being re-scheduled, resulting in failure to send TID RDMA
+		 * WRITE DATA and TID RDMA ACK packets.
+		 */
+		if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+			priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+					   RVT_S_BUSY);
+			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+		}
+	} else {
+		priv->s_flags &= ~RVT_S_BUSY;
+	}
 }
 
 static int iowait_sleep(
@@ -694,7 +718,7 @@  void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 		&priv->s_iowait,
 		1,
 		_hfi1_do_send,
-		NULL,
+		_hfi1_do_tid_send,
 		iowait_sleep,
 		iowait_wakeup,
 		iowait_sdma_drained);
@@ -851,7 +875,8 @@  void notify_error_qp(struct rvt_qp *qp)
 	if (lock) {
 		write_seqlock(lock);
 		if (!list_empty(&priv->s_iowait.list) &&
-		    !(qp->s_flags & RVT_S_BUSY)) {
+		    !(qp->s_flags & RVT_S_BUSY) &&
+		    !(priv->s_flags & RVT_S_BUSY)) {
 			qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
 			list_del_init(&priv->s_iowait.list);
 			priv->s_iowait.lock = NULL;
@@ -860,7 +885,8 @@  void notify_error_qp(struct rvt_qp *qp)
 		write_sequnlock(lock);
 	}
 
-	if (!(qp->s_flags & RVT_S_BUSY)) {
+	if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+		qp->s_hdrwords = 0;
 		if (qp->s_rdma_mr) {
 			rvt_put_mr(qp->s_rdma_mr);
 			qp->s_rdma_mr = NULL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index d531b76..b670321 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -82,6 +82,7 @@ 
 
 #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
 #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
 
 /*
  * Send if not busy or waiting for I/O and either
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index f96c0f5..124a3ec 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -453,11 +453,13 @@  void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
 #define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
 
 /**
- * schedule_send_yield - test for a yield required for QP send engine
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
  * @timeout: Final time for timeout slice for jiffies
  * @qp: a pointer to QP
  * @ps: a pointer to a structure with commonly lookup values for
  *      the the send engine progress
+ * @tid: true if it is the tid leg
  *
  * This routine checks if the time slice for the QP has expired
  * for RC QPs, if so an additional work entry is queued. At this
@@ -465,8 +467,8 @@  void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
  * returns true if a yield is required, otherwise, false
  * is returned.
  */
-static bool schedule_send_yield(struct rvt_qp *qp,
-				struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+			      bool tid)
 {
 	ps->pkts_sent = true;
 
@@ -474,8 +476,24 @@  static bool schedule_send_yield(struct rvt_qp *qp,
 		if (!ps->in_thread ||
 		    workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
 			spin_lock_irqsave(&qp->s_lock, ps->flags);
-			qp->s_flags &= ~RVT_S_BUSY;
-			hfi1_schedule_send(qp);
+			if (!tid) {
+				qp->s_flags &= ~RVT_S_BUSY;
+				hfi1_schedule_send(qp);
+			} else {
+				struct hfi1_qp_priv *priv = qp->priv;
+
+				if (priv->s_flags &
+				    HFI1_S_TID_BUSY_SET) {
+					qp->s_flags &= ~RVT_S_BUSY;
+					priv->s_flags &=
+						~(HFI1_S_TID_BUSY_SET |
+						  RVT_S_BUSY);
+				} else {
+					priv->s_flags &= ~RVT_S_BUSY;
+				}
+				hfi1_schedule_tid_send(qp);
+			}
+
 			spin_unlock_irqrestore(&qp->s_lock, ps->flags);
 			this_cpu_inc(*ps->ppd->dd->send_schedule);
 			trace_hfi1_rc_expired_time_slice(qp, true);
@@ -576,6 +594,8 @@  void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 	do {
 		/* Check for a constructed packet to be sent. */
 		if (ps.s_txreq) {
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+				qp->s_flags |= RVT_S_BUSY;
 			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 			/*
 			 * If the packet cannot be sent now, return and
@@ -585,7 +605,7 @@  void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 				return;
 
 			/* allow other tasks to run */
-			if (schedule_send_yield(qp, &ps))
+			if (hfi1_schedule_send_yield(qp, &ps, false))
 				return;
 
 			spin_lock_irqsave(&qp->s_lock, ps.flags);
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 6ce7e1f..19f6b6a 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -127,6 +127,7 @@  static void hfi1_init_trdma_req(struct rvt_qp *qp,
 static int make_tid_rdma_ack(struct rvt_qp *qp,
 			     struct ib_other_headers *ohdr,
 			     struct hfi1_pkt_state *ps);
+static void hfi1_do_tid_send(struct rvt_qp *qp);
 
 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
@@ -3044,6 +3045,7 @@  bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 		qpriv->s_flags |= RVT_S_ACK_PENDING;
 		if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
 			qpriv->r_tid_ack = qpriv->r_tid_tail;
+		hfi1_schedule_tid_send(qp);
 	}
 	goto unlock;
 }
@@ -3513,6 +3515,7 @@  static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
 			ret = -EAGAIN;
 			to_seg = MAX_FLOWS >> 1;
 			qpriv->s_flags |= RVT_S_ACK_PENDING;
+			hfi1_schedule_tid_send(qp);
 			break;
 		}
 
@@ -4124,6 +4127,7 @@  void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
 	}
 	qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
 
+	hfi1_schedule_tid_send(qp);
 	goto ack_done;
 
 ack_op_err:
@@ -4283,6 +4287,7 @@  void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 
 done:
 	priv->s_flags |= RVT_S_ACK_PENDING;
+	hfi1_schedule_tid_send(qp);
 exit:
 	priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
 	spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -4295,6 +4300,7 @@  void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 		priv->s_flags |= RVT_S_ACK_PENDING;
 		if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
 			priv->r_tid_ack = priv->r_tid_tail;
+		hfi1_schedule_tid_send(qp);
 	}
 	goto done;
 }
@@ -4563,6 +4569,7 @@  void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
 			req->cur_seg = req->ack_seg;
 			qpriv->s_tid_tail = qp->s_acked;
 			qpriv->s_state = TID_OP(WRITE_REQ);
+			hfi1_schedule_tid_send(qp);
 		}
 done:
 		qpriv->s_retry = qp->s_retry_cnt;
@@ -4580,6 +4587,7 @@  void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
 			qpriv->s_tid_tail = qp->s_acked;
 			qpriv->s_state = TID_OP(WRITE_REQ);
 			qpriv->s_retry = qp->s_retry_cnt;
+			hfi1_schedule_tid_send(qp);
 			break;
 
 		default:
@@ -4669,6 +4677,7 @@  static void hfi1_tid_retry_timeout(struct timer_list *t)
 			qp->s_flags |= HFI1_S_WAIT_HALT;
 			priv->s_state = TID_OP(RESYNC);
 			priv->s_retry--;
+			hfi1_schedule_tid_send(qp);
 		}
 	}
 	spin_unlock(&qp->s_lock);
@@ -4800,6 +4809,7 @@  void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
 	/* RESYNC request always gets a TID RDMA ACK. */
 	qpriv->s_nak_state = 0;
 	qpriv->s_flags |= RVT_S_ACK_PENDING;
+	hfi1_schedule_tid_send(qp);
 bail:
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 }
@@ -5151,3 +5161,134 @@  static int make_tid_rdma_ack(struct rvt_qp *qp,
 	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
 	return 0;
 }
+
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	return !(priv->s_flags & RVT_S_BUSY ||
+		 qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+		(verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+		 (priv->s_flags & RVT_S_RESP_PENDING) ||
+		 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+	struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+	struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+	hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+	struct hfi1_pkt_state ps;
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	ps.dev = to_idev(qp->ibqp.device);
+	ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+	ps.ppd = ppd_from_ibp(ps.ibp);
+	ps.wait = iowait_get_tid_work(&priv->s_iowait);
+	ps.in_thread = false;
+	ps.timeout_int = qp->timeout_jiffies / 8;
+
+	spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+	/* Return if we are already busy processing a work request. */
+	if (!hfi1_send_tid_ok(qp)) {
+		if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+		return;
+	}
+
+	priv->s_flags |= RVT_S_BUSY;
+
+	ps.timeout = jiffies + ps.timeout_int;
+	ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+		cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+	ps.pkts_sent = false;
+
+	/* ensure a pre-built packet is handled */
+	ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+	do {
+		/* Check for a constructed packet to be sent. */
+		if (ps.s_txreq) {
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+				qp->s_flags |= RVT_S_BUSY;
+				ps.wait = iowait_get_ib_work(&priv->s_iowait);
+			}
+			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+			/*
+			 * If the packet cannot be sent now, return and
+			 * the send tasklet will be woken up later.
+			 */
+			if (hfi1_verbs_send(qp, &ps))
+				return;
+
+			/* allow other tasks to run */
+			if (hfi1_schedule_send_yield(qp, &ps, true))
+				return;
+
+			spin_lock_irqsave(&qp->s_lock, ps.flags);
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+				qp->s_flags &= ~RVT_S_BUSY;
+				priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+				ps.wait = iowait_get_tid_work(&priv->s_iowait);
+				if (iowait_flag_set(&priv->s_iowait,
+						    IOWAIT_PENDING_IB))
+					hfi1_schedule_send(qp);
+			}
+		}
+	} while (hfi1_make_tid_rdma_pkt(qp, &ps));
+	iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_ibport *ibp =
+		to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+	return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+				   priv->s_sde ?
+				   priv->s_sde->cpu :
+				   cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ * Return: true if the second leg is scheduled;
+ *  false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+	lockdep_assert_held(&qp->s_lock);
+	if (hfi1_send_tid_ok(qp)) {
+		/*
+		 * The following call returns true if the qp is not on the
+		 * queue and false if the qp is already on the queue before
+		 * this call. Either way, the qp will be on the queue when the
+		 * call returns.
+		 */
+		_hfi1_schedule_tid_send(qp);
+		return true;
+	}
+	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+		iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+				IOWAIT_PENDING_TID);
+	return false;
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index 0ce0ef6..7f8f17b 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -305,4 +305,8 @@  u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
 struct hfi1_pkt_state;
 int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
 
+void _hfi1_do_tid_send(struct work_struct *work);
+
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
 #endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 3e45149..bee3d21 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -443,6 +443,9 @@  void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
 			  u32 bth0, u32 bth1, u32 bth2, int middle,
 			  struct hfi1_pkt_state *ps);
 
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+			      bool tid);
+
 void _hfi1_do_send(struct work_struct *work);
 
 void hfi1_do_send_from_rvt(struct rvt_qp *qp);