@@ -186,6 +186,18 @@ static inline bool iowait_schedule(struct iowait *wait,
}
/**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+ struct workqueue_struct *wq, int cpu)
+{
+ return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
+}
+
+/**
* iowait_sdma_drain() - wait for DMAs to drain
* @wait: iowait structure
*
@@ -414,6 +414,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp)
if (ret)
iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
}
+ if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+ ret = hfi1_schedule_tid_send(qp);
+ if (ret)
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
}
void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -433,8 +438,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
{
- if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we are sending a first-leg packet from the second leg,
+ * we need to clear the busy flag from priv->s_flags to
+ * avoid a race condition when the qp wakes up before
+ * the call to hfi1_verbs_send() returns to the second
+ * leg. In that case, the second leg will terminate without
+ * being re-scheduled, resulting in failure to send TID RDMA
+ * WRITE DATA and TID RDMA ACK packets.
+ */
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
}
static int iowait_sleep(
@@ -679,7 +703,7 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
&priv->s_iowait,
1,
_hfi1_do_send,
- NULL,
+ _hfi1_do_tid_send,
iowait_sleep,
iowait_wakeup,
iowait_sdma_drained);
@@ -833,7 +857,8 @@ void notify_error_qp(struct rvt_qp *qp)
if (lock) {
write_seqlock(lock);
if (!list_empty(&priv->s_iowait.list) &&
- !(qp->s_flags & RVT_S_BUSY)) {
+ !(qp->s_flags & RVT_S_BUSY) &&
+ !(priv->s_flags & RVT_S_BUSY)) {
qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
@@ -842,7 +867,8 @@ void notify_error_qp(struct rvt_qp *qp)
write_sequnlock(lock);
}
- if (!(qp->s_flags & RVT_S_BUSY)) {
+ if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+ qp->s_hdrwords = 0;
if (qp->s_rdma_mr) {
rvt_put_mr(qp->s_rdma_mr);
qp->s_rdma_mr = NULL;
@@ -78,6 +78,7 @@
#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
/*
* Send if not busy or waiting for I/O and either
@@ -780,8 +780,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */
/**
- * schedule_send_yield - test for a yield required for QP send engine
- * @timeout: Final time for timeout slice for jiffies
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
* @qp: a pointer to QP
* @ps: a pointer to a structure with commonly lookup values for
* the the send engine progress
@@ -792,8 +792,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
* returns true if a yield is required, otherwise, false
* is returned.
*/
-static bool schedule_send_yield(struct rvt_qp *qp,
- struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid)
{
ps->pkts_sent = true;
@@ -801,8 +801,23 @@ static bool schedule_send_yield(struct rvt_qp *qp,
if (!ps->in_thread ||
workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
spin_lock_irqsave(&qp->s_lock, ps->flags);
- qp->s_flags &= ~RVT_S_BUSY;
- hfi1_schedule_send(qp);
+ if (!tid) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_schedule_send(qp);
+ } else {
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (priv->s_flags &
+ HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &=
+ ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
+ hfi1_schedule_tid_send(qp);
+ }
spin_unlock_irqrestore(&qp->s_lock, ps->flags);
this_cpu_inc(*ps->ppd->dd->send_schedule);
trace_hfi1_rc_expired_time_slice(qp, true);
@@ -903,6 +918,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
do {
/* Check for a constructed packet to be sent. */
if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+ qp->s_flags |= RVT_S_BUSY;
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
/*
* If the packet cannot be sent now, return and
@@ -912,7 +929,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
return;
/* allow other tasks to run */
- if (schedule_send_yield(qp, &ps))
+ if (hfi1_schedule_send_yield(qp, &ps, false))
return;
spin_lock_irqsave(&qp->s_lock, ps.flags);
@@ -152,6 +152,8 @@ static u32 mask_generation(u32 a)
static u32 tid_rdma_flow_wt;
+static void hfi1_do_tid_send(struct rvt_qp *qp);
+
static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
u32 psn, u16 *fidx)
{
@@ -180,6 +182,17 @@ static u8 trdma_pset_order(struct tid_rdma_pageset *s)
return ilog2(count) + 1;
}
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ return !(priv->s_flags & RVT_S_BUSY ||
+ qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+ (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+ (priv->s_flags & RVT_S_RESP_PENDING) ||
+ !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
{
return
@@ -2059,6 +2072,52 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
return true;
}
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+ return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+ priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ * @return true if the second leg is scheduled;
+ * false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (hfi1_send_tid_ok(qp)) {
+ /*
+ * The following call returns true if the qp is not on the
+ * queue and false if the qp is already on the queue before
+ * this call. Either way, the qp will be on the queue when the
+ * call returns.
+ */
+ _hfi1_schedule_tid_send(qp);
+ return true;
+ }
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+ IOWAIT_PENDING_TID);
+ return false;
+}
+
/**
* qp_to_rcd - determine the receive context used by a qp
* @qp - the qp
@@ -2204,6 +2263,12 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
return dd->verbs_dev.n_tidwait;
}
+static int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+ __must_hold(&qp->s_lock)
+{
+ return 0;
+}
+
/*
* "Rewind" the TID request information.
* This means that we reset the state back to ACTIVE,
@@ -2265,6 +2330,82 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
req->state = TID_REQUEST_ACTIVE;
}
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+ struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+ struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+ hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_pkt_state ps;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ ps.dev = to_idev(qp->ibqp.device);
+ ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ps.ppd = ppd_from_ibp(ps.ibp);
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ ps.in_thread = false;
+ ps.timeout_int = qp->timeout_jiffies / 8;
+
+ trace_hfi1_rc_do_tid_send(qp, false);
+
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+ /* Return if we are already busy processing a work request. */
+ if (!hfi1_send_tid_ok(qp)) {
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+ return;
+ }
+
+ priv->s_flags |= RVT_S_BUSY;
+
+ ps.timeout = jiffies + ps.timeout_int;
+ ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+ ps.pkts_sent = false;
+
+ /* insure a pre-built packet is handled */
+ ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+ do {
+ /* Check for a constructed packet to be sent. */
+ if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags |= RVT_S_BUSY;
+ ps.wait = iowait_get_ib_work(&priv->s_iowait);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+ /*
+ * If the packet cannot be sent now, return and
+ * the send tasklet will be woken up later.
+ */
+ if (hfi1_verbs_send(qp, &ps))
+ return;
+
+ /* allow other tasks to run */
+ if (hfi1_schedule_send_yield(qp, &ps, true))
+ return;
+
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ if (iowait_flag_set(&priv->s_iowait,
+ IOWAIT_PENDING_IB))
+ hfi1_schedule_send(qp);
+ }
+ }
+ } while (hfi1_make_tid_rdma_pkt(qp, &ps));
+ iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
struct ib_other_headers *ohdr, u32 *bth1,
u32 *bth2, u32 *len)
@@ -285,6 +285,8 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
struct hfi1_pportdata *ppd,
struct hfi1_packet *packet);
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
struct ib_qp_init_attr *init_attr);
void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
@@ -292,6 +294,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
u32 *bth2);
+void _hfi1_do_tid_send(struct work_struct *work);
void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -839,6 +839,12 @@
);
DEFINE_EVENT(
+ hfi1_do_send_template, hfi1_rc_do_tid_send,
+ TP_PROTO(struct rvt_qp *qp, bool flag),
+ TP_ARGS(qp, flag)
+);
+
+DEFINE_EVENT(/* expired time slice */
hfi1_do_send_template, hfi1_rc_expired_time_slice,
TP_PROTO(struct rvt_qp *qp, bool flag),
TP_ARGS(qp, flag)
@@ -172,7 +172,9 @@ struct hfi1_qp_priv {
u8 hdr_type; /* 9B or 16B */
unsigned long tid_timer_timeout_jiffies;
unsigned long tid_retry_timeout_jiffies;
+ /* variables for the TID RDMA SE state machine */
u8 s_retry;
+ u32 s_flags;
/* For TID RDMA READ */
u32 tid_r_reqs; /* Num of tid reads requested */
u32 tid_r_comp; /* Num of tid reads completed */
@@ -414,6 +416,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid);
+
void _hfi1_do_send(struct work_struct *work);
void hfi1_do_send_from_rvt(struct rvt_qp *qp);