diff mbox series

[for-next,1/5] IB/hfi1: Ignore LNI errors before DC8051 transitions to Polling state

Message ID 20181128181855.30921.40738.stgit@scvm10.sc.intel.com (mailing list archive)
State Accepted
Commit c1a797c0818e0122c7ec8422edd971cfec9b15ea
Delegated to: Jason Gunthorpe
Headers show
Series IB/hfi1: Random fixes for driver to land in for-next | expand

Commit Message

Dennis Dalessandro Nov. 28, 2018, 6:19 p.m. UTC
From: Kaike Wan <kaike.wan@intel.com>

When it is requested to change its physical state back to Offline while
in the process to go up, DC8051 will set the ERROR field in the
DC8051_DBG_ERR_INFO_SET_BY_8051 register. This ERROR field will remain
until the next time when DC8051 transitions from Offline to Polling.
Subsequently, when the host requests DC8051 to change its physical
state to Polling again, it may receive a DC8051 interrupt with the
stale ERROR field still in DC8051_DBG_ERR_INFO_SET_BY_8051. If the
host link state has been changed to Polling, this stale ERROR will
force the host to transition to Offline state, resulting in a vicious
cycle of Polling ->Offline->Polling->Offline. On the other hand, if
the host link state is still Offline when the stale ERROR is received,
the stale ERROR will be ignored, and the link will come up correctly.
This patch implements the correct behavior by changing host link state
to Polling only after DC8051 changes its physical state to Polling.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Krzysztof Goreczny <krzysztof.goreczny@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
---
 drivers/infiniband/hw/hfi1/chip.c |   47 ++++++++++++++++++++++++++++++++++++-
 1 files changed, 46 insertions(+), 1 deletions(-)
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 7e6d709..b443642 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -1072,6 +1072,8 @@  static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 static void log_physical_state(struct hfi1_pportdata *ppd, u32 state);
 static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 				   int msecs);
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+					 int msecs);
 static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
 static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void handle_temp_err(struct hfi1_devdata *dd);
@@ -10770,13 +10772,15 @@  int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 			break;
 
 		ppd->port_error_action = 0;
-		ppd->host_link_state = HLS_DN_POLL;
 
 		if (quick_linkup) {
 			/* quick linkup does not go into polling */
 			ret = do_quick_linkup(dd);
 		} else {
 			ret1 = set_physical_link_state(dd, PLS_POLLING);
+			if (!ret1)
+				ret1 = wait_phys_link_out_of_offline(ppd,
+								     3000);
 			if (ret1 != HCMD_SUCCESS) {
 				dd_dev_err(dd,
 					   "Failed to transition to Polling link state, return 0x%x\n",
@@ -10784,6 +10788,14 @@  int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 				ret = -EINVAL;
 			}
 		}
+
+		/*
+		 * Change the host link state after requesting DC8051 to
+		 * change its physical state so that we can ignore any
+		 * interrupt with stale LNI(XX) error, which will not be
+		 * cleared until DC8051 transitions to Polling state.
+		 */
+		ppd->host_link_state = HLS_DN_POLL;
 		ppd->offline_disabled_reason =
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
 		/*
@@ -12928,6 +12940,39 @@  static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd,
 	return read_state;
 }
 
+/*
+ * wait_phys_link_out_of_offline - wait for any out of offline state
+ * @ppd: port device
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for any out of offline physical link
+ * state change to occur.
+ * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT.
+ */
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+					 int msecs)
+{
+	u32 read_state;
+	unsigned long timeout;
+
+	timeout = jiffies + msecs_to_jiffies(msecs);
+	while (1) {
+		read_state = read_physical_state(ppd->dd);
+		if ((read_state & 0xF0) != PLS_OFFLINE)
+			break;
+		if (time_after(jiffies, timeout)) {
+			dd_dev_err(ppd->dd,
+				   "timeout waiting for phy link out of offline. Read state 0x%x, %dms\n",
+				   read_state, msecs);
+			return -ETIMEDOUT;
+		}
+		usleep_range(1950, 2050); /* sleep 2ms-ish */
+	}
+
+	log_state_transition(ppd, read_state);
+	return read_state;
+}
+
 #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
 (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)