diff mbox series

[3/7] mpt3sas: Cancel the running work during host reset.

Message ID 1596096229-3341-4-git-send-email-suganath-prabu.subramani@broadcom.com (mailing list archive)
State Accepted
Headers show
Series mpt3sas: Enhancements and bug fixes | expand

Commit Message

Suganath Prabu S July 30, 2020, 8:03 a.m. UTC
Currently during host reset time driver is cancelling only those Firmware
event works which are pending in Firmware event workqueue. It is not
cancelling the work which is running. With this patch driver cancels the
running work also.

Issue Description:

Even though it is not recommended to issue back to back host reset without
any delay, but if someone issues back to back host reset then we observe
that target devices gets unregistered and re-register with SML.
And if OS drive is behind the HBA then when it get unregistered, than
file-system goes into read-only mode. Normally during host reset driver
marks the target device as responding (if they are accessible) and add the
event 'MPT3SAS_REMOVE_UNRESPONDING_DEVICES' to remove the non-responding
devices through FW worker thread. while processing this event driver
unregistered the non-responding devices and clears the responding flag for
all the devices.

The reason why target devices are getting unregistered during successive host
resets is that during the host reset driver has to cleanup all the
outstanding FW event work (both queued one and the currently processing one)
but actually driver is cleaning only the queued events. So if
MPT3SAS_REMOVE_UNRESPONDING_DEVICES event is currently under process then
this event is not getting cleaned up, so at end of all successive host
reset this same event is getting processed more than once. And after the
event got processed for the first time, all the target devices responding
flag is cleared, so when the same is processed for the second time it see
that responding flag is zero, so driver unregistered all the target drives
even-though drives are responding. If driver cleanups the current running
work along with pending work, this type of behavior won't be observed.

Signed-off-by: Suganath Prabu S <suganath-prabu.subramani@broadcom.com>
---
 drivers/scsi/mpt3sas/mpt3sas_base.h  |  4 ++++
 drivers/scsi/mpt3sas/mpt3sas_scsih.c | 17 ++++++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h
index 4fca393..4ed704c 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.h
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.h
@@ -1036,6 +1036,8 @@  typedef void (*MPT3SAS_FLUSH_RUNNING_CMDS)(struct MPT3SAS_ADAPTER *ioc);
  * @firmware_event_thread: ""
  * @fw_event_lock:
  * @fw_event_list: list of fw events
+ * @current_evet: current processing firmware event
+ * @fw_event_cleanup: set to one while cleaning up the fw events
  * @aen_event_read_flag: event log was read
  * @broadcast_aen_busy: broadcast aen waiting to be serviced
  * @shost_recovery: host reset in progress
@@ -1217,6 +1219,8 @@  struct MPT3SAS_ADAPTER {
 	struct workqueue_struct	*firmware_event_thread;
 	spinlock_t	fw_event_lock;
 	struct list_head fw_event_list;
+	struct fw_event_work	*current_event;
+	u8		fw_events_cleanup;
 
 	 /* misc flags */
 	int		aen_event_read_flag;
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 08fc4b3..66b29d4 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -3323,11 +3323,13 @@  _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
 {
 	struct fw_event_work *fw_event;
 
-	if (list_empty(&ioc->fw_event_list) ||
+	if ((list_empty(&ioc->fw_event_list) && !ioc->current_event) ||
 	     !ioc->firmware_event_thread || in_interrupt())
 		return;
 
-	while ((fw_event = dequeue_next_fw_event(ioc))) {
+	ioc->fw_events_cleanup = 1;
+	while ((fw_event = dequeue_next_fw_event(ioc)) ||
+	     (fw_event = ioc->current_event)) {
 		/*
 		 * Wait on the fw_event to complete. If this returns 1, then
 		 * the event was never executed, and we need a put for the
@@ -3341,6 +3343,7 @@  _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
 
 		fw_event_work_put(fw_event);
 	}
+	ioc->fw_events_cleanup = 0;
 }
 
 /**
@@ -9421,11 +9424,13 @@  mpt3sas_scsih_reset_done_handler(struct MPT3SAS_ADAPTER *ioc)
 static void
 _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
 {
+	ioc->current_event = fw_event;
 	_scsih_fw_event_del_from_list(ioc, fw_event);
 
 	/* the queue is being flushed so ignore this event */
 	if (ioc->remove_host || ioc->pci_error_recovery) {
 		fw_event_work_put(fw_event);
+		ioc->current_event = NULL;
 		return;
 	}
 
@@ -9439,10 +9444,10 @@  _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
 		while (scsi_host_in_recovery(ioc->shost) ||
 					 ioc->shost_recovery) {
 			/*
-			 * If we're unloading, bail. Otherwise, this can become
-			 * an infinite loop.
+			 * If we're unloading or cancelling the work, bail.
+			 * Otherwise, this can become an infinite loop.
 			 */
-			if (ioc->remove_host)
+			if (ioc->remove_host || ioc->fw_events_cleanup)
 				goto out;
 			ssleep(1);
 		}
@@ -9503,11 +9508,13 @@  _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
 		break;
 	case MPI2_EVENT_PCIE_TOPOLOGY_CHANGE_LIST:
 		_scsih_pcie_topology_change_event(ioc, fw_event);
+		ioc->current_event = NULL;
 			return;
 	break;
 	}
 out:
 	fw_event_work_put(fw_event);
+	ioc->current_event = NULL;
 }
 
 /**