diff mbox series

[v3,02/17] scsi: iscsi: sync libiscsi and driver reset cleanup

Message ID 20210416020440.259271-3-michael.christie@oracle.com (mailing list archive)
State Changes Requested
Headers show
Series libicsi and qedi TMF fixes | expand

Commit Message

Mike Christie April 16, 2021, 2:04 a.m. UTC
If we are handling a sg io reset there is a small window where cmds can
sneak into iscsi_queuecommand even though libiscsi has sent a TMF to the
driver. This does seems to not be a problem for normal drivers because they
know exactly what was sent to the FW. For libiscsi this is a problem
because it knows what it has sent to the driver but not what the driver
sent to the FW. When we go to cleanup the running tasks, libiscsi might
cleanup the sneaky cmd but the driver/FQ may not have seen the sneaky cmd
and it's left running in FW.

This has libiscsi just stop queueing cmds when it sends the TMF down to the
driver. Both then know they see the same set of cmds.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/scsi/libiscsi.c | 104 +++++++++++++++++++++++++++-------------
 1 file changed, 72 insertions(+), 32 deletions(-)

Comments

Lee Duncan April 17, 2021, 5:22 p.m. UTC | #1
On 4/15/21 7:04 PM, Mike Christie wrote:
> If we are handling a sg io reset there is a small window where cmds can
> sneak into iscsi_queuecommand even though libiscsi has sent a TMF to the
> driver. This does seems to not be a problem for normal drivers because they
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"doesn't seem to be a problem"?

> know exactly what was sent to the FW. For libiscsi this is a problem
> because it knows what it has sent to the driver but not what the driver
> sent to the FW. When we go to cleanup the running tasks, libiscsi might
> cleanup the sneaky cmd but the driver/FQ may not have seen the sneaky cmd

FO?

> and it's left running in FW.
> 
> This has libiscsi just stop queueing cmds when it sends the TMF down to the
> driver. Both then know they see the same set of cmds.
> 
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/scsi/libiscsi.c | 104 +++++++++++++++++++++++++++-------------
>  1 file changed, 72 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
> index 4834219497ee..aa5ceaffc697 100644
> --- a/drivers/scsi/libiscsi.c
> +++ b/drivers/scsi/libiscsi.c
> @@ -1669,29 +1669,11 @@ enum {
>  	FAILURE_SESSION_NOT_READY,
>  };
>  
> -int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
> +static bool iscsi_eh_running(struct iscsi_conn *conn, struct scsi_cmnd *sc,
> +			     int *reason)
>  {
> -	struct iscsi_cls_session *cls_session;
> -	struct iscsi_host *ihost;
> -	int reason = 0;
> -	struct iscsi_session *session;
> -	struct iscsi_conn *conn;
> -	struct iscsi_task *task = NULL;
> -
> -	sc->result = 0;
> -	sc->SCp.ptr = NULL;
> -
> -	ihost = shost_priv(host);
> -
> -	cls_session = starget_to_session(scsi_target(sc->device));
> -	session = cls_session->dd_data;
> -	spin_lock_bh(&session->frwd_lock);
> -
> -	reason = iscsi_session_chkready(cls_session);
> -	if (reason) {
> -		sc->result = reason;
> -		goto fault;
> -	}
> +	struct iscsi_session *session = conn->session;
> +	struct iscsi_tm *tmf;
>  
>  	if (session->state != ISCSI_STATE_LOGGED_IN) {
>  		/*
> @@ -1707,31 +1689,92 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
>  			 * state is bad, allowing completion to happen
>  			 */
>  			if (unlikely(system_state != SYSTEM_RUNNING)) {
> -				reason = FAILURE_SESSION_FAILED;
> +				*reason = FAILURE_SESSION_FAILED;
>  				sc->result = DID_NO_CONNECT << 16;
>  				break;
>  			}
>  			fallthrough;
>  		case ISCSI_STATE_IN_RECOVERY:
> -			reason = FAILURE_SESSION_IN_RECOVERY;
> +			*reason = FAILURE_SESSION_IN_RECOVERY;
>  			sc->result = DID_IMM_RETRY << 16;
>  			break;
>  		case ISCSI_STATE_LOGGING_OUT:
> -			reason = FAILURE_SESSION_LOGGING_OUT;
> +			*reason = FAILURE_SESSION_LOGGING_OUT;
>  			sc->result = DID_IMM_RETRY << 16;
>  			break;
>  		case ISCSI_STATE_RECOVERY_FAILED:
> -			reason = FAILURE_SESSION_RECOVERY_TIMEOUT;
> +			*reason = FAILURE_SESSION_RECOVERY_TIMEOUT;
>  			sc->result = DID_TRANSPORT_FAILFAST << 16;
>  			break;
>  		case ISCSI_STATE_TERMINATE:
> -			reason = FAILURE_SESSION_TERMINATE;
> +			*reason = FAILURE_SESSION_TERMINATE;
>  			sc->result = DID_NO_CONNECT << 16;
>  			break;
>  		default:
> -			reason = FAILURE_SESSION_FREED;
> +			*reason = FAILURE_SESSION_FREED;
>  			sc->result = DID_NO_CONNECT << 16;
>  		}
> +		goto eh_running;
> +	}
> +
> +	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {
> +		*reason = FAILURE_SESSION_IN_RECOVERY;
> +		sc->result = DID_REQUEUE << 16;
> +		goto eh_running;
> +	}
> +
> +	/*
> +	 * For sg resets make sure libiscsi, the drivers and their fw see the
> +	 * same cmds. Once we get a TMF that can affect multiple cmds stop
> +	 * queueing.
> +	 */
> +	if (conn->tmf_state != TMF_INITIAL) {
> +		tmf = &conn->tmhdr;
> +
> +		switch (ISCSI_TM_FUNC_VALUE(tmf)) {
> +		case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:
> +			if (sc->device->lun != scsilun_to_int(&tmf->lun))
> +				break;
> +
> +			ISCSI_DBG_EH(conn->session,
> +				     "Requeue cmd sent during LU RESET processing.\n");
> +			sc->result = DID_REQUEUE << 16;
> +			goto eh_running;
> +		case ISCSI_TM_FUNC_TARGET_WARM_RESET:
> +			ISCSI_DBG_EH(conn->session,
> +				     "Requeue cmd sent during TARGET RESET processing.\n");
> +			sc->result = DID_REQUEUE << 16;
> +			goto eh_running;
> +		}

Is it common to have no default case? I thought the compiler disliked
that. If the compiler and convention are fine with this, I'm fine with
it too.

> +	}
> +
> +	return false;
> +
> +eh_running:
> +	return true;
> +}
> +
> +int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
> +{
> +	struct iscsi_cls_session *cls_session;
> +	struct iscsi_host *ihost;
> +	int reason = 0;
> +	struct iscsi_session *session;
> +	struct iscsi_conn *conn;
> +	struct iscsi_task *task = NULL;
> +
> +	sc->result = 0;
> +	sc->SCp.ptr = NULL;
> +
> +	ihost = shost_priv(host);
> +
> +	cls_session = starget_to_session(scsi_target(sc->device));
> +	session = cls_session->dd_data;
> +	spin_lock_bh(&session->frwd_lock);
> +
> +	reason = iscsi_session_chkready(cls_session);
> +	if (reason) {
> +		sc->result = reason;
>  		goto fault;
>  	}
>  
> @@ -1742,11 +1785,8 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
>  		goto fault;
>  	}
>  
> -	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {
> -		reason = FAILURE_SESSION_IN_RECOVERY;
> -		sc->result = DID_REQUEUE << 16;
> +	if (iscsi_eh_running(conn, sc, &reason))
>  		goto fault;
> -	}
>  
>  	if (iscsi_check_cmdsn_window_closed(conn)) {
>  		reason = FAILURE_WINDOW_CLOSED;
>
Mike Christie April 17, 2021, 5:26 p.m. UTC | #2
On 4/17/21 12:22 PM, Lee Duncan wrote:
> On 4/15/21 7:04 PM, Mike Christie wrote:
>> If we are handling a sg io reset there is a small window where cmds can
>> sneak into iscsi_queuecommand even though libiscsi has sent a TMF to the
>> driver. This does seems to not be a problem for normal drivers because they
>                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> "doesn't seem to be a problem"?

Yeah.

> 
>> know exactly what was sent to the FW. For libiscsi this is a problem
>> because it knows what it has sent to the driver but not what the driver
>> sent to the FW. When we go to cleanup the running tasks, libiscsi might
>> cleanup the sneaky cmd but the driver/FQ may not have seen the sneaky cmd
> 
> FO?
> 

FW.

>> and it's left running in FW.
>>
>> This has libiscsi just stop queueing cmds when it sends the TMF down to the
>> driver. Both then know they see the same set of cmds.
>>
>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>> ---
>>  drivers/scsi/libiscsi.c | 104 +++++++++++++++++++++++++++-------------
>>  1 file changed, 72 insertions(+), 32 deletions(-)
>>
>> diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
>> index 4834219497ee..aa5ceaffc697 100644
>> --- a/drivers/scsi/libiscsi.c
>> +++ b/drivers/scsi/libiscsi.c
>> @@ -1669,29 +1669,11 @@ enum {
>>  	FAILURE_SESSION_NOT_READY,
>>  };
>>  
>> -int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
>> +static bool iscsi_eh_running(struct iscsi_conn *conn, struct scsi_cmnd *sc,
>> +			     int *reason)
>>  {
>> -	struct iscsi_cls_session *cls_session;
>> -	struct iscsi_host *ihost;
>> -	int reason = 0;
>> -	struct iscsi_session *session;
>> -	struct iscsi_conn *conn;
>> -	struct iscsi_task *task = NULL;
>> -
>> -	sc->result = 0;
>> -	sc->SCp.ptr = NULL;
>> -
>> -	ihost = shost_priv(host);
>> -
>> -	cls_session = starget_to_session(scsi_target(sc->device));
>> -	session = cls_session->dd_data;
>> -	spin_lock_bh(&session->frwd_lock);
>> -
>> -	reason = iscsi_session_chkready(cls_session);
>> -	if (reason) {
>> -		sc->result = reason;
>> -		goto fault;
>> -	}
>> +	struct iscsi_session *session = conn->session;
>> +	struct iscsi_tm *tmf;
>>  
>>  	if (session->state != ISCSI_STATE_LOGGED_IN) {
>>  		/*
>> @@ -1707,31 +1689,92 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
>>  			 * state is bad, allowing completion to happen
>>  			 */
>>  			if (unlikely(system_state != SYSTEM_RUNNING)) {
>> -				reason = FAILURE_SESSION_FAILED;
>> +				*reason = FAILURE_SESSION_FAILED;
>>  				sc->result = DID_NO_CONNECT << 16;
>>  				break;
>>  			}
>>  			fallthrough;
>>  		case ISCSI_STATE_IN_RECOVERY:
>> -			reason = FAILURE_SESSION_IN_RECOVERY;
>> +			*reason = FAILURE_SESSION_IN_RECOVERY;
>>  			sc->result = DID_IMM_RETRY << 16;
>>  			break;
>>  		case ISCSI_STATE_LOGGING_OUT:
>> -			reason = FAILURE_SESSION_LOGGING_OUT;
>> +			*reason = FAILURE_SESSION_LOGGING_OUT;
>>  			sc->result = DID_IMM_RETRY << 16;
>>  			break;
>>  		case ISCSI_STATE_RECOVERY_FAILED:
>> -			reason = FAILURE_SESSION_RECOVERY_TIMEOUT;
>> +			*reason = FAILURE_SESSION_RECOVERY_TIMEOUT;
>>  			sc->result = DID_TRANSPORT_FAILFAST << 16;
>>  			break;
>>  		case ISCSI_STATE_TERMINATE:
>> -			reason = FAILURE_SESSION_TERMINATE;
>> +			*reason = FAILURE_SESSION_TERMINATE;
>>  			sc->result = DID_NO_CONNECT << 16;
>>  			break;
>>  		default:
>> -			reason = FAILURE_SESSION_FREED;
>> +			*reason = FAILURE_SESSION_FREED;
>>  			sc->result = DID_NO_CONNECT << 16;
>>  		}
>> +		goto eh_running;
>> +	}
>> +
>> +	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {
>> +		*reason = FAILURE_SESSION_IN_RECOVERY;
>> +		sc->result = DID_REQUEUE << 16;
>> +		goto eh_running;
>> +	}
>> +
>> +	/*
>> +	 * For sg resets make sure libiscsi, the drivers and their fw see the
>> +	 * same cmds. Once we get a TMF that can affect multiple cmds stop
>> +	 * queueing.
>> +	 */
>> +	if (conn->tmf_state != TMF_INITIAL) {
>> +		tmf = &conn->tmhdr;
>> +
>> +		switch (ISCSI_TM_FUNC_VALUE(tmf)) {
>> +		case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:
>> +			if (sc->device->lun != scsilun_to_int(&tmf->lun))
>> +				break;
>> +
>> +			ISCSI_DBG_EH(conn->session,
>> +				     "Requeue cmd sent during LU RESET processing.\n");
>> +			sc->result = DID_REQUEUE << 16;
>> +			goto eh_running;
>> +		case ISCSI_TM_FUNC_TARGET_WARM_RESET:
>> +			ISCSI_DBG_EH(conn->session,
>> +				     "Requeue cmd sent during TARGET RESET processing.\n");
>> +			sc->result = DID_REQUEUE << 16;
>> +			goto eh_running;
>> +		}
> 
> Is it common to have no default case? I thought the compiler disliked
> that. If the compiler and convention are fine with this, I'm fine with
> it too.
> 

There is no compiler warnings or errors.
diff mbox series

Patch

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 4834219497ee..aa5ceaffc697 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -1669,29 +1669,11 @@  enum {
 	FAILURE_SESSION_NOT_READY,
 };
 
-int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
+static bool iscsi_eh_running(struct iscsi_conn *conn, struct scsi_cmnd *sc,
+			     int *reason)
 {
-	struct iscsi_cls_session *cls_session;
-	struct iscsi_host *ihost;
-	int reason = 0;
-	struct iscsi_session *session;
-	struct iscsi_conn *conn;
-	struct iscsi_task *task = NULL;
-
-	sc->result = 0;
-	sc->SCp.ptr = NULL;
-
-	ihost = shost_priv(host);
-
-	cls_session = starget_to_session(scsi_target(sc->device));
-	session = cls_session->dd_data;
-	spin_lock_bh(&session->frwd_lock);
-
-	reason = iscsi_session_chkready(cls_session);
-	if (reason) {
-		sc->result = reason;
-		goto fault;
-	}
+	struct iscsi_session *session = conn->session;
+	struct iscsi_tm *tmf;
 
 	if (session->state != ISCSI_STATE_LOGGED_IN) {
 		/*
@@ -1707,31 +1689,92 @@  int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
 			 * state is bad, allowing completion to happen
 			 */
 			if (unlikely(system_state != SYSTEM_RUNNING)) {
-				reason = FAILURE_SESSION_FAILED;
+				*reason = FAILURE_SESSION_FAILED;
 				sc->result = DID_NO_CONNECT << 16;
 				break;
 			}
 			fallthrough;
 		case ISCSI_STATE_IN_RECOVERY:
-			reason = FAILURE_SESSION_IN_RECOVERY;
+			*reason = FAILURE_SESSION_IN_RECOVERY;
 			sc->result = DID_IMM_RETRY << 16;
 			break;
 		case ISCSI_STATE_LOGGING_OUT:
-			reason = FAILURE_SESSION_LOGGING_OUT;
+			*reason = FAILURE_SESSION_LOGGING_OUT;
 			sc->result = DID_IMM_RETRY << 16;
 			break;
 		case ISCSI_STATE_RECOVERY_FAILED:
-			reason = FAILURE_SESSION_RECOVERY_TIMEOUT;
+			*reason = FAILURE_SESSION_RECOVERY_TIMEOUT;
 			sc->result = DID_TRANSPORT_FAILFAST << 16;
 			break;
 		case ISCSI_STATE_TERMINATE:
-			reason = FAILURE_SESSION_TERMINATE;
+			*reason = FAILURE_SESSION_TERMINATE;
 			sc->result = DID_NO_CONNECT << 16;
 			break;
 		default:
-			reason = FAILURE_SESSION_FREED;
+			*reason = FAILURE_SESSION_FREED;
 			sc->result = DID_NO_CONNECT << 16;
 		}
+		goto eh_running;
+	}
+
+	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {
+		*reason = FAILURE_SESSION_IN_RECOVERY;
+		sc->result = DID_REQUEUE << 16;
+		goto eh_running;
+	}
+
+	/*
+	 * For sg resets make sure libiscsi, the drivers and their fw see the
+	 * same cmds. Once we get a TMF that can affect multiple cmds stop
+	 * queueing.
+	 */
+	if (conn->tmf_state != TMF_INITIAL) {
+		tmf = &conn->tmhdr;
+
+		switch (ISCSI_TM_FUNC_VALUE(tmf)) {
+		case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:
+			if (sc->device->lun != scsilun_to_int(&tmf->lun))
+				break;
+
+			ISCSI_DBG_EH(conn->session,
+				     "Requeue cmd sent during LU RESET processing.\n");
+			sc->result = DID_REQUEUE << 16;
+			goto eh_running;
+		case ISCSI_TM_FUNC_TARGET_WARM_RESET:
+			ISCSI_DBG_EH(conn->session,
+				     "Requeue cmd sent during TARGET RESET processing.\n");
+			sc->result = DID_REQUEUE << 16;
+			goto eh_running;
+		}
+	}
+
+	return false;
+
+eh_running:
+	return true;
+}
+
+int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
+{
+	struct iscsi_cls_session *cls_session;
+	struct iscsi_host *ihost;
+	int reason = 0;
+	struct iscsi_session *session;
+	struct iscsi_conn *conn;
+	struct iscsi_task *task = NULL;
+
+	sc->result = 0;
+	sc->SCp.ptr = NULL;
+
+	ihost = shost_priv(host);
+
+	cls_session = starget_to_session(scsi_target(sc->device));
+	session = cls_session->dd_data;
+	spin_lock_bh(&session->frwd_lock);
+
+	reason = iscsi_session_chkready(cls_session);
+	if (reason) {
+		sc->result = reason;
 		goto fault;
 	}
 
@@ -1742,11 +1785,8 @@  int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
 		goto fault;
 	}
 
-	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {
-		reason = FAILURE_SESSION_IN_RECOVERY;
-		sc->result = DID_REQUEUE << 16;
+	if (iscsi_eh_running(conn, sc, &reason))
 		goto fault;
-	}
 
 	if (iscsi_check_cmdsn_window_closed(conn)) {
 		reason = FAILURE_WINDOW_CLOSED;