diff mbox series

[v12,06/20] scsi: Have scsi-ml retry sd_spinup_disk errors

Message ID 20231114013750.76609-7-michael.christie@oracle.com (mailing list archive)
State Changes Requested
Headers show
Series scsi: Allow scsi_execute users to request retries | expand

Commit Message

Mike Christie Nov. 14, 2023, 1:37 a.m. UTC
This simplifies sd_spinup_disk so scsi-ml retries errors for it. Note that
we retried specifically on a UA and also if scsi_status_is_good returned
failed which would happen for all check conditions. In this patch we use
SCMD_FAILURE_STAT_ANY which will trigger for the same conditions as
when scsi_status_is_good returns false and there is status. This will
cover all CCs including UAs so there is no explicit failures arrary entry
for UAs.

There is one behavior change where we no longer retry when
scsi_execute_cmd returns < 0, but we should be ok. We don't need to retry
for failures like the queue being removed, and for the case where there
are no tags/reqs the block layer waits/retries for us. For possible memory
allocation failures from blk_rq_map_kern we use GFP_NOIO, so retrying
will probably not help.

We do not handle the outside loop's retries because we want to sleep
between tries and we don't support that yet.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/scsi/sd.c | 77 +++++++++++++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 32 deletions(-)

Comments

John Garry Nov. 16, 2023, 12:13 p.m. UTC | #1
On 14/11/2023 01:37, Mike Christie wrote:

Hi Mike,

> This simplifies sd_spinup_disk so scsi-ml retries errors for it. Note that
> we retried specifically on a UA and also if scsi_status_is_good returned
> failed which would happen for all check conditions. In this patch we use
> SCMD_FAILURE_STAT_ANY which will trigger for the same conditions as
> when scsi_status_is_good returns false and there is status. This will
> cover all CCs including UAs so there is no explicit failures arrary entry

/s/arrary/array/

> for UAs.

But the first failure_defs member is for UNIT_ATTENTION, below, right?

Thanks,
John

> 
> There is one behavior change where we no longer retry when
> scsi_execute_cmd returns < 0, but we should be ok. We don't need to retry
> for failures like the queue being removed, and for the case where there
> are no tags/reqs the block layer waits/retries for us. For possible memory
> allocation failures from blk_rq_map_kern we use GFP_NOIO, so retrying
> will probably not help.
> 
> We do not handle the outside loop's retries because we want to sleep
> between tries and we don't support that yet.
> 
> Signed-off-by: Mike Christie<michael.christie@oracle.com>
> ---
>   drivers/scsi/sd.c | 77 +++++++++++++++++++++++++++--------------------
>   1 file changed, 45 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 641f9c9c0674..cda0d029ab7f 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -2235,55 +2235,68 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>   static void
>   sd_spinup_disk(struct scsi_disk *sdkp)
>   {
> -	unsigned char cmd[10];
> +	static const u8 cmd[10] = { TEST_UNIT_READY };
>   	unsigned long spintime_expire = 0;
> -	int retries, spintime;
> +	int spintime, sense_valid = 0;
>   	unsigned int the_result;
>   	struct scsi_sense_hdr sshdr;
> +	struct scsi_failure failure_defs[] = {
> +		/* Do not retry Medium Not Present */
> +		{
> +			.sense = UNIT_ATTENTION,
> +			.asc = 0x3A,
> +			.ascq = SCMD_FAILURE_ASCQ_ANY,
> +			.result = SAM_STAT_CHECK_CONDITION,
> +		},
> +		{
> +			.sense = NOT_READY,
> +			.asc = 0x3A,
> +			.ascq = SCMD_FAILURE_ASCQ_ANY,
> +			.result = SAM_STAT_CHECK_CONDITION,
> +		},
> +		/* Retry when scsi_status_is_good would return false 3 times */
> +		{
> +			.result = SCMD_FAILURE_STAT_ANY,
> +			.allowed = 3,
> +		},
> +		{}
Mike Christie Nov. 16, 2023, 4:44 p.m. UTC | #2
On 11/16/23 6:13 AM, John Garry wrote:
> On 14/11/2023 01:37, Mike Christie wrote:
> 
> Hi Mike,
> 
>> This simplifies sd_spinup_disk so scsi-ml retries errors for it. Note that
>> we retried specifically on a UA and also if scsi_status_is_good returned
>> failed which would happen for all check conditions. In this patch we use
>> SCMD_FAILURE_STAT_ANY which will trigger for the same conditions as
>> when scsi_status_is_good returns false and there is status. This will
>> cover all CCs including UAs so there is no explicit failures arrary entry
> 
> /s/arrary/array/
> 
>> for UAs.
> 
> But the first failure_defs member is for UNIT_ATTENTION, below, right?
> 

I should not have written "on a UA" above. We want to retry every UA
except that first entries in that array.

This first and second entry says if we see it then fail. However, if
it's not that specific failure value, then the last entry
SCMD_FAILURE_STAT_ANY kicks in and scsi_execute_cmd will retry.

It's so for this chunk:

-			the_result = scsi_execute_cmd(sdkp->device, cmd,
-						      REQ_OP_DRV_IN, NULL, 0,
-						      SD_TIMEOUT,
-						      sdkp->max_retries,
-						      &exec_args);
 
-			if (the_result > 0) {
-				/*
-				 * If the drive has indicated to us that it
-				 * doesn't have any media in it, don't bother
-				 * with any more polling.
-				 */
-				if (media_not_present(sdkp, &sshdr)) {
-					if (media_was_present)
-						sd_printk(KERN_NOTICE, sdkp,
-							  "Media removed, stopped polling\n");
-					return;
-				}

we will not retry if we get media_not_present but then retry here:

-		} while (retries < 3 &&
-			 (!scsi_status_is_good(the_result) ||
-			  (scsi_status_is_check_condition(the_result) &&
-			  sense_valid && sshdr.sense_key == UNIT_ATTENTION)));



>> +    struct scsi_failure failure_defs[] = {
>> +        /* Do not retry Medium Not Present */
>> +        {
>> +            .sense = UNIT_ATTENTION,
>> +            .asc = 0x3A,
>> +            .ascq = SCMD_FAILURE_ASCQ_ANY,
>> +            .result = SAM_STAT_CHECK_CONDITION,
>> +        },
>> +        {
>> +            .sense = NOT_READY,
>> +            .asc = 0x3A,
>> +            .ascq = SCMD_FAILURE_ASCQ_ANY,
>> +            .result = SAM_STAT_CHECK_CONDITION,
>> +        },
>> +        /* Retry when scsi_status_is_good would return false 3 times */
>> +        {
>> +            .result = SCMD_FAILURE_STAT_ANY,
>> +            .allowed = 3,
>> +        },
>> +        {}
>
diff mbox series

Patch

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 641f9c9c0674..cda0d029ab7f 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2235,55 +2235,68 @@  static int sd_done(struct scsi_cmnd *SCpnt)
 static void
 sd_spinup_disk(struct scsi_disk *sdkp)
 {
-	unsigned char cmd[10];
+	static const u8 cmd[10] = { TEST_UNIT_READY };
 	unsigned long spintime_expire = 0;
-	int retries, spintime;
+	int spintime, sense_valid = 0;
 	unsigned int the_result;
 	struct scsi_sense_hdr sshdr;
+	struct scsi_failure failure_defs[] = {
+		/* Do not retry Medium Not Present */
+		{
+			.sense = UNIT_ATTENTION,
+			.asc = 0x3A,
+			.ascq = SCMD_FAILURE_ASCQ_ANY,
+			.result = SAM_STAT_CHECK_CONDITION,
+		},
+		{
+			.sense = NOT_READY,
+			.asc = 0x3A,
+			.ascq = SCMD_FAILURE_ASCQ_ANY,
+			.result = SAM_STAT_CHECK_CONDITION,
+		},
+		/* Retry when scsi_status_is_good would return false 3 times */
+		{
+			.result = SCMD_FAILURE_STAT_ANY,
+			.allowed = 3,
+		},
+		{}
+	};
+	struct scsi_failures failures = {
+		.failure_definitions = failure_defs,
+	};
 	const struct scsi_exec_args exec_args = {
 		.sshdr = &sshdr,
+		.failures = &failures,
 	};
-	int sense_valid = 0;
 
 	spintime = 0;
 
 	/* Spin up drives, as required.  Only do this at boot time */
 	/* Spinup needs to be done for module loads too. */
 	do {
-		retries = 0;
+		bool media_was_present = sdkp->media_present;
 
-		do {
-			bool media_was_present = sdkp->media_present;
+		scsi_reset_failures(&failures);
 
-			cmd[0] = TEST_UNIT_READY;
-			memset((void *) &cmd[1], 0, 9);
+		the_result = scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN,
+					      NULL, 0, SD_TIMEOUT,
+					      sdkp->max_retries, &exec_args);
 
-			the_result = scsi_execute_cmd(sdkp->device, cmd,
-						      REQ_OP_DRV_IN, NULL, 0,
-						      SD_TIMEOUT,
-						      sdkp->max_retries,
-						      &exec_args);
 
-			if (the_result > 0) {
-				/*
-				 * If the drive has indicated to us that it
-				 * doesn't have any media in it, don't bother
-				 * with any more polling.
-				 */
-				if (media_not_present(sdkp, &sshdr)) {
-					if (media_was_present)
-						sd_printk(KERN_NOTICE, sdkp,
-							  "Media removed, stopped polling\n");
-					return;
-				}
-
-				sense_valid = scsi_sense_valid(&sshdr);
+		if (the_result > 0) {
+			/*
+			 * If the drive has indicated to us that it doesn't
+			 * have any media in it, don't bother with any more
+			 * polling.
+			 */
+			if (media_not_present(sdkp, &sshdr)) {
+				if (media_was_present)
+					sd_printk(KERN_NOTICE, sdkp,
+						  "Media removed, stopped polling\n");
+				return;
 			}
-			retries++;
-		} while (retries < 3 &&
-			 (!scsi_status_is_good(the_result) ||
-			  (scsi_status_is_check_condition(the_result) &&
-			  sense_valid && sshdr.sense_key == UNIT_ATTENTION)));
+			sense_valid = scsi_sense_valid(&sshdr);
+		}
 
 		if (!scsi_status_is_check_condition(the_result)) {
 			/* no sense, TUR either succeeded or failed