diff mbox series

[v4,5/7] scsi: Retry unaligned zoned writes

Message ID 20230726193440.1655149-6-bvanassche@acm.org (mailing list archive)
State New, archived
Headers show
Series Improve the performance of F2FS on zoned UFS | expand

Commit Message

Bart Van Assche July 26, 2023, 7:34 p.m. UTC
If zoned writes (REQ_OP_WRITE) for a sequential write required zone have
a starting LBA that differs from the write pointer, e.g. because zoned
writes have been reordered, then the storage device will respond with an
UNALIGNED WRITE COMMAND error. Send commands that failed with an
unaligned write error to the SCSI error handler. Let the SCSI error
handler sort SCSI commands per LBA before resubmitting these.

If zone write locking is disabled, increase the number of retries for
write commands sent to a sequential zone to the maximum number of
outstanding commands because in the worst case the number of times
reordered zoned writes have to be retried is (number of outstanding
writes per sequential zone) - 1.

Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 drivers/scsi/scsi_error.c | 38 ++++++++++++++++++++++++++++++++++++++
 drivers/scsi/scsi_lib.c   |  1 +
 drivers/scsi/sd.c         |  2 ++
 include/scsi/scsi.h       |  1 +
 4 files changed, 42 insertions(+)

Comments

Damien Le Moal July 27, 2023, 12:33 a.m. UTC | #1
On 7/27/23 04:34, Bart Van Assche wrote:
> If zoned writes (REQ_OP_WRITE) for a sequential write required zone have
> a starting LBA that differs from the write pointer, e.g. because zoned
> writes have been reordered, then the storage device will respond with an
> UNALIGNED WRITE COMMAND error. Send commands that failed with an
> unaligned write error to the SCSI error handler. Let the SCSI error
> handler sort SCSI commands per LBA before resubmitting these.

This last sentence reads as if the retry is done for all cases. That is not
true. It is done only and only if zone write locking is disabled, that is for
requests with REQ_NO_ZONE_WRITE_LOCK and a request queue with
QUEUE_FLAG_NO_ZONE_WRITE_LOCK set.

> If zone write locking is disabled, increase the number of retries for
> write commands sent to a sequential zone to the maximum number of
> outstanding commands because in the worst case the number of times
> reordered zoned writes have to be retried is (number of outstanding
> writes per sequential zone) - 1.
> 
> Cc: Martin K. Petersen <martin.petersen@oracle.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Damien Le Moal <dlemoal@kernel.org>
> Cc: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  drivers/scsi/scsi_error.c | 38 ++++++++++++++++++++++++++++++++++++++
>  drivers/scsi/scsi_lib.c   |  1 +
>  drivers/scsi/sd.c         |  2 ++
>  include/scsi/scsi.h       |  1 +
>  4 files changed, 42 insertions(+)
> 
> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
> index c67cdcdc3ba8..9dc354a24d4b 100644
> --- a/drivers/scsi/scsi_error.c
> +++ b/drivers/scsi/scsi_error.c
> @@ -27,6 +27,7 @@
>  #include <linux/blkdev.h>
>  #include <linux/delay.h>
>  #include <linux/jiffies.h>
> +#include <linux/list_sort.h>
>  
>  #include <scsi/scsi.h>
>  #include <scsi/scsi_cmnd.h>
> @@ -698,6 +699,17 @@ enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd)
>  		fallthrough;
>  
>  	case ILLEGAL_REQUEST:
> +		/*
> +		 * Unaligned write command. This indicates that zoned writes
> +		 * have been received by the device in the wrong order. If zone
> +		 * write locking is disabled, retry after all pending commands
> +		 * have completed.
> +		 */
> +		if (sshdr.asc == 0x21 && sshdr.ascq == 0x04 &&
> +		    blk_no_zone_write_lock(scsi_cmd_to_rq(scmd)) &&
> +		    !scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd))
> +			return NEEDS_DELAYED_RETRY;
> +
>  		if (sshdr.asc == 0x20 || /* Invalid command operation code */
>  		    sshdr.asc == 0x21 || /* Logical block address out of range */
>  		    sshdr.asc == 0x22 || /* Invalid function */
> @@ -2223,6 +2235,25 @@ void scsi_eh_flush_done_q(struct list_head *done_q)
>  }
>  EXPORT_SYMBOL(scsi_eh_flush_done_q);
>  
> +/*
> + * Returns a negative value if @_a has a lower LBA than @_b, zero if
> + * both have the same LBA and a positive value otherwise.
> + */
> +static int scsi_cmp_lba(void *priv, const struct list_head *_a,
> +			const struct list_head *_b)
> +{
> +	struct scsi_cmnd *a = list_entry(_a, typeof(*a), eh_entry);
> +	struct scsi_cmnd *b = list_entry(_b, typeof(*b), eh_entry);
> +	const sector_t pos_a = blk_rq_pos(scsi_cmd_to_rq(a));
> +	const sector_t pos_b = blk_rq_pos(scsi_cmd_to_rq(b));
> +
> +	if (pos_a < pos_b)
> +		return -1;
> +	if (pos_a > pos_b)
> +		return 1;
> +	return 0;
> +}
> +
>  /**
>   * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
>   * @shost:	Host to unjam.
> @@ -2258,6 +2289,13 @@ static void scsi_unjam_host(struct Scsi_Host *shost)
>  
>  	SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
>  
> +	/*
> +	 * Sort pending SCSI commands in LBA order. This is important if one of
> +	 * the SCSI devices associated with @shost is a zoned block device for
> +	 * which zone write locking is disabled.
> +	 */
> +	list_sort(NULL, &eh_work_q, scsi_cmp_lba);
> +
>  	if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
>  		scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
>  
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 59176946ab56..69da8aee13df 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1443,6 +1443,7 @@ static void scsi_complete(struct request *rq)
>  	case ADD_TO_MLQUEUE:
>  		scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
>  		break;
> +	case NEEDS_DELAYED_RETRY:
>  	default:
>  		scsi_eh_scmd_add(cmd);
>  		break;
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 68b12afa0721..7b5877323245 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -1235,6 +1235,8 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
>  	cmd->transfersize = sdp->sector_size;
>  	cmd->underflow = nr_blocks << 9;
>  	cmd->allowed = sdkp->max_retries;
> +	if (blk_no_zone_write_lock(rq) && blk_rq_is_seq_zoned_write(rq))
> +		cmd->allowed += rq->q->nr_requests;

Why += ? Shouldn't this be "=" ?

That said, reading and commenting on your patches gave me a better
understanding of how this works. With that, I am now thinking that
QUEUE_FLAG_NO_ZONE_WRITE_LOCK may be the only thing needed.
My thinking:
1) For well behaved applications correctly issuing writes sequentially per zone
from the write pointer, we will not get any performance benefits for device
with QUEUE_FLAG_NO_ZONE_WRITE_LOCK. But we could if REQ_NO_ZONE_WRITE_LOCK was set.
2) For buggy applications that fail to write zones sequentially, they already
today get an error. That will remain true even for devices with
QUEUE_FLAG_NO_ZONE_WRITE_LOCK and even if we set REQ_NO_ZONE_WRITE_LOCK. The
only difference from today will be that the error return to the application
will be delayed by all the retries, but an error will still be returned,
always, even in the case of an application completely forgetting to issue one
IO (i.e. the application submitted a write stream with a hole in it). There
will be no deadlock given that the retry is within the scsi layer only, not
back to the block layer (which could create deadlocks preventing forward progress).
3) For in-kernel users (f2fs, btrfs, zonefs, dm-zoned), we know writes happen
sequentially, even when stacked on top of zone compliant DMs (dm-crypt,
dm-linear). They all could set REQ_NO_ZONE_WRITE_LOCK for devices with
QUEUE_FLAG_NO_ZONE_WRITE_LOCK and get the same performance boost as you see for
f2fs.

From all this, and given that for (3) REQ_NO_ZONE_WRITE_LOCK is set
unconditionally, it now seems to me that this request flag is useless...
Thoughts ?

>  	cmd->sdb.length = nr_blocks * sdp->sector_size;
>  
>  	SCSI_LOG_HLQUEUE(1,
> diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
> index ec093594ba53..6600db046227 100644
> --- a/include/scsi/scsi.h
> +++ b/include/scsi/scsi.h
> @@ -93,6 +93,7 @@ static inline int scsi_status_is_check_condition(int status)
>   * Internal return values.
>   */
>  enum scsi_disposition {
> +	NEEDS_DELAYED_RETRY	= 0x2000,
>  	NEEDS_RETRY		= 0x2001,
>  	SUCCESS			= 0x2002,
>  	FAILED			= 0x2003,
Bart Van Assche July 27, 2023, 1:03 a.m. UTC | #2
On 7/26/23 17:33, Damien Le Moal wrote:
>  From all this, and given that for (3) REQ_NO_ZONE_WRITE_LOCK is set
> unconditionally, it now seems to me that this request flag is useless...
> Thoughts ?

Hi Damien,

Thanks for having taken a close look at this patch series.

The flag REQ_NO_ZONE_WRITE_LOCK was introduced based on earlier review
feedback. Removing that flag again would help me because that would
allow me to develop a test for the blktest suite that submits I/O
directly to the block device instead of an F2FS filesystem. I like
F2FS but it's probably good to minimize the number of layers when
writing blktest tests.

Thanks,

Bart.
Damien Le Moal July 27, 2023, 1:09 a.m. UTC | #3
On 7/27/23 10:03, Bart Van Assche wrote:
> On 7/26/23 17:33, Damien Le Moal wrote:
>>  From all this, and given that for (3) REQ_NO_ZONE_WRITE_LOCK is set
>> unconditionally, it now seems to me that this request flag is useless...
>> Thoughts ?
> 
> Hi Damien,
> 
> Thanks for having taken a close look at this patch series.
> 
> The flag REQ_NO_ZONE_WRITE_LOCK was introduced based on earlier review
> feedback. Removing that flag again would help me because that would
> allow me to develop a test for the blktest suite that submits I/O
> directly to the block device instead of an F2FS filesystem. I like
> F2FS but it's probably good to minimize the number of layers when
> writing blktest tests.

Sounds good.

If you could also test the series with zonefs, to hammer it some more that
would be good (I unfortunately do not have any zoned UFS devices to run that
myself). You can run zonefs test suite (see
https://github.com/westerndigitalcorporation/zonefs-tools/tree/master/tests).
Simply execute "zonefs-tests.sh /dev/sdX" as root and everything should run
(need zonefs-tools and fio installed).

> 
> Thanks,
> 
> Bart.
>
Bart Van Assche July 27, 2023, 2:53 p.m. UTC | #4
On 7/26/23 18:09, Damien Le Moal wrote:
> If you could also test the series with zonefs, to hammer it some more that
> would be good (I unfortunately do not have any zoned UFS devices to run that
> myself). You can run zonefs test suite (see
> https://github.com/westerndigitalcorporation/zonefs-tools/tree/master/tests).
> Simply execute "zonefs-tests.sh /dev/sdX" as root and everything should run
> (need zonefs-tools and fio installed).

Running zonefs-tests.sh on an Android device might require a 
considerable amount of work. zonefs-tests.sh is a bash script. Bash is 
not available on Android devices. From 
https://android.googlesource.com/platform/system/core/+/master/shell_and_utilities/README.md:
----------------------------------------------------------------------
Since IceCreamSandwich Android has used mksh as its shell. Before then 
it used ash (which actually remained unused in the tree up to and 
including KitKat).

Initially Android had a very limited command-line provided by its own 
“toolbox” binary. Since Marshmallow almost everything is supplied by 
toybox instead.
----------------------------------------------------------------------

Another challenge is that zonefs-tools depends on at least one header 
file that is not included in the Android NDK (blkid/blkid.h).

Thanks,

Bart.
Bart Van Assche July 27, 2023, 8:57 p.m. UTC | #5
On 7/26/23 17:33, Damien Le Moal wrote:
> On 7/27/23 04:34, Bart Van Assche wrote:
>> +	if (blk_no_zone_write_lock(rq) && blk_rq_is_seq_zoned_write(rq))
>> +		cmd->allowed += rq->q->nr_requests;
> 
> Why += ? Shouldn't this be "=" ?
Hi Damien,

I think that the retry mechanism is also used to retry commands for 
which a unit attention was reported. Hence "+=" instead of "=".

Thanks,

Bart.
diff mbox series

Patch

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c67cdcdc3ba8..9dc354a24d4b 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -27,6 +27,7 @@ 
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/jiffies.h>
+#include <linux/list_sort.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
@@ -698,6 +699,17 @@  enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd)
 		fallthrough;
 
 	case ILLEGAL_REQUEST:
+		/*
+		 * Unaligned write command. This indicates that zoned writes
+		 * have been received by the device in the wrong order. If zone
+		 * write locking is disabled, retry after all pending commands
+		 * have completed.
+		 */
+		if (sshdr.asc == 0x21 && sshdr.ascq == 0x04 &&
+		    blk_no_zone_write_lock(scsi_cmd_to_rq(scmd)) &&
+		    !scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd))
+			return NEEDS_DELAYED_RETRY;
+
 		if (sshdr.asc == 0x20 || /* Invalid command operation code */
 		    sshdr.asc == 0x21 || /* Logical block address out of range */
 		    sshdr.asc == 0x22 || /* Invalid function */
@@ -2223,6 +2235,25 @@  void scsi_eh_flush_done_q(struct list_head *done_q)
 }
 EXPORT_SYMBOL(scsi_eh_flush_done_q);
 
+/*
+ * Returns a negative value if @_a has a lower LBA than @_b, zero if
+ * both have the same LBA and a positive value otherwise.
+ */
+static int scsi_cmp_lba(void *priv, const struct list_head *_a,
+			const struct list_head *_b)
+{
+	struct scsi_cmnd *a = list_entry(_a, typeof(*a), eh_entry);
+	struct scsi_cmnd *b = list_entry(_b, typeof(*b), eh_entry);
+	const sector_t pos_a = blk_rq_pos(scsi_cmd_to_rq(a));
+	const sector_t pos_b = blk_rq_pos(scsi_cmd_to_rq(b));
+
+	if (pos_a < pos_b)
+		return -1;
+	if (pos_a > pos_b)
+		return 1;
+	return 0;
+}
+
 /**
  * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
  * @shost:	Host to unjam.
@@ -2258,6 +2289,13 @@  static void scsi_unjam_host(struct Scsi_Host *shost)
 
 	SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
 
+	/*
+	 * Sort pending SCSI commands in LBA order. This is important if one of
+	 * the SCSI devices associated with @shost is a zoned block device for
+	 * which zone write locking is disabled.
+	 */
+	list_sort(NULL, &eh_work_q, scsi_cmp_lba);
+
 	if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
 		scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 59176946ab56..69da8aee13df 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1443,6 +1443,7 @@  static void scsi_complete(struct request *rq)
 	case ADD_TO_MLQUEUE:
 		scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
 		break;
+	case NEEDS_DELAYED_RETRY:
 	default:
 		scsi_eh_scmd_add(cmd);
 		break;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 68b12afa0721..7b5877323245 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1235,6 +1235,8 @@  static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
 	cmd->transfersize = sdp->sector_size;
 	cmd->underflow = nr_blocks << 9;
 	cmd->allowed = sdkp->max_retries;
+	if (blk_no_zone_write_lock(rq) && blk_rq_is_seq_zoned_write(rq))
+		cmd->allowed += rq->q->nr_requests;
 	cmd->sdb.length = nr_blocks * sdp->sector_size;
 
 	SCSI_LOG_HLQUEUE(1,
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index ec093594ba53..6600db046227 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -93,6 +93,7 @@  static inline int scsi_status_is_check_condition(int status)
  * Internal return values.
  */
 enum scsi_disposition {
+	NEEDS_DELAYED_RETRY	= 0x2000,
 	NEEDS_RETRY		= 0x2001,
 	SUCCESS			= 0x2002,
 	FAILED			= 0x2003,