[v8,1/9] block: Introduce more member variables related to zone write locking

Message ID 20230811213604.548235-2-bvanassche@acm.org (mailing list archive)
State New, archived
Series Improve performance for zoned UFS devices

Commit Message

Bart Van Assche Aug. 11, 2023, 9:35 p.m. UTC
Many but not all storage controllers require serialization of zoned writes.
Introduce a new request queue limit member variable
(driver_preserves_write_order) that allows block drivers to indicate that
the order of write commands is preserved and hence that serialization of
writes per zone is not required.

Make disk_set_zoned() set 'use_zone_write_lock' only if the block device
has zones and if the block driver does not preserve the order of write
requests.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-settings.c   |  7 +++++++
 include/linux/blkdev.h | 10 ++++++++++
 2 files changed, 17 insertions(+)
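
For context, here is a minimal sketch (not part of this patch) of how a block
driver whose controller preserves the order of write commands could use the
new limit. The helper function and its name below are hypothetical; only the
two calls on the request queue limits mirror what this patch adds.

#include <linux/blkdev.h>

/*
 * Hypothetical configuration helper for a driver whose controller dispatches
 * write commands in submission order, so zone write locking is not needed.
 */
static void example_configure_zoned_disk(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	/* Tell the block layer that write order is preserved. */
	q->limits.driver_preserves_write_order = true;

	/*
	 * With the flag set, disk_set_zoned() leaves use_zone_write_lock
	 * false for this host-managed zoned disk.
	 */
	disk_set_zoned(disk, BLK_ZONED_HM);
}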

Comments

Damien Le Moal Aug. 14, 2023, 12:32 p.m. UTC | #1
On 8/12/23 06:35, Bart Van Assche wrote:
> Many but not all storage controllers require serialization of zoned writes.
> Introduce a new request queue limit member variable
> (driver_preserves_write_order) that allows block drivers to indicate that
> the order of write commands is preserved and hence that serialization of
> writes per zone is not required.
> 
> Make disk_set_zoned() set 'use_zone_write_lock' only if the block device
> has zones and if the block driver does not preserve the order of write
> requests.
> 
> Cc: Damien Le Moal <dlemoal@kernel.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  block/blk-settings.c   |  7 +++++++
>  include/linux/blkdev.h | 10 ++++++++++
>  2 files changed, 17 insertions(+)
> 
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 0046b447268f..3a7748af1bef 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -56,6 +56,8 @@ void blk_set_default_limits(struct queue_limits *lim)
>  	lim->alignment_offset = 0;
>  	lim->io_opt = 0;
>  	lim->misaligned = 0;
> +	lim->driver_preserves_write_order = false;
> +	lim->use_zone_write_lock = false;
>  	lim->zoned = BLK_ZONED_NONE;
>  	lim->zone_write_granularity = 0;
>  	lim->dma_alignment = 511;
> @@ -685,6 +687,9 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
>  						   b->max_secure_erase_sectors);
>  	t->zone_write_granularity = max(t->zone_write_granularity,
>  					b->zone_write_granularity);
> +	/* Request-based stacking drivers do not reorder requests. */
> +	t->driver_preserves_write_order = b->driver_preserves_write_order;
> +	t->use_zone_write_lock = b->use_zone_write_lock;

I do not think this is correct as the last target of a multi target device will
dictate the result, regardless of the other targets. So this should be something
like:

	t->driver_preserves_write_order = t->driver_preserves_write_order &&
		b->driver_preserves_write_order;
	t->use_zone_write_lock =
		t->use_zone_write_lock || b->use_zone_write_lock;

However, given that driver_preserves_write_order is initialized as false, this
would always be false. Not sure how to handle that...

>  	t->zoned = max(t->zoned, b->zoned);
>  	return ret;
>  }
> @@ -949,6 +954,8 @@ void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
>  	}
>  
>  	q->limits.zoned = model;
> +	q->limits.use_zone_write_lock = model != BLK_ZONED_NONE &&
> +		!q->limits.driver_preserves_write_order;

I think this needs a comment to explain the condition as it takes a while to
understand it.

>  	if (model != BLK_ZONED_NONE) {
>  		/*
>  		 * Set the zone write granularity to the device logical block

You also should set use_zone_write_lock to false in disk_clear_zone_settings().

In patch 9, ufshcd_auto_hibern8_update() changes the value of
driver_preserves_write_order, which will change the value of use_zone_write_lock
only if disk_set_zoned() is called again after ufshcd_auto_hibern8_update(). Is
that the case? Is the drive always revalidated after
ufshcd_auto_hibern8_update() is executed?

> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 2f5371b8482c..2c090a28ec78 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -316,6 +316,16 @@ struct queue_limits {
>  	unsigned char		misaligned;
>  	unsigned char		discard_misaligned;
>  	unsigned char		raid_partial_stripes_expensive;
> +	/*
> +	 * Whether or not the block driver preserves the order of write
> +	 * requests. Set by the block driver.
> +	 */
> +	bool			driver_preserves_write_order;
> +	/*
> +	 * Whether or not zone write locking should be used. Set by
> +	 * disk_set_zoned().
> +	 */
> +	bool			use_zone_write_lock;
>  	enum blk_zoned_model	zoned;
>  
>  	/*
Bart Van Assche Aug. 14, 2023, 4:57 p.m. UTC | #2
On 8/14/23 05:32, Damien Le Moal wrote:
> On 8/12/23 06:35, Bart Van Assche wrote:
>> --- a/block/blk-settings.c
>> +++ b/block/blk-settings.c
>> @@ -56,6 +56,8 @@ void blk_set_default_limits(struct queue_limits *lim)
>>   	lim->alignment_offset = 0;
>>   	lim->io_opt = 0;
>>   	lim->misaligned = 0;
>> +	lim->driver_preserves_write_order = false;
>> +	lim->use_zone_write_lock = false;
>>   	lim->zoned = BLK_ZONED_NONE;
>>   	lim->zone_write_granularity = 0;
>>   	lim->dma_alignment = 511;
>> @@ -685,6 +687,9 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
>>   						   b->max_secure_erase_sectors);
>>   	t->zone_write_granularity = max(t->zone_write_granularity,
>>   					b->zone_write_granularity);
>> +	/* Request-based stacking drivers do not reorder requests. */
>> +	t->driver_preserves_write_order = b->driver_preserves_write_order;
>> +	t->use_zone_write_lock = b->use_zone_write_lock;
> 
> I do not think this is correct as the last target of a multi target device will
> dictate the result, regardless of the other targets. So this should be something
> like:
> 
> 	t->driver_preserves_write_order = t->driver_preserves_write_order &&
> 		b->driver_preserves_write_order;
> 	t->use_zone_write_lock =
> 		t->use_zone_write_lock || b->use_zone_write_lock;
> 
> However, given that driver_preserves_write_order is initialized as false, this
> would always be false. Not sure how to handle that...

How about integrating the (untested) change below into this patch? It keeps
the default value of driver_preserves_write_order at false for regular block
drivers and changes the default to true for stacking drivers:

--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -84,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
  	lim->max_dev_sectors = UINT_MAX;
  	lim->max_write_zeroes_sectors = UINT_MAX;
  	lim->max_zone_append_sectors = UINT_MAX;
+	lim->driver_preserves_write_order = true;
  }
  EXPORT_SYMBOL(blk_set_stacking_limits);

@@ -688,8 +689,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
  	t->zone_write_granularity = max(t->zone_write_granularity,
  					b->zone_write_granularity);
  	/* Request-based stacking drivers do not reorder requests. */
-	t->driver_preserves_write_order = b->driver_preserves_write_order;
-	t->use_zone_write_lock = b->use_zone_write_lock;
+	t->driver_preserves_write_order = t->driver_preserves_write_order &&
+		b->driver_preserves_write_order;
+	t->use_zone_write_lock = t->use_zone_write_lock ||
+		b->use_zone_write_lock;
  	t->zoned = max(t->zoned, b->zoned);
  	return ret;
  }


>>   	t->zoned = max(t->zoned, b->zoned);
>>   	return ret;
>>   }
>> @@ -949,6 +954,8 @@ void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
>>   	}
>>   
>>   	q->limits.zoned = model;
>> +	q->limits.use_zone_write_lock = model != BLK_ZONED_NONE &&
>> +		!q->limits.driver_preserves_write_order;
> 
> I think this needs a comment to explain the condition as it takes a while to
> understand it.

Something like this?

	/*
	 * Use the zone write lock only for zoned block devices and only if
	 * the block driver does not preserve the order of write commands.
	 */

>>   	if (model != BLK_ZONED_NONE) {
>>   		/*
>>   		 * Set the zone write granularity to the device logical block
> 
> You also should set use_zone_write_lock to false in disk_clear_zone_settings().

I will do this.

> In patch 9, ufshcd_auto_hibern8_update() changes the value of
> driver_preserves_write_order, which will change the value of use_zone_write_lock
> only if disk_set_zoned() is called again after ufshcd_auto_hibern8_update(). Is
> that the case? Is the drive always revalidated after
> ufshcd_auto_hibern8_update() is executed?

I will start with testing this change on top of this patch series:

--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4350,6 +4350,7 @@ static void ufshcd_update_preserves_write_order(struct ufs_hba *hba,
  		blk_mq_freeze_queue_wait(q);
  		q->limits.driver_preserves_write_order = preserves_write_order;
  		blk_mq_unfreeze_queue(q);
+		scsi_rescan_device(&sdev->sdev_gendev);
  	}
  }

Thanks,

Bart.
Damien Le Moal Aug. 15, 2023, 2:01 a.m. UTC | #3
On 8/15/23 01:57, Bart Van Assche wrote:
> On 8/14/23 05:32, Damien Le Moal wrote:
>> On 8/12/23 06:35, Bart Van Assche wrote:
>>> --- a/block/blk-settings.c
>>> +++ b/block/blk-settings.c
>>> @@ -56,6 +56,8 @@ void blk_set_default_limits(struct queue_limits *lim)
>>>   	lim->alignment_offset = 0;
>>>   	lim->io_opt = 0;
>>>   	lim->misaligned = 0;
>>> +	lim->driver_preserves_write_order = false;
>>> +	lim->use_zone_write_lock = false;
>>>   	lim->zoned = BLK_ZONED_NONE;
>>>   	lim->zone_write_granularity = 0;
>>>   	lim->dma_alignment = 511;
>>> @@ -685,6 +687,9 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
>>>   						   b->max_secure_erase_sectors);
>>>   	t->zone_write_granularity = max(t->zone_write_granularity,
>>>   					b->zone_write_granularity);
>>> +	/* Request-based stacking drivers do not reorder requests. */
>>> +	t->driver_preserves_write_order = b->driver_preserves_write_order;
>>> +	t->use_zone_write_lock = b->use_zone_write_lock;
>>
>> I do not think this is correct as the last target of a multi target device will
>> dictate the result, regardless of the other targets. So this should be something
>> like:
>>
>> 	t->driver_preserves_write_order = t->driver_preserves_write_order &&
>> 		b->driver_preserves_write_order;
>> 	t->use_zone_write_lock =
>> 		t->use_zone_write_lock || b->use_zone_write_lock;
>>
>> However, given that driver_preserves_write_order is initialized as false, this
>> would always be false. Not sure how to handle that...
> 
> How about integrating the (untested) change below into this patch? It keeps
> the default value of driver_preserves_write_order at false for regular block
> drivers and changes the default to true for stacking drivers:
> 
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -84,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
>   	lim->max_dev_sectors = UINT_MAX;
>   	lim->max_write_zeroes_sectors = UINT_MAX;
>   	lim->max_zone_append_sectors = UINT_MAX;
> +	lim->driver_preserves_write_order = true;
>   }
>   EXPORT_SYMBOL(blk_set_stacking_limits);
> 
> @@ -688,8 +689,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
>   	t->zone_write_granularity = max(t->zone_write_granularity,
>   					b->zone_write_granularity);
>   	/* Request-based stacking drivers do not reorder requests. */
> -	t->driver_preserves_write_order = b->driver_preserves_write_order;
> -	t->use_zone_write_lock = b->use_zone_write_lock;
> +	t->driver_preserves_write_order = t->driver_preserves_write_order &&
> +		b->driver_preserves_write_order;
> +	t->use_zone_write_lock = t->use_zone_write_lock ||
> +		b->use_zone_write_lock;
>   	t->zoned = max(t->zoned, b->zoned);
>   	return ret;
>   }

I think that should work. Testing/checking this with dm-linear by combining
different null-blk devices with different configs should be easy enough.

> 
> 
>>>   	t->zoned = max(t->zoned, b->zoned);
>>>   	return ret;
>>>   }
>>> @@ -949,6 +954,8 @@ void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
>>>   	}
>>>   
>>>   	q->limits.zoned = model;
>>> +	q->limits.use_zone_write_lock = model != BLK_ZONED_NONE &&
>>> +		!q->limits.driver_preserves_write_order;
>>
>> I think this needs a comment to explain the condition as it takes a while to
>> understand it.
> 
> Something like this?
> 
> 	/*
> 	 * Use the zone write lock only for zoned block devices and only if
> 	 * the block driver does not preserve the order of write commands.
> 	 */

That works for me.

> 
>>>   	if (model != BLK_ZONED_NONE) {
>>>   		/*
>>>   		 * Set the zone write granularity to the device logical block
>>
>> You also should set use_zone_write_lock to false in disk_clear_zone_settings().
> 
> I will do this.
> 
>> In patch 9, ufshcd_auto_hibern8_update() changes the value of
>> driver_preserves_write_order, which will change the value of use_zone_write_lock
>> only if disk_set_zoned() is called again after ufshcd_auto_hibern8_update(). Is
>> that the case? Is the drive always revalidated after
>> ufshcd_auto_hibern8_update() is executed?
> 
> I will start with testing this change on top of this patch series:
> 
> --- a/drivers/ufs/core/ufshcd.c
> +++ b/drivers/ufs/core/ufshcd.c
> @@ -4350,6 +4350,7 @@ static void ufshcd_update_preserves_write_order(struct ufs_hba *hba,
>   		blk_mq_freeze_queue_wait(q);
>   		q->limits.driver_preserves_write_order = preserves_write_order;
>   		blk_mq_unfreeze_queue(q);
> +		scsi_rescan_device(&sdev->sdev_gendev);

Maybe calling disk_set_zoned() before blk_mq_unfreeze_queue() should be enough?
A rescan/revalidate is done when there is a topology change event (or
equivalent), which I think is not the case here.

>   	}
>   }
> 
> Thanks,
> 
> Bart.
> 
>
Bart Van Assche Aug. 15, 2023, 4:06 p.m. UTC | #4
On 8/14/23 19:01, Damien Le Moal wrote:
> On 8/15/23 01:57, Bart Van Assche wrote:
>> --- a/drivers/ufs/core/ufshcd.c
>> +++ b/drivers/ufs/core/ufshcd.c
>> @@ -4350,6 +4350,7 @@ static void ufshcd_update_preserves_write_order(struct ufs_hba *hba,
>>    		blk_mq_freeze_queue_wait(q);
>>    		q->limits.driver_preserves_write_order = preserves_write_order;
>>    		blk_mq_unfreeze_queue(q);
>> +		scsi_rescan_device(&sdev->sdev_gendev);
> 
> Maybe calling disk_set_zoned() before blk_mq_unfreeze_queue() should be enough?
> A rescan/revalidate is done when there is a topology change event (or
> equivalent), which I think is not the case here.

Hi Damien,

I will look into calling disk_set_zoned() before blk_mq_unfreeze_queue().

Thanks,

Bart.
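
A minimal sketch of that ordering, based on the patch 9 hunk quoted above (the
freeze/unfreeze pair is simplified relative to the posted series, and how the
gendisk pointer 'disk' for the SCSI device is obtained is an assumption made
only for illustration):

	blk_mq_freeze_queue(q);
	q->limits.driver_preserves_write_order = preserves_write_order;
	/* Re-evaluate use_zone_write_lock while the queue is still frozen. */
	if (disk)
		disk_set_zoned(disk, q->limits.zoned);
	blk_mq_unfreeze_queue(q);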

Patch

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0046b447268f..3a7748af1bef 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -56,6 +56,8 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->alignment_offset = 0;
 	lim->io_opt = 0;
 	lim->misaligned = 0;
+	lim->driver_preserves_write_order = false;
+	lim->use_zone_write_lock = false;
 	lim->zoned = BLK_ZONED_NONE;
 	lim->zone_write_granularity = 0;
 	lim->dma_alignment = 511;
@@ -685,6 +687,9 @@  int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 						   b->max_secure_erase_sectors);
 	t->zone_write_granularity = max(t->zone_write_granularity,
 					b->zone_write_granularity);
+	/* Request-based stacking drivers do not reorder requests. */
+	t->driver_preserves_write_order = b->driver_preserves_write_order;
+	t->use_zone_write_lock = b->use_zone_write_lock;
 	t->zoned = max(t->zoned, b->zoned);
 	return ret;
 }
@@ -949,6 +954,8 @@  void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 	}
 
 	q->limits.zoned = model;
+	q->limits.use_zone_write_lock = model != BLK_ZONED_NONE &&
+		!q->limits.driver_preserves_write_order;
 	if (model != BLK_ZONED_NONE) {
 		/*
 		 * Set the zone write granularity to the device logical block
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2f5371b8482c..2c090a28ec78 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -316,6 +316,16 @@  struct queue_limits {
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		raid_partial_stripes_expensive;
+	/*
+	 * Whether or not the block driver preserves the order of write
+	 * requests. Set by the block driver.
+	 */
+	bool			driver_preserves_write_order;
+	/*
+	 * Whether or not zone write locking should be used. Set by
+	 * disk_set_zoned().
+	 */
+	bool			use_zone_write_lock;
 	enum blk_zoned_model	zoned;
 
 	/*