diff mbox series

[2/2] block: pre-calculate max_zone_append_sectors

Message ID 20241104073955.112324-3-hch@lst.de (mailing list archive)
State New
Headers show
Series [1/2] block: remove the max_zone_append_sectors check in blk_revalidate_disk_zones | expand

Commit Message

Christoph Hellwig Nov. 4, 2024, 7:39 a.m. UTC
max_zone_append_sectors differs from all other queue limits in that the
final value used is not stored in the queue_limits but needs to be
obtained using queue_limits_max_zone_append_sectors helper.  This not
only adds (tiny) extra overhead to the I/O path, but also can be easily
forgotten in file system code.

Add a new max_hw_zone_append_sectors value to queue_limits which is
set by the driver, and calculate max_zone_append_sectors from that and
the other inputs in blk_validate_zoned_limits, similar to how
max_sectors is calculated to fix this.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-core.c               |  2 +-
 block/blk-merge.c              |  3 +--
 block/blk-settings.c           | 25 ++++++++++++-------------
 block/blk-sysfs.c              | 17 +++--------------
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c       |  2 +-
 drivers/block/virtio_blk.c     |  2 +-
 drivers/md/dm-zone.c           |  4 ++--
 drivers/nvme/host/multipath.c  |  2 +-
 drivers/nvme/host/zns.c        |  2 +-
 drivers/scsi/sd_zbc.c          |  2 --
 include/linux/blkdev.h         | 21 +++------------------
 12 files changed, 27 insertions(+), 57 deletions(-)

Comments

Keith Busch Nov. 6, 2024, 6:41 p.m. UTC | #1
On Mon, Nov 04, 2024 at 08:39:32AM +0100, Christoph Hellwig wrote:
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 6a15873055b9..c26cb7d3a2e5 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -636,7 +636,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>  	if (head->ids.csi == NVME_CSI_ZNS)
>  		lim.features |= BLK_FEAT_ZONED;
>  	else
> -		lim.max_zone_append_sectors = 0;
> +		lim.max_hw_zone_append_sectors = 0;

I think you need to continue clearing max_zone_append_sectors here. The
initial stack limits sets max_zone_append_sectors to UINT_MAX, and
blk_validate_zoned_limits() wants it to be zero.
Klara Modin Nov. 6, 2024, 9:50 p.m. UTC | #2
On 2024-11-06 19:41, Keith Busch wrote:
> On Mon, Nov 04, 2024 at 08:39:32AM +0100, Christoph Hellwig wrote:
>> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
>> index 6a15873055b9..c26cb7d3a2e5 100644
>> --- a/drivers/nvme/host/multipath.c
>> +++ b/drivers/nvme/host/multipath.c
>> @@ -636,7 +636,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>>   	if (head->ids.csi == NVME_CSI_ZNS)
>>   		lim.features |= BLK_FEAT_ZONED;
>>   	else
>> -		lim.max_zone_append_sectors = 0;
>> +		lim.max_hw_zone_append_sectors = 0;
> 
> I think you need to continue clearing max_zone_append_sectors here. The
> initial stack limits sets max_zone_append_sectors to UINT_MAX, and
> blk_validate_zoned_limits() wants it to be zero.
> 

This appears to be the case. I hit this on a 32-bit x86 machine. 
Clearing max_zone_append_sectors here as well resolves the issue for me.

Regards,
Klara Modin
WARNING: CPU: 0 PID: 21 at block/blk-settings.c:75 blk_validate_limits (block/blk-settings.c:75 (discriminator 1) block/blk-settings.c:366 (discriminator 1))
CPU: 0 UID: 0 PID: 21 Comm: kworker/u4:3 Not tainted 6.12.0-rc6-next-20241106-pentium-mid-00030-gd9dcafd90d18 #17
Workqueue: async async_run_entry_fn
EIP: blk_validate_limits (block/blk-settings.c:75 (discriminator 1) block/blk-settings.c:366 (discriminator 1))
Code: e9 a8 fd ff ff 83 bb 90 00 00 00 00 74 07 0f 0b e9 98 fd ff ff 83 7b 60 00 74 07 0f 0b e9 8b fd ff ff 31 c0 83 7b 54 00 74 0e <0f> 0b e9 7c fd ff ff 0f 0b e9 75 fd ff ff 8d 65 f4 5b 5e 5f 5d c3
All code
========
   0:	e9 a8 fd ff ff       	jmp    0xfffffffffffffdad
   5:	83 bb 90 00 00 00 00 	cmpl   $0x0,0x90(%rbx)
   c:	74 07                	je     0x15
   e:	0f 0b                	ud2
  10:	e9 98 fd ff ff       	jmp    0xfffffffffffffdad
  15:	83 7b 60 00          	cmpl   $0x0,0x60(%rbx)
  19:	74 07                	je     0x22
  1b:	0f 0b                	ud2
  1d:	e9 8b fd ff ff       	jmp    0xfffffffffffffdad
  22:	31 c0                	xor    %eax,%eax
  24:	83 7b 54 00          	cmpl   $0x0,0x54(%rbx)
  28:	74 0e                	je     0x38
  2a:*	0f 0b                	ud2		<-- trapping instruction
  2c:	e9 7c fd ff ff       	jmp    0xfffffffffffffdad
  31:	0f 0b                	ud2
  33:	e9 75 fd ff ff       	jmp    0xfffffffffffffdad
  38:	8d 65 f4             	lea    -0xc(%rbp),%esp
  3b:	5b                   	pop    %rbx
  3c:	5e                   	pop    %rsi
  3d:	5f                   	pop    %rdi
  3e:	5d                   	pop    %rbp
  3f:	c3                   	ret

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2
   2:	e9 7c fd ff ff       	jmp    0xfffffffffffffd83
   7:	0f 0b                	ud2
   9:	e9 75 fd ff ff       	jmp    0xfffffffffffffd83
   e:	8d 65 f4             	lea    -0xc(%rbp),%esp
  11:	5b                   	pop    %rbx
  12:	5e                   	pop    %rsi
  13:	5f                   	pop    %rdi
  14:	5d                   	pop    %rbp
  15:	c3                   	ret
EAX: 00000000 EBX: c11a7d3c ECX: 00000200 EDX: 00000000
ESI: ffffffff EDI: ffffffff EBP: c11a7c58 ESP: c11a7c40
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 EFLAGS: 00010286
CR0: 80050033 CR2: bffd9c6c CR3: 055a7000 CR4: 00000010
Call Trace:
? show_regs (arch/x86/kernel/dumpstack.c:479 arch/x86/kernel/dumpstack.c:465)
? blk_validate_limits (block/blk-settings.c:75 (discriminator 1) block/blk-settings.c:366 (discriminator 1))
? __warn (kernel/panic.c:748)
? report_bug (lib/bug.c:201 lib/bug.c:219)
? __bio_advance (block/bio.c:1419)
? blk_validate_limits (block/blk-settings.c:75 (discriminator 1) block/blk-settings.c:366 (discriminator 1))
? exc_overflow (arch/x86/kernel/traps.c:301)
? handle_bug (arch/x86/kernel/traps.c:285)
? exc_invalid_op (arch/x86/kernel/traps.c:309 (discriminator 1))
? handle_exception (arch/x86/entry/entry_32.S:1055)
? exc_overflow (arch/x86/kernel/traps.c:301)
? blk_validate_limits (block/blk-settings.c:75 (discriminator 1) block/blk-settings.c:366 (discriminator 1))
? exc_overflow (arch/x86/kernel/traps.c:301)
? blk_validate_limits (block/blk-settings.c:75 (discriminator 1) block/blk-settings.c:366 (discriminator 1))
? __kmalloc_cache_noprof (mm/slub.c:4105 mm/slub.c:4153 mm/slub.c:4309)
blk_set_default_limits (block/blk-settings.c:383)
blk_alloc_queue (block/blk-core.c:417)
__blk_alloc_disk (block/genhd.c:1431 (discriminator 4))
nvme_mpath_alloc_disk (drivers/nvme/host/multipath.c:641 (discriminator 1))
nvme_alloc_ns (drivers/nvme/host/core.c:3650 drivers/nvme/host/core.c:3751 drivers/nvme/host/core.c:3857)
nvme_scan_ns (drivers/nvme/host/core.c:4058)
nvme_scan_ns_async (drivers/nvme/host/core.c:4087)
async_run_entry_fn (arch/x86/include/asm/irqflags.h:26 arch/x86/include/asm/irqflags.h:87 arch/x86/include/asm/irqflags.h:123 kernel/async.c:136)
process_scheduled_works (kernel/workqueue.c:3235 kernel/workqueue.c:3310)
worker_thread (include/linux/list.h:373 (discriminator 2) kernel/workqueue.c:946 (discriminator 2) kernel/workqueue.c:3392 (discriminator 2))
kthread (kernel/kthread.c:391)
? rescuer_thread (kernel/workqueue.c:3337)
? kthread_park (kernel/kthread.c:342)
ret_from_fork (arch/x86/kernel/process.c:153)
? kthread_park (kernel/kthread.c:342)
ret_from_fork_asm (arch/x86/entry/entry_32.S:737)
entry_INT80_32 (arch/x86/entry/entry_32.S:945)
diff mbox series

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 09d10bb95fda..5df4607321ca 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -607,7 +607,7 @@  static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 		return BLK_STS_IOERR;
 
 	/* Make sure the BIO is small enough and will not get split */
-	if (nr_sectors > queue_max_zone_append_sectors(q))
+	if (nr_sectors > q->limits.max_zone_append_sectors)
 		return BLK_STS_IOERR;
 
 	bio->bi_opf |= REQ_NOMERGE;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4e6c0a52009c..cdb2d6748033 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -387,11 +387,10 @@  struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 struct bio *bio_split_zone_append(struct bio *bio,
 		const struct queue_limits *lim, unsigned *nr_segs)
 {
-	unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
 	int split_sectors;
 
 	split_sectors = bio_split_rw_at(bio, lim, nr_segs,
-			max_sectors << SECTOR_SHIFT);
+			lim->max_zone_append_sectors << SECTOR_SHIFT);
 	if (WARN_ON_ONCE(split_sectors > 0))
 		split_sectors = -EINVAL;
 	return bio_submit_split(bio, split_sectors);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 95fc39d09872..d729a771a940 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,17 +91,16 @@  static int blk_validate_zoned_limits(struct queue_limits *lim)
 	if (lim->zone_write_granularity < lim->logical_block_size)
 		lim->zone_write_granularity = lim->logical_block_size;
 
-	if (lim->max_zone_append_sectors) {
-		/*
-		 * The Zone Append size is limited by the maximum I/O size
-		 * and the zone size given that it can't span zones.
-		 */
-		lim->max_zone_append_sectors =
-			min3(lim->max_hw_sectors,
-			     lim->max_zone_append_sectors,
-			     lim->chunk_sectors);
-	}
-
+	/*
+	 * The Zone Append size is limited by the maximum I/O size and the zone
+	 * size given that it can't span zones.
+	 *
+	 * If no max_hw_zone_append_sectors limit is provided, the block layer
+	 * will emulated it, else we're also bound by the hardware limit.
+	 */
+	lim->max_zone_append_sectors =
+		min_not_zero(lim->max_hw_zone_append_sectors,
+			min(lim->chunk_sectors, lim->max_hw_sectors));
 	return 0;
 }
 
@@ -527,8 +526,8 @@  int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
-	t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
-					 queue_limits_max_zone_append_sectors(b));
+	t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
+					b->max_hw_zone_append_sectors);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 741b95dfdbf6..d9f22122ae2f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -131,6 +131,7 @@  QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
 
 #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field)			\
 static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)	\
@@ -178,18 +179,6 @@  static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
 	return ret;
 }
 
-/*
- * For zone append queue_max_zone_append_sectors does not just return the
- * underlying queue limits, but actually contains a calculation.  Because of
- * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here.
- */
-static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page)
-{
-	return sprintf(page, "%llu\n",
-		(u64)queue_max_zone_append_sectors(disk->queue) <<
-			SECTOR_SHIFT);
-}
-
 static ssize_t
 queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
 {
@@ -479,7 +468,7 @@  QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
 
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
@@ -607,7 +596,7 @@  static struct attribute *queue_attrs[] = {
 	&queue_atomic_write_unit_max_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_max_write_zeroes_sectors_entry.attr,
-	&queue_zone_append_max_entry.attr,
+	&queue_max_zone_append_sectors_entry.attr,
 	&queue_zone_write_granularity_entry.attr,
 	&queue_rotational_entry.attr,
 	&queue_zoned_entry.attr,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 9bc768b2ca56..0d5f9bf95229 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -166,7 +166,7 @@  int null_init_zoned_dev(struct nullb_device *dev,
 
 	lim->features |= BLK_FEAT_ZONED;
 	lim->chunk_sectors = dev->zone_size_sects;
-	lim->max_zone_append_sectors = dev->zone_append_max_sectors;
+	lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors;
 	lim->max_open_zones = dev->zone_max_open;
 	lim->max_active_zones = dev->zone_max_active;
 	return 0;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 59951e7c2593..8d938b2b41ee 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2270,7 +2270,7 @@  static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 		lim.features |= BLK_FEAT_ZONED;
 		lim.max_active_zones = p->max_active_zones;
 		lim.max_open_zones =  p->max_open_zones;
-		lim.max_zone_append_sectors = p->max_zone_append_sectors;
+		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
 	}
 
 	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 194417abc105..0e99a4714928 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -784,7 +784,7 @@  static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
 			wg, v);
 		return -ENODEV;
 	}
-	lim->max_zone_append_sectors = v;
+	lim->max_hw_zone_append_sectors = v;
 	dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
 
 	return 0;
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index c0d41c36e06e..20edd3fabbab 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -344,7 +344,7 @@  int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
 		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
 	} else {
 		set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
-		lim->max_zone_append_sectors = 0;
+		lim->max_hw_zone_append_sectors = 0;
 	}
 
 	/*
@@ -379,7 +379,7 @@  int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
 	if (!zlim.mapped_nr_seq_zones) {
 		lim->max_open_zones = 0;
 		lim->max_active_zones = 0;
-		lim->max_zone_append_sectors = 0;
+		lim->max_hw_zone_append_sectors = 0;
 		lim->zone_write_granularity = 0;
 		lim->chunk_sectors = 0;
 		lim->features &= ~BLK_FEAT_ZONED;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6a15873055b9..c26cb7d3a2e5 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -636,7 +636,7 @@  int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (head->ids.csi == NVME_CSI_ZNS)
 		lim.features |= BLK_FEAT_ZONED;
 	else
-		lim.max_zone_append_sectors = 0;
+		lim.max_hw_zone_append_sectors = 0;
 
 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
 	if (IS_ERR(head->disk))
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 9a06f9d98cd6..382949e18c6a 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -111,7 +111,7 @@  void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
 	lim->features |= BLK_FEAT_ZONED;
 	lim->max_open_zones = zi->max_open_zones;
 	lim->max_active_zones = zi->max_active_zones;
-	lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
+	lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append;
 	lim->chunk_sectors = ns->head->zsze =
 		nvme_lba_to_sect(ns->head, zi->zone_size);
 }
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index ee2b74238758..de5c54c057ec 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -634,8 +634,6 @@  int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim,
 		lim->max_open_zones = sdkp->zones_max_open;
 	lim->max_active_zones = 0;
 	lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks);
-	/* Enable block layer zone append emulation */
-	lim->max_zone_append_sectors = 0;
 
 	return 0;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7bfc877e159e..6d1413bd69a5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -375,6 +375,7 @@  struct queue_limits {
 	unsigned int		max_user_discard_sectors;
 	unsigned int		max_secure_erase_sectors;
 	unsigned int		max_write_zeroes_sectors;
+	unsigned int		max_hw_zone_append_sectors;
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
@@ -1204,25 +1205,9 @@  static inline unsigned int queue_max_segment_size(const struct request_queue *q)
 	return q->limits.max_segment_size;
 }
 
-static inline unsigned int
-queue_limits_max_zone_append_sectors(const struct queue_limits *l)
-{
-	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
-
-	return min_not_zero(l->max_zone_append_sectors, max_sectors);
-}
-
-static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
-{
-	if (!blk_queue_is_zoned(q))
-		return 0;
-
-	return queue_limits_max_zone_append_sectors(&q->limits);
-}
-
 static inline bool queue_emulates_zone_append(struct request_queue *q)
 {
-	return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+	return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors;
 }
 
 static inline bool bdev_emulates_zone_append(struct block_device *bdev)
@@ -1233,7 +1218,7 @@  static inline bool bdev_emulates_zone_append(struct block_device *bdev)
 static inline unsigned int
 bdev_max_zone_append_sectors(struct block_device *bdev)
 {
-	return queue_max_zone_append_sectors(bdev_get_queue(bdev));
+	return bdev_limits(bdev)->max_zone_append_sectors;
 }
 
 static inline unsigned int bdev_max_segments(struct block_device *bdev)