diff mbox series

[8/8] block: set the zone size in blk_revalidate_disk_zones atomically

Message ID 20191203093908.24612-9-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [1/8] null_blk: fix zone size paramter check | expand

Commit Message

Christoph Hellwig Dec. 3, 2019, 9:39 a.m. UTC
The current zone revalidation code has a major problem in that it
doesn't update the zone size and q->nr_zones atomically, leading
to a short window where an out of bounds access to the zone arrays
is possible.

To fix this, move the setting of the zone size into the critical
section of blk_revalidate_disk_zones so that it gets updated together
with the zone bitmaps and q->nr_zones.  This also slightly simplifies
the callers, as the zone size is now deduced from the report_zones results.

This change also allows checking for a power of two zone size in generic
code.

Reported-by: Hans Holmberg <hans@owltronix.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-zoned.c             | 59 ++++++++++++++++++++---------------
 drivers/block/null_blk_main.c |  3 +-
 drivers/scsi/sd_zbc.c         |  2 --
 3 files changed, 35 insertions(+), 29 deletions(-)

Comments

Javier González Dec. 3, 2019, 2 p.m. UTC | #1
On 03.12.2019 10:39, Christoph Hellwig wrote:
>The current zone revalidation code has a major problem in that it
>doesn't update the zone size and q->nr_zones atomically, leading
>to a short window where an out of bounds access to the zone arrays
>is possible.
>
>To fix this move the setting of the zone size into the crticial

nit: s/crticial/critical/

>sections blk_revalidate_disk_zones so that it gets updated together
>with the zone bitmaps and q->nr_zones.  This also slightly simplifies
>the caller as it deducts the zone size from the report_zones.

This part makes sense. Good catch.
>
>This change also allows to check for a power of two zone size in generic
>code.

I think, however, that this check should remain at the driver level, or
at least depend on a flag that signals that the zoned device actually
has a power of two zone size.

>
>Reported-by: Hans Holmberg <hans@owltronix.com>
>Signed-off-by: Christoph Hellwig <hch@lst.de>
>---
> block/blk-zoned.c             | 59 ++++++++++++++++++++---------------
> drivers/block/null_blk_main.c |  3 +-
> drivers/scsi/sd_zbc.c         |  2 --
> 3 files changed, 35 insertions(+), 29 deletions(-)
>
>diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>index 51d427659ce7..d00fcfd71dfe 100644
>--- a/block/blk-zoned.c
>+++ b/block/blk-zoned.c
>@@ -343,6 +343,7 @@ struct blk_revalidate_zone_args {
> 	unsigned long	*conv_zones_bitmap;
> 	unsigned long	*seq_zones_wlock;
> 	unsigned int	nr_zones;
>+	sector_t	zone_sectors;
> 	sector_t	sector;
> };
>
>@@ -355,25 +356,33 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
> 	struct blk_revalidate_zone_args *args = data;
> 	struct gendisk *disk = args->disk;
> 	struct request_queue *q = disk->queue;
>-	sector_t zone_sectors = blk_queue_zone_sectors(q);
> 	sector_t capacity = get_capacity(disk);
>
> 	/*
> 	 * All zones must have the same size, with the exception on an eventual
> 	 * smaller last zone.
> 	 */
>-	if (zone->start + zone_sectors < capacity &&
>-	    zone->len != zone_sectors) {
>-		pr_warn("%s: Invalid zoned device with non constant zone size\n",
>-			disk->disk_name);
>-		return false;
>-	}
>+	if (zone->start == 0) {
>+		if (zone->len == 0 || !is_power_of_2(zone->len)) {
>+			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
>+				disk->disk_name, zone->len);
>+			return -ENODEV;
>+		}
>
>-	if (zone->start + zone->len >= capacity &&
>-	    zone->len > zone_sectors) {
>-		pr_warn("%s: Invalid zoned device with larger last zone size\n",
>-			disk->disk_name);
>-		return -ENODEV;
>+		args->zone_sectors = zone->len;
>+		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
>+	} else if (zone->start + args->zone_sectors < capacity) {
>+		if (zone->len != args->zone_sectors) {
>+			pr_warn("%s: Invalid zoned device with non constant zone size\n",
>+				disk->disk_name);
>+			return -ENODEV;
>+		}
>+	} else {
>+		if (zone->len > args->zone_sectors) {
>+			pr_warn("%s: Invalid zoned device with larger last zone size\n",
>+				disk->disk_name);
>+			return -ENODEV;
>+		}
> 	}
>
> 	/* Check for holes in the zone report */
>@@ -428,9 +437,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
> 	struct request_queue *q = disk->queue;
> 	struct blk_revalidate_zone_args args = {
> 		.disk		= disk,
>-		.nr_zones	= blkdev_nr_zones(disk),
> 	};
>-	int ret = 0;
>+	unsigned int noio_flag;
>+	int ret;
>
> 	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
> 		return -EIO;
>@@ -438,24 +447,22 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
> 		return -EIO;
>
> 	/*
>-	 * Ensure that all memory allocations in this context are done as
>-	 * if GFP_NOIO was specified.
>+	 * Ensure that all memory allocations in this context are done as if
>+	 * GFP_NOIO was specified.
> 	 */
>-	if (args.nr_zones) {
>-		unsigned int noio_flag = memalloc_noio_save();
>-
>-		ret = disk->fops->report_zones(disk, 0, args.nr_zones,
>-					       blk_revalidate_zone_cb, &args);
>-		memalloc_noio_restore(noio_flag);
>-	}
>+	noio_flag = memalloc_noio_save();
>+	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
>+				       blk_revalidate_zone_cb, &args);
>+	memalloc_noio_restore(noio_flag);
>
> 	/*
>-	 * Install the new bitmaps, making sure the queue is stopped and
>-	 * all I/Os are completed (i.e. a scheduler is not referencing the
>-	 * bitmaps).
>+	 * Install the new bitmaps and update nr_zones only once the queue is
>+	 * stopped and all I/Os are completed (i.e. a scheduler is not
>+	 * referencing the bitmaps).
> 	 */
> 	blk_mq_freeze_queue(q);
> 	if (ret >= 0) {
>+		blk_queue_chunk_sectors(q, args.zone_sectors);
> 		q->nr_zones = args.nr_zones;
> 		swap(q->seq_zones_wlock, args.seq_zones_wlock);
> 		swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
>diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
>index 068cd0ae6e2c..997b7dc095b9 100644
>--- a/drivers/block/null_blk_main.c
>+++ b/drivers/block/null_blk_main.c
>@@ -1583,6 +1583,8 @@ static int null_gendisk_register(struct nullb *nullb)
> 			if (ret)
> 				return ret;
> 		} else {
>+			blk_queue_chunk_sectors(nullb->q,
>+					nullb->dev->zone_size_sects);
> 			nullb->q->nr_zones = blkdev_nr_zones(disk);
> 		}
> 	}
>@@ -1746,7 +1748,6 @@ static int null_add_dev(struct nullb_device *dev)
> 		if (rv)
> 			goto out_cleanup_blk_queue;
>
>-		blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
> 		nullb->q->limits.zoned = BLK_ZONED_HM;
> 		blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q);
> 		blk_queue_required_elevator_features(nullb->q,
>diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
>index 0e5ede48f045..27d72c1d4654 100644
>--- a/drivers/scsi/sd_zbc.c
>+++ b/drivers/scsi/sd_zbc.c
>@@ -412,8 +412,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
> 		goto err;
>
> 	/* The drive satisfies the kernel restrictions: set it up */
>-	blk_queue_chunk_sectors(sdkp->disk->queue,
>-			logical_to_sectors(sdkp->device, zone_blocks));
> 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue);
> 	blk_queue_required_elevator_features(sdkp->disk->queue,
> 					     ELEVATOR_F_ZBD_SEQ_WRITE);
>-- 
>2.20.1
>
Christoph Hellwig Dec. 3, 2019, 3:18 p.m. UTC | #2
On Tue, Dec 03, 2019 at 03:00:09PM +0100, Javier González wrote:
>> This change also allows to check for a power of two zone size in generic
>> code.
>
> I think however that this checks should remain at the driver level, or
> at least depend on a flag that signals that the zoned device is actually
> a power of two.

The block layer requires a power of two zone size / chunk size,
including having a BUG_ON for that requirement in blk_queue_chunk_sectors.
I'd much rather have a proper check in the zone code with proper
diagnostics than trigger a BUG_ON.
Javier González Dec. 3, 2019, 3:34 p.m. UTC | #3
> On 3 Dec 2019, at 16.18, Christoph Hellwig <hch@lst.de> wrote:
> 
> On Tue, Dec 03, 2019 at 03:00:09PM +0100, Javier González wrote:
>>> This change also allows to check for a power of two zone size in generic
>>> code.
>> 
>> I think however that this checks should remain at the driver level, or
>> at least depend on a flag that signals that the zoned device is actually
>> a power of two.
> 
> The block layer requires a power of two zone size / chunk size,
> including having a BUG_ON for that requirement blk_queue_chunk_sectors.
> I'd much rather have a proper check in the zone code with proper
> diagnostics than triggering a BUG_ON..

Agreed on the BUG_ON part.  But since you’re looking into this part now, would it make sense to do the check in the block layer only if the driver imposes a power of two zone size? We can also do it down the road, but that seems like double work.
Christoph Hellwig Dec. 3, 2019, 3:42 p.m. UTC | #4
On Tue, Dec 03, 2019 at 04:34:08PM +0100, Javier González wrote:
> Agree on the BUG_ON part.  But since you’re looking into this part now, would it make sense to do the check in the block layer only if the driver imposes a power of two? We can also do it down the road, but seems like double work.

The whole block layer chunk / zone handling has always assumed power
of two zone sizes.  Changing that would introduce expensive divisions
in the fast path.  This patch just moves the check to where it belongs.
Javier González Dec. 3, 2019, 5:17 p.m. UTC | #5
> On 3 Dec 2019, at 16.42, Christoph Hellwig <hch@lst.de> wrote:
> 
> On Tue, Dec 03, 2019 at 04:34:08PM +0100, Javier González wrote:
>> Agree on the BUG_ON part.  But since you’re looking into this part now, would it make sense to do the check in the block layer only if the driver imposes a power of two? We can also do it down the road, but seems like double work.
> 
> The whole block layer chunk / zone handling has always assumed power
> of two zone sizes.  Changing that would introduce expensive divisions
> in the fast path.  This patch just moves the check to where it belongs.

Ok. Let’s do the refactor now. We will eventually need to support zoned devices whose zone size is not a power of two, but we can add that path when the time comes.

You can add my reviewed-by 

Thanks Christoph!
Javier
diff mbox series

Patch

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 51d427659ce7..d00fcfd71dfe 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -343,6 +343,7 @@  struct blk_revalidate_zone_args {
 	unsigned long	*conv_zones_bitmap;
 	unsigned long	*seq_zones_wlock;
 	unsigned int	nr_zones;
+	sector_t	zone_sectors;
 	sector_t	sector;
 };
 
@@ -355,25 +356,33 @@  static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 	struct blk_revalidate_zone_args *args = data;
 	struct gendisk *disk = args->disk;
 	struct request_queue *q = disk->queue;
-	sector_t zone_sectors = blk_queue_zone_sectors(q);
 	sector_t capacity = get_capacity(disk);
 
 	/*
 	 * All zones must have the same size, with the exception on an eventual
 	 * smaller last zone.
 	 */
-	if (zone->start + zone_sectors < capacity &&
-	    zone->len != zone_sectors) {
-		pr_warn("%s: Invalid zoned device with non constant zone size\n",
-			disk->disk_name);
-		return false;
-	}
+	if (zone->start == 0) {
+		if (zone->len == 0 || !is_power_of_2(zone->len)) {
+			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
+				disk->disk_name, zone->len);
+			return -ENODEV;
+		}
 
-	if (zone->start + zone->len >= capacity &&
-	    zone->len > zone_sectors) {
-		pr_warn("%s: Invalid zoned device with larger last zone size\n",
-			disk->disk_name);
-		return -ENODEV;
+		args->zone_sectors = zone->len;
+		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
+	} else if (zone->start + args->zone_sectors < capacity) {
+		if (zone->len != args->zone_sectors) {
+			pr_warn("%s: Invalid zoned device with non constant zone size\n",
+				disk->disk_name);
+			return -ENODEV;
+		}
+	} else {
+		if (zone->len > args->zone_sectors) {
+			pr_warn("%s: Invalid zoned device with larger last zone size\n",
+				disk->disk_name);
+			return -ENODEV;
+		}
 	}
 
 	/* Check for holes in the zone report */
@@ -428,9 +437,9 @@  int blk_revalidate_disk_zones(struct gendisk *disk)
 	struct request_queue *q = disk->queue;
 	struct blk_revalidate_zone_args args = {
 		.disk		= disk,
-		.nr_zones	= blkdev_nr_zones(disk),
 	};
-	int ret = 0;
+	unsigned int noio_flag;
+	int ret;
 
 	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
 		return -EIO;
@@ -438,24 +447,22 @@  int blk_revalidate_disk_zones(struct gendisk *disk)
 		return -EIO;
 
 	/*
-	 * Ensure that all memory allocations in this context are done as
-	 * if GFP_NOIO was specified.
+	 * Ensure that all memory allocations in this context are done as if
+	 * GFP_NOIO was specified.
 	 */
-	if (args.nr_zones) {
-		unsigned int noio_flag = memalloc_noio_save();
-
-		ret = disk->fops->report_zones(disk, 0, args.nr_zones,
-					       blk_revalidate_zone_cb, &args);
-		memalloc_noio_restore(noio_flag);
-	}
+	noio_flag = memalloc_noio_save();
+	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
+				       blk_revalidate_zone_cb, &args);
+	memalloc_noio_restore(noio_flag);
 
 	/*
-	 * Install the new bitmaps, making sure the queue is stopped and
-	 * all I/Os are completed (i.e. a scheduler is not referencing the
-	 * bitmaps).
+	 * Install the new bitmaps and update nr_zones only once the queue is
+	 * stopped and all I/Os are completed (i.e. a scheduler is not
+	 * referencing the bitmaps).
 	 */
 	blk_mq_freeze_queue(q);
 	if (ret >= 0) {
+		blk_queue_chunk_sectors(q, args.zone_sectors);
 		q->nr_zones = args.nr_zones;
 		swap(q->seq_zones_wlock, args.seq_zones_wlock);
 		swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 068cd0ae6e2c..997b7dc095b9 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -1583,6 +1583,8 @@  static int null_gendisk_register(struct nullb *nullb)
 			if (ret)
 				return ret;
 		} else {
+			blk_queue_chunk_sectors(nullb->q,
+					nullb->dev->zone_size_sects);
 			nullb->q->nr_zones = blkdev_nr_zones(disk);
 		}
 	}
@@ -1746,7 +1748,6 @@  static int null_add_dev(struct nullb_device *dev)
 		if (rv)
 			goto out_cleanup_blk_queue;
 
-		blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
 		nullb->q->limits.zoned = BLK_ZONED_HM;
 		blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q);
 		blk_queue_required_elevator_features(nullb->q,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 0e5ede48f045..27d72c1d4654 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -412,8 +412,6 @@  int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
 		goto err;
 
 	/* The drive satisfies the kernel restrictions: set it up */
-	blk_queue_chunk_sectors(sdkp->disk->queue,
-			logical_to_sectors(sdkp->device, zone_blocks));
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue);
 	blk_queue_required_elevator_features(sdkp->disk->queue,
 					     ELEVATOR_F_ZBD_SEQ_WRITE);