
[6/6] null_blk: Add support for power_of_2 emulation to the null blk device

Message ID 20220308165349.231320-7-p.raghav@samsung.com (mailing list archive)
State New, archived
Series power_of_2 emulation support for NVMe ZNS devices

Commit Message

Pankaj Raghav March 8, 2022, 4:53 p.m. UTC
power_of_2 (PO2) emulation support is added to the null_blk device to
measure performance.

Callbacks are added in the hotpaths that require PO2 emulation-specific
behaviour to reduce the impact on the existing path.

The power_of_2 emulation support is wired up for both the request and
the bio queue modes, and it is automatically enabled when the given zone
size is not a power of 2.
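
The emulation boils down to an offset translation between the power_of_2
zone layout exposed to the block layer and the backing non-power_of_2
layout. Below is a minimal, standalone sketch of that arithmetic
(illustrative only; the 96M/128M zone sizes and 512-byte sectors are
assumptions, not part of this patch):

/* Illustrative PO2 <-> non-PO2 sector translation, not null_blk code. */
#include <stdint.h>
#include <stdio.h>

static const uint64_t zsze      = 96 * 2048;         /* real zone size     */
static const uint64_t zsze_po2  = 128 * 2048;        /* emulated zone size */
static const uint64_t zsze_diff = (128 - 96) * 2048; /* gap per zone       */

/* Emulated (PO2) sector -> backing (non-PO2) sector. */
static uint64_t to_real(uint64_t sect)
{
	return sect - (sect / zsze_po2) * zsze_diff;
}

/* Backing sector -> emulated sector (used to fix up zone append). */
static uint64_t to_emulated(uint64_t sect)
{
	return sect + (sect / zsze) * zsze_diff;
}

int main(void)
{
	uint64_t s = 2 * zsze_po2 + 1000; /* a sector in the third zone */

	printf("emulated %llu -> real %llu -> emulated %llu\n",
	       (unsigned long long)s,
	       (unsigned long long)to_real(s),
	       (unsigned long long)to_emulated(to_real(s)));
	return 0;
}

In the driver this translation sits behind the zone_update_sector and
zone_update_sector_append callbacks, so the power_of_2 case keeps the
existing no-op/shift-based path.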

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
 drivers/block/null_blk/main.c     |   8 +-
 drivers/block/null_blk/null_blk.h |  12 ++
 drivers/block/null_blk/zoned.c    | 203 ++++++++++++++++++++++++++----
 3 files changed, 196 insertions(+), 27 deletions(-)

Comments

Damien Le Moal March 9, 2022, 4:09 a.m. UTC | #1
On 3/9/22 01:53, Pankaj Raghav wrote:
> power_of_2 (PO2) emulation support is added to the null_blk device to
> measure performance.
> 
> Callbacks are added in the hotpaths that require PO2 emulation-specific
> behaviour to reduce the impact on the existing path.
> 
> The power_of_2 emulation support is wired up for both the request and
> the bio queue modes, and it is automatically enabled when the given zone
> size is not a power of 2.

This does not make any sense. Why would you want to add power-of-2 zone
size emulation to null_blk? Just set the zone size to be a power of 2...

If this is for test purposes, then use QEMU. These changes make no sense
to me here.

A change that would make sense in the context of this series is to allow
for setting a zoned null_blk device's zone size to a non-power-of-2 value.
But this series does not actually deal with that. So do not touch this
driver please.

Pankaj Raghav March 9, 2022, 2:42 p.m. UTC | #2
On 2022-03-09 05:09, Damien Le Moal wrote:
> On 3/9/22 01:53, Pankaj Raghav wrote:
>> power_of_2 (PO2) emulation support is added to the null_blk device to
>> measure performance.
>>
>> Callbacks are added in the hotpaths that require PO2 emulation-specific
>> behaviour to reduce the impact on the existing path.
>>
>> The power_of_2 emulation support is wired up for both the request and
>> the bio queue modes, and it is automatically enabled when the given zone
>> size is not a power of 2.
> 
> This does not make any sense. Why would you want to add power-of-2 zone
> size emulation to null_blk? Just set the zone size to be a power of 2...
> 
> If this is for test purposes, then use QEMU. These changes make no sense
> to me here.
> 
I see your point, but this was mainly added to measure the performance impact.

I ran the conformance tests with different configurations in QEMU, but I don't
think QEMU is the preferred option for measuring performance, especially if we
want to account for the indirection these changes add to the hot path.

As ZNS drives are not available in retail stores, this patch also provides a way
for the community to reproduce the performance analysis that we did without
needing a real device.

> A change that would make sense in the context of this series is to allow
> for setting a zoned null_blk device's zone size to a non-power-of-2 value.
This is not possible while the block layer expects zone sizes to be a power of 2.
A null_blk device with a non-power-of-2 zone size will only work with the
emulation that is added as part of this patch.

As I said before, once we relax the block layer requirement, we could allow
non-power-of-2 zone sizes without many changes to the null_blk device.
> But this series does not actually deal with that. So do not touch this
> driver please.
> 
If you really think it doesn't belong here, then I can take it out in the next
revision.
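
For reference, the emulated layout can also be inspected from userspace with
something like the minimal sketch below (or simply with blkzone(8)). The
/dev/nullb0 path and the number of zones requested are assumptions; with the
emulation active, each reported zone has a power-of-2 len while capacity stays
at the real zone capacity:

/* Minimal sketch: report the zone layout of a zoned null_blk instance. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

#define NR_ZONES 16 /* zones to request in a single report */

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/dev/nullb0";
	struct blk_zone_report *rep;
	unsigned int i;
	int fd, ret = 1;

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	rep = calloc(1, sizeof(*rep) + NR_ZONES * sizeof(struct blk_zone));
	if (!rep)
		goto out_close;

	rep->sector = 0;
	rep->nr_zones = NR_ZONES;

	if (ioctl(fd, BLKREPORTZONE, rep) < 0) {
		perror("BLKREPORTZONE");
		goto out_free;
	}

	for (i = 0; i < rep->nr_zones; i++)
		printf("zone %2u: start %llu len %llu cap %llu wp %llu\n", i,
		       (unsigned long long)rep->zones[i].start,
		       (unsigned long long)rep->zones[i].len,
		       (unsigned long long)rep->zones[i].capacity,
		       (unsigned long long)rep->zones[i].wp);
	ret = 0;

out_free:
	free(rep);
out_close:
	close(fd);
	return ret;
}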

Patch

diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 625a06bfa5ad..c926b59f2b17 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -210,7 +210,7 @@  MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Defau
 
 static unsigned long g_zone_size = 256;
 module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
-MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Default: 256");
 
 static unsigned long g_zone_capacity;
 module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
@@ -1772,11 +1772,13 @@  static const struct block_device_operations null_bio_ops = {
 	.owner		= THIS_MODULE,
 	.submit_bio	= null_submit_bio,
 	.report_zones	= null_report_zones,
+	.npo2_zone_setup = null_po2_zone_emu_setup,
 };
 
 static const struct block_device_operations null_rq_ops = {
 	.owner		= THIS_MODULE,
 	.report_zones	= null_report_zones,
+	.npo2_zone_setup = null_po2_zone_emu_setup,
 };
 
 static int setup_commands(struct nullb_queue *nq)
@@ -1929,8 +1931,8 @@  static int null_validate_conf(struct nullb_device *dev)
 		dev->mbps = 0;
 
 	if (dev->zoned &&
-	    (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
-		pr_err("zone_size must be power-of-two\n");
+	    (!dev->zone_size)) {
+		pr_err("zone_size must not be zero\n");
 		return -EINVAL;
 	}
 
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 78eb56b0ca55..34c1b7b2546b 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -74,6 +74,16 @@  struct nullb_device {
 	unsigned int imp_close_zone_no;
 	struct nullb_zone *zones;
 	sector_t zone_size_sects;
+	sector_t zone_size_po2_sects;
+	sector_t zone_size_diff_sects;
+	/* The callbacks below are used as hooks to perform po2 emulation for a
+	 * zoned device.
+	 */
+	unsigned int (*zone_no)(struct nullb_device *dev,
+				sector_t sect);
+	sector_t (*zone_update_sector)(struct nullb_device *dev, sector_t sect);
+	sector_t (*zone_update_sector_append)(struct nullb_device *dev,
+					      sector_t sect);
 	bool need_zone_res_mgmt;
 	spinlock_t zone_res_lock;
 
@@ -137,6 +147,7 @@  int null_register_zoned_dev(struct nullb *nullb);
 void null_free_zoned_dev(struct nullb_device *dev);
 int null_report_zones(struct gendisk *disk, sector_t sector,
 		      unsigned int nr_zones, report_zones_cb cb, void *data);
+void null_po2_zone_emu_setup(struct gendisk *disk);
 blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
 				    enum req_opf op, sector_t sector,
 				    sector_t nr_sectors);
@@ -166,5 +177,6 @@  static inline size_t null_zone_valid_read_len(struct nullb *nullb,
 	return len;
 }
 #define null_report_zones	NULL
+#define null_po2_zone_emu_setup	NULL
 #endif /* CONFIG_BLK_DEV_ZONED */
 #endif /* __NULL_BLK_H */
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index dae54dd1aeac..3bb63c170149 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -16,6 +16,44 @@  static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
 	return sect >> ilog2(dev->zone_size_sects);
 }
 
+static inline unsigned int null_npo2_zone_no(struct nullb_device *dev,
+					     sector_t sect)
+{
+	return sect / dev->zone_size_sects;
+}
+
+static inline bool null_is_po2_zone_emu(struct nullb_device *dev)
+{
+	return !!dev->zone_size_diff_sects;
+}
+
+static inline sector_t null_zone_update_sector_noop(struct nullb_device *dev,
+						    sector_t sect)
+{
+	return sect;
+}
+
+static inline sector_t null_zone_update_sector_po2_emu(struct nullb_device *dev,
+						       sector_t sect)
+{
+	sector_t zsze_po2 = dev->zone_size_po2_sects;
+	sector_t zsze_diff = dev->zone_size_diff_sects;
+	u32 zone_idx = sect >> ilog2(zsze_po2);
+
+	sect = sect - (zone_idx * zsze_diff);
+	return sect;
+}
+
+static inline sector_t
+null_zone_update_sector_append_po2_emu(struct nullb_device *dev, sector_t sect)
+{
+	/* Need to readjust the sector if po2 emulation is used. */
+	u32 zone_no = dev->zone_no(dev, sect);
+
+	sect += dev->zone_size_diff_sects * zone_no;
+	return sect;
+}
+
 static inline void null_lock_zone_res(struct nullb_device *dev)
 {
 	if (dev->need_zone_res_mgmt)
@@ -62,15 +100,14 @@  int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
 	sector_t sector = 0;
 	unsigned int i;
 
-	if (!is_power_of_2(dev->zone_size)) {
-		pr_err("zone_size must be power-of-two\n");
-		return -EINVAL;
-	}
 	if (dev->zone_size > dev->size) {
 		pr_err("Zone size larger than device capacity\n");
 		return -EINVAL;
 	}
 
+	if (!is_power_of_2(dev->zone_size))
+		pr_info("zone_size is not power-of-two. power-of-two emulation is enabled\n");
+
 	if (!dev->zone_capacity)
 		dev->zone_capacity = dev->zone_size;
 
@@ -83,8 +120,14 @@  int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
 	zone_capacity_sects = mb_to_sects(dev->zone_capacity);
 	dev_capacity_sects = mb_to_sects(dev->size);
 	dev->zone_size_sects = mb_to_sects(dev->zone_size);
-	dev->nr_zones = round_up(dev_capacity_sects, dev->zone_size_sects)
-		>> ilog2(dev->zone_size_sects);
+
+	dev->nr_zones = roundup(dev_capacity_sects, dev->zone_size_sects) /
+			dev->zone_size_sects;
+
+	dev->zone_no = null_zone_no;
+	dev->zone_update_sector = null_zone_update_sector_noop;
+	dev->zone_update_sector_append = null_zone_update_sector_noop;
+
 
 	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone),
 				    GFP_KERNEL | __GFP_ZERO);
@@ -166,7 +209,13 @@  int null_register_zoned_dev(struct nullb *nullb)
 		if (ret)
 			return ret;
 	} else {
-		blk_queue_chunk_sectors(q, dev->zone_size_sects);
+		nullb->disk->fops->npo2_zone_setup(nullb->disk);
+
+		if (null_is_po2_zone_emu(dev))
+			blk_queue_chunk_sectors(q, dev->zone_size_po2_sects);
+		else
+			blk_queue_chunk_sectors(q, dev->zone_size_sects);
+
 		q->nr_zones = blkdev_nr_zones(nullb->disk);
 	}
 
@@ -183,17 +232,49 @@  void null_free_zoned_dev(struct nullb_device *dev)
 	dev->zones = NULL;
 }
 
+static void null_update_zone_info(struct nullb *nullb, struct blk_zone *blkz,
+				  struct nullb_zone *zone)
+{
+	unsigned int zone_idx;
+	u64 zone_gap;
+	struct nullb_device *dev = nullb->dev;
+
+	if (null_is_po2_zone_emu(dev)) {
+		zone_idx = zone->start / zone->len;
+		zone_gap = zone_idx * dev->zone_size_diff_sects;
+
+		blkz->start = zone->start + zone_gap;
+		blkz->len = dev->zone_size_po2_sects;
+		blkz->wp = zone->wp + zone_gap;
+	} else {
+		blkz->start = zone->start;
+		blkz->len = zone->len;
+		blkz->wp = zone->wp;
+	}
+
+	blkz->type = zone->type;
+	blkz->cond = zone->cond;
+	blkz->capacity = zone->capacity;
+}
+
 int null_report_zones(struct gendisk *disk, sector_t sector,
 		unsigned int nr_zones, report_zones_cb cb, void *data)
 {
 	struct nullb *nullb = disk->private_data;
 	struct nullb_device *dev = nullb->dev;
 	unsigned int first_zone, i;
+	sector_t zone_size;
 	struct nullb_zone *zone;
 	struct blk_zone blkz;
 	int error;
 
-	first_zone = null_zone_no(dev, sector);
+	if (null_is_po2_zone_emu(dev))
+		zone_size = dev->zone_size_po2_sects;
+	else
+		zone_size = dev->zone_size_sects;
+
+	first_zone = sector / zone_size;
+
 	if (first_zone >= dev->nr_zones)
 		return 0;
 
@@ -210,12 +291,7 @@  int null_report_zones(struct gendisk *disk, sector_t sector,
 		 * array.
 		 */
 		null_lock_zone(dev, zone);
-		blkz.start = zone->start;
-		blkz.len = zone->len;
-		blkz.wp = zone->wp;
-		blkz.type = zone->type;
-		blkz.cond = zone->cond;
-		blkz.capacity = zone->capacity;
+		null_update_zone_info(nullb, &blkz, zone);
 		null_unlock_zone(dev, zone);
 
 		error = cb(&blkz, i, data);
@@ -226,6 +302,35 @@  int null_report_zones(struct gendisk *disk, sector_t sector,
 	return nr_zones;
 }
 
+void null_po2_zone_emu_setup(struct gendisk *disk)
+{
+	struct nullb *nullb = disk->private_data;
+	struct nullb_device *dev = nullb->dev;
+	sector_t capacity;
+
+	if (is_power_of_2(dev->zone_size_sects))
+		return;
+
+	if (!get_capacity(disk))
+		return;
+
+	blk_mq_freeze_queue(disk->queue);
+
+	blk_queue_po2_zone_emu(disk->queue, 1);
+	dev->zone_size_po2_sects =
+		1 << get_count_order_long(dev->zone_size_sects);
+	dev->zone_size_diff_sects =
+		dev->zone_size_po2_sects - dev->zone_size_sects;
+	dev->zone_no = null_npo2_zone_no;
+	dev->zone_update_sector = null_zone_update_sector_po2_emu;
+	dev->zone_update_sector_append = null_zone_update_sector_append_po2_emu;
+
+	capacity = dev->nr_zones * dev->zone_size_po2_sects;
+	set_capacity(disk, capacity);
+
+	blk_mq_unfreeze_queue(disk->queue);
+}
+
 /*
  * This is called in the case of memory backing from null_process_cmd()
  * with the target zone already locked.
@@ -234,7 +339,7 @@  size_t null_zone_valid_read_len(struct nullb *nullb,
 				sector_t sector, unsigned int len)
 {
 	struct nullb_device *dev = nullb->dev;
-	struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
+	struct nullb_zone *zone = &dev->zones[dev->zone_no(dev, sector)];
 	unsigned int nr_sectors = len >> SECTOR_SHIFT;
 
 	/* Read must be below the write pointer position */
@@ -363,11 +468,24 @@  static blk_status_t null_check_zone_resources(struct nullb_device *dev,
 	}
 }
 
+static void null_update_sector_append(struct nullb_cmd *cmd, sector_t sector)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+
+	sector = dev->zone_update_sector_append(dev, sector);
+
+	if (cmd->bio) {
+		cmd->bio->bi_iter.bi_sector = sector;
+	} else {
+		cmd->rq->__sector = sector;
+	}
+}
+
 static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
 				    unsigned int nr_sectors, bool append)
 {
 	struct nullb_device *dev = cmd->nq->dev;
-	unsigned int zno = null_zone_no(dev, sector);
+	unsigned int zno = dev->zone_no(dev, sector);
 	struct nullb_zone *zone = &dev->zones[zno];
 	blk_status_t ret;
 
@@ -395,10 +513,7 @@  static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
 	 */
 	if (append) {
 		sector = zone->wp;
-		if (cmd->bio)
-			cmd->bio->bi_iter.bi_sector = sector;
-		else
-			cmd->rq->__sector = sector;
+		null_update_sector_append(cmd, sector);
 	} else if (sector != zone->wp) {
 		ret = BLK_STS_IOERR;
 		goto unlock;
@@ -619,7 +734,7 @@  static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
 		return BLK_STS_OK;
 	}
 
-	zone_no = null_zone_no(dev, sector);
+	zone_no = dev->zone_no(dev, sector);
 	zone = &dev->zones[zone_no];
 
 	null_lock_zone(dev, zone);
@@ -650,13 +765,54 @@  static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
 	return ret;
 }
 
+static blk_status_t null_handle_po2_zone_emu_violation(struct nullb_cmd *cmd,
+						       enum req_opf op)
+{
+	if (op == REQ_OP_READ) {
+		if (cmd->bio)
+			zero_fill_bio(cmd->bio);
+		else
+			zero_fill_bio(cmd->rq->bio);
+
+		return BLK_STS_OK;
+	} else {
+		return BLK_STS_IOERR;
+	}
+}
+
+static bool null_verify_sector_violation(struct nullb_device *dev,
+					 sector_t sector)
+{
+	if (unlikely(null_is_po2_zone_emu(dev))) {
+		/* The zone idx should be calculated based on the emulated
+		 * layout
+		 */
+		u32 zone_idx = sector >> ilog2(dev->zone_size_po2_sects);
+		sector_t zsze = dev->zone_size_sects;
+		sector_t sect = null_zone_update_sector_po2_emu(dev, sector);
+
+		if (sect - (zone_idx * zsze) > zsze)
+			return true;
+	}
+	return false;
+}
+
 blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
 				    sector_t sector, sector_t nr_sectors)
 {
-	struct nullb_device *dev;
+	struct nullb_device *dev = cmd->nq->dev;
 	struct nullb_zone *zone;
 	blk_status_t sts;
 
+	/* Handle when the sector falls in the emulated area */
+	if (unlikely(null_verify_sector_violation(dev, sector)))
+		return null_handle_po2_zone_emu_violation(cmd, op);
+
+	/* The sector value is updated if po2 emulation is enabled, else it
+	 * will have no effect on the value
+	 */
+	sector = dev->zone_update_sector(dev, sector);
+
 	switch (op) {
 	case REQ_OP_WRITE:
 		return null_zone_write(cmd, sector, nr_sectors, false);
@@ -669,8 +825,7 @@  blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
 	case REQ_OP_ZONE_FINISH:
 		return null_zone_mgmt(cmd, op, sector);
 	default:
-		dev = cmd->nq->dev;
-		zone = &dev->zones[null_zone_no(dev, sector)];
+		zone = &dev->zones[dev->zone_no(dev, sector)];
 
 		null_lock_zone(dev, zone);
 		sts = null_process_cmd(cmd, op, sector, nr_sectors);