
[4/6] nvme: zns: Add support for power_of_2 emulation to NVMe ZNS devices

Message ID 20220308165349.231320-5-p.raghav@samsung.com (mailing list archive)
State New, archived
Series power_of_2 emulation support for NVMe ZNS devices

Commit Message

Pankaj Raghav March 8, 2022, 4:53 p.m. UTC
A power_of_2 (PO2) zone size is not a requirement of the NVMe ZNS
specification; however, we still have a lot of PO2 assumptions in place
in the block layer, in filesystems such as F2FS and btrfs, and in
userspace applications.

So, in keeping with these assumptions, provide an emulation layer for
non-power_of_2 zone devices which does not create a performance
regression for existing zoned storage devices that have PO2 zone sizes.
Callbacks are provided where needed in the hot paths to reduce the
impact of performance regression.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
 drivers/nvme/host/core.c |  28 +++++++----
 drivers/nvme/host/nvme.h | 100 ++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/pci.c  |   4 ++
 drivers/nvme/host/zns.c  |  79 +++++++++++++++++++++++++++++--
 include/linux/blk-mq.h   |   2 +
 5 files changed, 200 insertions(+), 13 deletions(-)

Comments

Damien Le Moal March 9, 2022, 4:04 a.m. UTC | #1
On 3/9/22 01:53, Pankaj Raghav wrote:
> A power_of_2 (PO2) zone size is not a requirement of the NVMe ZNS
> specification; however, we still have a lot of PO2 assumptions in place
> in the block layer, in filesystems such as F2FS and btrfs, and in
> userspace applications.
> 
> So, in keeping with these assumptions, provide an emulation layer for
> non-power_of_2 zone devices which does not create a performance
> regression for existing zoned storage devices that have PO2 zone sizes.
> Callbacks are provided where needed in the hot paths to reduce the
> impact of performance regression.

Contradiction: reducing the impact of performance regression is not the
same as "does not create a performance regression". So which one is it ?
Please add performance numbers to this commit message.

And also please actually explain what the patch is changing. This commit
message is about the why, but nothing on the how.

> 
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
>  drivers/nvme/host/core.c |  28 +++++++----
>  drivers/nvme/host/nvme.h | 100 ++++++++++++++++++++++++++++++++++++++-
>  drivers/nvme/host/pci.c  |   4 ++
>  drivers/nvme/host/zns.c  |  79 +++++++++++++++++++++++++++++--
>  include/linux/blk-mq.h   |   2 +
>  5 files changed, 200 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fd4720d37cc0..c7180d512b08 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -327,14 +327,6 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>  	return RETRY;
>  }
>  
> -static inline void nvme_end_req_zoned(struct request *req)
> -{
> -	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> -	    req_op(req) == REQ_OP_ZONE_APPEND)
> -		req->__sector = nvme_lba_to_sect(req->q->queuedata,
> -			le64_to_cpu(nvme_req(req)->result.u64));
> -}
> -
>  static inline void nvme_end_req(struct request *req)
>  {
>  	blk_status_t status = nvme_error_status(nvme_req(req)->status);
> @@ -676,6 +668,12 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
>  }
>  EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
>  
> +blk_status_t nvme_fail_po2_zone_emu_violation(struct request *req)
> +{
> +	return nvme_zone_handle_po2_emu_violation(req);
> +}
> +EXPORT_SYMBOL_GPL(nvme_fail_po2_zone_emu_violation);
> +

This should go in zns.c, not in the core.

>  bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
>  		bool queue_live)
>  {
> @@ -879,6 +877,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
>  	req->special_vec.bv_offset = offset_in_page(range);
>  	req->special_vec.bv_len = alloc_size;
>  	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
> +	nvme_verify_sector_value(ns, req);
>  
>  	return BLK_STS_OK;
>  }
> @@ -909,6 +908,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
>  			break;
>  		}
>  	}
> +	nvme_verify_sector_value(ns, req);
>  
>  	return BLK_STS_OK;
>  }
> @@ -973,6 +973,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>  
>  	cmnd->rw.control = cpu_to_le16(control);
>  	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
> +	nvme_verify_sector_value(ns, req);
>  	return 0;
>  }
>  
> @@ -2105,8 +2106,14 @@ static int nvme_report_zones(struct gendisk *disk, sector_t sector,
>  	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
>  			data);
>  }
> +
> +static void nvme_npo2_zone_setup(struct gendisk *disk)
> +{
> +	nvme_ns_po2_zone_emu_setup(disk->private_data);
> +}

This helper seems useless.

>  #else
>  #define nvme_report_zones	NULL
> +#define nvme_npo2_zone_setup	NULL
>  #endif /* CONFIG_BLK_DEV_ZONED */
>  
>  static const struct block_device_operations nvme_bdev_ops = {
> @@ -2116,6 +2123,7 @@ static const struct block_device_operations nvme_bdev_ops = {
>  	.release	= nvme_release,
>  	.getgeo		= nvme_getgeo,
>  	.report_zones	= nvme_report_zones,
> +	.npo2_zone_setup = nvme_npo2_zone_setup,
>  	.pr_ops		= &nvme_pr_ops,
>  };
>  
> @@ -3844,6 +3852,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
>  	ns->disk = disk;
>  	ns->queue = disk->queue;
>  
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	ns->sect_to_lba = __nvme_sect_to_lba;
> +	ns->update_sector_append = nvme_update_sector_append_noop;
> +#endif
>  	if (ctrl->opts && ctrl->opts->data_digest)
>  		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
>  
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index a162f6c6da6e..f584f760e8cc 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -457,6 +457,10 @@ struct nvme_ns {
>  	u8 pi_type;
>  #ifdef CONFIG_BLK_DEV_ZONED
>  	u64 zsze;
> +	u64 zsze_po2;
> +	u32 zsze_diff;
> +	u64 (*sect_to_lba)(struct nvme_ns *ns, sector_t sector);
> +	sector_t (*update_sector_append)(struct nvme_ns *ns, sector_t sector);
>  #endif
>  	unsigned long features;
>  	unsigned long flags;
> @@ -562,12 +566,21 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
>  	return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
>  }
>  
> +static inline u64 __nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
> +{
> +	return sector >> (ns->lba_shift - SECTOR_SHIFT);
> +}
> +
>  /*
>   * Convert a 512B sector number to a device logical block number.
>   */
>  static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
>  {
> -	return sector >> (ns->lba_shift - SECTOR_SHIFT);
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	return ns->sect_to_lba(ns, sector);

So for a power of 2 zone sized device, you are forcing an indirect call,
always. Not acceptable. What is the point of that po2_zone_emu boolean
you added to the queue ?

> +#else
> +	return __nvme_sect_to_lba(ns, sector);

This helper is useless.

> +#endif
>  }
>  
>  /*
> @@ -578,6 +591,83 @@ static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
>  	return lba << (ns->lba_shift - SECTOR_SHIFT);
>  }
>  
> +#ifdef CONFIG_BLK_DEV_ZONED
> +static inline u64 __nvme_sect_to_lba_po2(struct nvme_ns *ns, sector_t sector)
> +{
> +	sector_t zone_idx = sector >> ilog2(ns->zsze_po2);
> +
> +	sector = sector - (zone_idx * ns->zsze_diff);
> +
> +	return sector >> (ns->lba_shift - SECTOR_SHIFT);
> +}
> +
> +static inline sector_t
> +nvme_update_sector_append_po2_zone_emu(struct nvme_ns *ns, sector_t sector)
> +{
> +	/* The sector value passed by the drive after a append operation is the
> +	 * based on the actual zone layout in the device, hence, use the actual
> +	 * zone_size to calculate the zone number from the sector.
> +	 */
> +	u32 zone_no = sector / ns->zsze;
> +
> +	sector += ns->zsze_diff * zone_no;
> +	return sector;
> +}
> +
> +static inline sector_t nvme_update_sector_append_noop(struct nvme_ns *ns,
> +						      sector_t sector)
> +{
> +	return sector;
> +}
> +
> +static inline void nvme_end_req_zoned(struct request *req)
> +{
> +	if (req_op(req) == REQ_OP_ZONE_APPEND) {
> +		struct nvme_ns *ns = req->q->queuedata;
> +		sector_t sector;
> +
> +		sector = nvme_lba_to_sect(ns,
> +					  le64_to_cpu(nvme_req(req)->result.u64));
> +
> +		sector = ns->update_sector_append(ns, sector);

Why not assign that to req->__sector directly ?
And again here, you are forcing the indirect function call for *all* zns
devices, even those that have a power of 2 zone size.
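
I.e. something along these lines (sketch only):

struct nvme_ns *ns = req->q->queuedata;

if (req_op(req) == REQ_OP_ZONE_APPEND)
	req->__sector = ns->update_sector_append(ns,
			nvme_lba_to_sect(ns,
			le64_to_cpu(nvme_req(req)->result.u64)));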

> +
> +		req->__sector = sector;
> +	}
> +}
> +
> +static inline void nvme_verify_sector_value(struct nvme_ns *ns, struct request *req)
> +{
> +	if (unlikely(blk_queue_is_po2_zone_emu(ns->queue))) {
> +		sector_t sector = blk_rq_pos(req);
> +		sector_t zone_idx = sector >> ilog2(ns->zsze_po2);
> +		sector_t device_sector = sector - (zone_idx * ns->zsze_diff);
> +
> +		/* Check if the IO is in the emulated area */
> +		if (device_sector - (zone_idx * ns->zsze) > ns->zsze)
> +			req->rq_flags |= RQF_ZONE_EMU_VIOLATION;
> +	}
> +}
> +
> +static inline bool nvme_po2_zone_emu_violation(struct request *req)
> +{
> +	return req->rq_flags & RQF_ZONE_EMU_VIOLATION;
> +}

This helper makes the code unreadable in my opinion.

> +#else
> +static inline void nvme_end_req_zoned(struct request *req)
> +{
> +}
> +
> +static inline void nvme_verify_sector_value(struct nvme_ns *ns, struct request *req)
> +{
> +}
> +
> +static inline bool nvme_po2_zone_emu_violation(struct request *req)
> +{
> +	return false;
> +}
> +
> +#endif
> +
>  /*
>   * Convert byte length to nvme's 0-based num dwords
>   */
> @@ -752,6 +842,7 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
>  long nvme_dev_ioctl(struct file *file, unsigned int cmd,
>  		unsigned long arg);
>  int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
> +blk_status_t nvme_fail_po2_zone_emu_violation(struct request *req);
>  
>  extern const struct attribute_group *nvme_ns_id_attr_groups[];
>  extern const struct pr_ops nvme_pr_ops;
> @@ -873,11 +964,13 @@ static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
>  int nvme_revalidate_zones(struct nvme_ns *ns);
>  int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>  		unsigned int nr_zones, report_zones_cb cb, void *data);
> +void nvme_ns_po2_zone_emu_setup(struct nvme_ns *ns);
>  #ifdef CONFIG_BLK_DEV_ZONED
>  int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
>  blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>  				       struct nvme_command *cmnd,
>  				       enum nvme_zone_mgmt_action action);
> +blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req);
>  #else
>  static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
>  		struct request *req, struct nvme_command *cmnd,
> @@ -892,6 +985,11 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
>  		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
>  	return -EPROTONOSUPPORT;
>  }
> +
> +static inline blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req)
> +{
> +	return BLK_STS_OK;
> +}
>  #endif
>  
>  static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6a99ed680915..fc022df3f98e 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -960,6 +960,10 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		return nvme_fail_nonready_command(&dev->ctrl, req);
>  
>  	ret = nvme_prep_rq(dev, req);
> +
> +	if (unlikely(nvme_po2_zone_emu_violation(req)))
> +		return nvme_fail_po2_zone_emu_violation(req);
> +
>  	if (unlikely(ret))
>  		return ret;
>  	spin_lock(&nvmeq->sq_lock);
> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
> index ad02c61c0b52..25516a5ae7e2 100644
> --- a/drivers/nvme/host/zns.c
> +++ b/drivers/nvme/host/zns.c
> @@ -3,7 +3,9 @@
>   * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>   */
>  
> +#include <linux/log2.h>
>  #include <linux/blkdev.h>
> +#include <linux/math.h>
>  #include <linux/vmalloc.h>
>  #include "nvme.h"
>  
> @@ -46,6 +48,18 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>  	return 0;
>  }
>  
> +static sector_t nvme_zone_size(struct nvme_ns *ns)
> +{
> +	sector_t zone_size;
> +
> +	if (blk_queue_is_po2_zone_emu(ns->queue))
> +		zone_size = ns->zsze_po2;
> +	else
> +		zone_size = ns->zsze;
> +
> +	return zone_size;
> +}
> +
>  int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
>  {
>  	struct nvme_effects_log *log = ns->head->effects;
> @@ -122,7 +136,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>  				   sizeof(struct nvme_zone_descriptor);
>  
>  	nr_zones = min_t(unsigned int, nr_zones,
> -			 get_capacity(ns->disk) / ns->zsze);
> +			 get_capacity(ns->disk) / nvme_zone_size(ns));
>  
>  	bufsize = sizeof(struct nvme_zone_report) +
>  		nr_zones * sizeof(struct nvme_zone_descriptor);
> @@ -147,6 +161,8 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
>  				 void *data)
>  {
>  	struct blk_zone zone = { };
> +	u64 zone_gap = 0;
> +	u32 zone_idx;
>  
>  	if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>  		dev_err(ns->ctrl->device, "invalid zone type %#x\n",
> @@ -159,10 +175,19 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
>  	zone.len = ns->zsze;
>  	zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>  	zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
> +
> +	if (blk_queue_is_po2_zone_emu(ns->queue)) {
> +		zone_idx = zone.start / zone.len;
> +		zone_gap = zone_idx * ns->zsze_diff;
> +		zone.start += zone_gap;
> +		zone.len = ns->zsze_po2;
> +	}
> +
>  	if (zone.cond == BLK_ZONE_COND_FULL)
>  		zone.wp = zone.start + zone.len;
>  	else
> -		zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
> +		zone.wp =
> +			nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)) + zone_gap;
>  
>  	return cb(&zone, idx, data);
>  }
> @@ -173,6 +198,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>  	struct nvme_zone_report *report;
>  	struct nvme_command c = { };
>  	int ret, zone_idx = 0;
> +	u64 zone_size = nvme_zone_size(ns);
>  	unsigned int nz, i;
>  	size_t buflen;
>  
> @@ -190,7 +216,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>  	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>  	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>  
> -	sector &= ~(ns->zsze - 1);
> +	sector = rounddown(sector, zone_size);
>  	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>  		memset(report, 0, buflen);
>  
> @@ -214,7 +240,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>  			zone_idx++;
>  		}
>  
> -		sector += ns->zsze * nz;
> +		sector += zone_size * nz;
>  	}
>  
>  	if (zone_idx > 0)
> @@ -226,6 +252,32 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>  	return ret;
>  }
>  
> +void nvme_ns_po2_zone_emu_setup(struct nvme_ns *ns)
> +{
> +	u32 nr_zones;
> +	sector_t capacity;
> +
> +	if (is_power_of_2(ns->zsze))
> +		return;
> +
> +	if (!get_capacity(ns->disk))
> +		return;
> +
> +	blk_mq_freeze_queue(ns->disk->queue);
> +
> +	blk_queue_po2_zone_emu(ns->queue, 1);
> +	ns->zsze_po2 = 1 << get_count_order_long(ns->zsze);
> +	ns->zsze_diff = ns->zsze_po2 - ns->zsze;
> +
> +	nr_zones = get_capacity(ns->disk) / ns->zsze;
> +	capacity = nr_zones * ns->zsze_po2;
> +	set_capacity_and_notify(ns->disk, capacity);
> +	ns->sect_to_lba = __nvme_sect_to_lba_po2;
> +	ns->update_sector_append = nvme_update_sector_append_po2_zone_emu;
> +
> +	blk_mq_unfreeze_queue(ns->disk->queue);
> +}
> +
>  blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>  		struct nvme_command *c, enum nvme_zone_mgmt_action action)
>  {
> @@ -239,5 +291,24 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>  	if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>  		c->zms.select_all = 1;
>  
> +	nvme_verify_sector_value(ns, req);
> +	return BLK_STS_OK;
> +}
> +
> +blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req)
> +{
> +	/*  The spec mentions that read from ZCAP until ZSZE shall behave
> +	 *  like a deallocated block. Deallocated block reads are
> +	 *  deterministic, hence we fill zero.
> +	 * The spec does not clearly define the result for other opreation.
> +	 */

Comment style and indentation is weird.

> +	if (req_op(req) == REQ_OP_READ) {
> +		zero_fill_bio(req->bio);
> +		nvme_req(req)->status = NVME_SC_SUCCESS;
> +	} else {
> +		nvme_req(req)->status = NVME_SC_WRITE_FAULT;
> +	}

What about requests that straddle the zone capacity ? They need to be
partially zeroed too, otherwise data from the next zone may be exposed.
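
Something along these lines would also be needed to catch the straddling
case (rough sketch only, reusing the zsze/zsze_po2/zsze_diff fields added
by this patch):

sector_t start = blk_rq_pos(req);
sector_t zone_idx = start >> ilog2(ns->zsze_po2);
sector_t gap_start = zone_idx * ns->zsze_po2 + ns->zsze;

if (start < gap_start && start + blk_rq_sectors(req) > gap_start) {
	/* sectors from gap_start to the end of the read map into the
	 * next real zone and must be zero-filled
	 */
}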

> +	blk_mq_set_request_complete(req);
> +	nvme_complete_rq(req);
>  	return BLK_STS_OK;
>  }
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 3a41d50b85d3..9ec59183efcd 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -57,6 +57,8 @@ typedef __u32 __bitwise req_flags_t;
>  #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
>  /* queue has elevator attached */
>  #define RQF_ELV			((__force req_flags_t)(1 << 22))
> +/* request to do IO in the emulated area with po2 zone emulation */
> +#define RQF_ZONE_EMU_VIOLATION	((__force req_flags_t)(1 << 23))
>  
>  /* flags that prevent us from merging requests: */
>  #define RQF_NOMERGE_FLAGS \
Pankaj Raghav March 9, 2022, 2:33 p.m. UTC | #2
On 2022-03-09 05:04, Damien Le Moal wrote:
> On 3/9/22 01:53, Pankaj Raghav wrote:
 
> Contradiction: reducing the impact of performance regression is not the
> same as "does not create a performance regression". So which one is it ?
> Please add performance numbers to this commit message.

> And also please actually explain what the patch is changing. This commit
> message is about the why, but nothing on the how.
>
I will reword the commit log and add a bit more context, along with perf
numbers, in the next revision.
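
For reference, the mapping applied by this series is roughly the
following (sketch only, with illustrative sizes; the real code is in
nvme_ns_po2_zone_emu_setup() and the sect_to_lba/update_sector_append
helpers):

/* e.g. a 96 MiB zone size, which is not a power of 2 */
ns->zsze_po2  = 1 << get_count_order_long(ns->zsze);	/* 128 MiB */
ns->zsze_diff = ns->zsze_po2 - ns->zsze;		/* 32 MiB gap */

/* emulated (block layer) sector -> device sector */
zone_idx   = sector >> ilog2(ns->zsze_po2);
dev_sector = sector - zone_idx * ns->zsze_diff;

/* device sector -> emulated sector, for zone append completions */
zone_no    = dev_sector / ns->zsze;
emu_sector = dev_sector + zone_no * ns->zsze_diff;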
>> +EXPORT_SYMBOL_GPL(nvme_fail_po2_zone_emu_violation);
>> +
> 
> This should go in zns.c, not in the core.
> 
Ok.

>> +
>> +static void nvme_npo2_zone_setup(struct gendisk *disk)
>> +{
>> +	nvme_ns_po2_zone_emu_setup(disk->private_data);
>> +}
> 
> This helper seems useless.
>
I tried to retain the pattern with report_zones which is currently like this:
static int nvme_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
			data);
}

But I can just remove this helper and use nvme_ns_po2_zone_emu_setup cb directly
in nvme_bdev_fops.

>> +
>>  /*
>>   * Convert a 512B sector number to a device logical block number.
>>   */
>>  static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
>>  {
>> -	return sector >> (ns->lba_shift - SECTOR_SHIFT);
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	return ns->sect_to_lba(ns, sector);
> 
> So for a power of 2 zone sized device, you are forcing an indirect call,
> always. Not acceptable. What is the point of that po2_zone_emu boolean
> you added to the queue ?
This is a good point and we had a discussion about this internally.
Initially I had something like this:
if (!blk_queue_is_po2_zone_emu(disk))
	return sector >> (ns->lba_shift - SECTOR_SHIFT);
else
	return __nvme_sect_to_lba_po2(ns, sec);

But @Luis indicated that it was better to set an op, which comes at the cost of an
indirection, instead of having a runtime check with an if/else in the **hot path**. The
code also looks clearer with an op.

The performance analysis that we performed did not show any regression while using the indirection
for po2 zone sizes, at least on the x86_64 architecture.
So maybe we could use this opportunity to discuss which approach could be used here.


>> +
>> +		sector = nvme_lba_to_sect(ns,
>> +					  le64_to_cpu(nvme_req(req)->result.u64));
>> +
>> +		sector = ns->update_sector_append(ns, sector);
> 
> Why not assign that to req->__sector directly ?
> And again here, you are forcing the indirect function call for *all* zns
> devices, even those that have a power of 2 zone size.
>
Same answer as above. I will adapt them based on the outcome of our
discussions.
>> +
>> +		req->__sector = sector;
>> +	}
>> +
>> +static inline bool nvme_po2_zone_emu_violation(struct request *req)
>> +{
>> +	return req->rq_flags & RQF_ZONE_EMU_VIOLATION;
>> +}
> 
> This helper makes the code unreadable in my opinion.
>
I will open code it then.
>> +#else
>> +static inline void nvme_end_req_zoned(struct request *req)
>> +{
>> +}
>> +
>> +static inline void nvme_verify_sector_value(struct nvme_ns *ns, struct request *req)
>> +{
>> +}
>> +
>> +static inline bool nvme_po2_zone_emu_violation(struct request *req)
>> +{
>> +	return false;
>> +}
>> +
>> +#endif
>> +
>>  /*
>>   * Convert byte length to nvme's 0-based num dwords
>>   */
>> @@ -752,6 +842,7 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
>>  long nvme_dev_ioctl(struct file *file, unsigned int cmd,
>>  		unsigned long arg);
>>  int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
>> +blk_status_t nvme_fail_po2_zone_emu_violation(struct request *req);
>>  
>>  extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>  extern const struct pr_ops nvme_pr_ops;
>> @@ -873,11 +964,13 @@ static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
>>  int nvme_revalidate_zones(struct nvme_ns *ns);
>>  int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  		unsigned int nr_zones, report_zones_cb cb, void *data);
>> +void nvme_ns_po2_zone_emu_setup(struct nvme_ns *ns);
>>  #ifdef CONFIG_BLK_DEV_ZONED
>>  int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
>>  blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>>  				       struct nvme_command *cmnd,
>>  				       enum nvme_zone_mgmt_action action);
>> +blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req);
>>  #else
>>  static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
>>  		struct request *req, struct nvme_command *cmnd,
>> @@ -892,6 +985,11 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
>>  		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
>>  	return -EPROTONOSUPPORT;
>>  }
>> +
>> +static inline blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req)
>> +{
>> +	return BLK_STS_OK;
>> +}
>>  #endif
>>  
>>  static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 6a99ed680915..fc022df3f98e 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -960,6 +960,10 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  		return nvme_fail_nonready_command(&dev->ctrl, req);
>>  
>>  	ret = nvme_prep_rq(dev, req);
>> +
>> +	if (unlikely(nvme_po2_zone_emu_violation(req)))
>> +		return nvme_fail_po2_zone_emu_violation(req);
>> +
>>  	if (unlikely(ret))
>>  		return ret;
>>  	spin_lock(&nvmeq->sq_lock);
>> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>> index ad02c61c0b52..25516a5ae7e2 100644
>> --- a/drivers/nvme/host/zns.c
>> +++ b/drivers/nvme/host/zns.c
>> @@ -3,7 +3,9 @@
>>   * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>   */
>>  
>> +#include <linux/log2.h>
>>  #include <linux/blkdev.h>
>> +#include <linux/math.h>
>>  #include <linux/vmalloc.h>
>>  #include "nvme.h"
>>  
>> @@ -46,6 +48,18 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>  	return 0;
>>  }
>>  
>> +static sector_t nvme_zone_size(struct nvme_ns *ns)
>> +{
>> +	sector_t zone_size;
>> +
>> +	if (blk_queue_is_po2_zone_emu(ns->queue))
>> +		zone_size = ns->zsze_po2;
>> +	else
>> +		zone_size = ns->zsze;
>> +
>> +	return zone_size;
>> +}
>> +
>>  int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
>>  {
>>  	struct nvme_effects_log *log = ns->head->effects;
>> @@ -122,7 +136,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>  				   sizeof(struct nvme_zone_descriptor);
>>  
>>  	nr_zones = min_t(unsigned int, nr_zones,
>> -			 get_capacity(ns->disk) / ns->zsze);
>> +			 get_capacity(ns->disk) / nvme_zone_size(ns));
>>  
>>  	bufsize = sizeof(struct nvme_zone_report) +
>>  		nr_zones * sizeof(struct nvme_zone_descriptor);
>> @@ -147,6 +161,8 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>  				 void *data)
>>  {
>>  	struct blk_zone zone = { };
>> +	u64 zone_gap = 0;
>> +	u32 zone_idx;
>>  
>>  	if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>  		dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>> @@ -159,10 +175,19 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>  	zone.len = ns->zsze;
>>  	zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>  	zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>> +
>> +	if (blk_queue_is_po2_zone_emu(ns->queue)) {
>> +		zone_idx = zone.start / zone.len;
>> +		zone_gap = zone_idx * ns->zsze_diff;
>> +		zone.start += zone_gap;
>> +		zone.len = ns->zsze_po2;
>> +	}
>> +
>>  	if (zone.cond == BLK_ZONE_COND_FULL)
>>  		zone.wp = zone.start + zone.len;
>>  	else
>> -		zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>> +		zone.wp =
>> +			nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)) + zone_gap;
>>  
>>  	return cb(&zone, idx, data);
>>  }
>> @@ -173,6 +198,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  	struct nvme_zone_report *report;
>>  	struct nvme_command c = { };
>>  	int ret, zone_idx = 0;
>> +	u64 zone_size = nvme_zone_size(ns);
>>  	unsigned int nz, i;
>>  	size_t buflen;
>>  
>> @@ -190,7 +216,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>  	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>  
>> -	sector &= ~(ns->zsze - 1);
>> +	sector = rounddown(sector, zone_size);
>>  	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>  		memset(report, 0, buflen);
>>  
>> @@ -214,7 +240,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  			zone_idx++;
>>  		}
>>  
>> -		sector += ns->zsze * nz;
>> +		sector += zone_size * nz;
>>  	}
>>  
>>  	if (zone_idx > 0)
>> @@ -226,6 +252,32 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  	return ret;
>>  }
>>  
>> +void nvme_ns_po2_zone_emu_setup(struct nvme_ns *ns)
>> +{
>> +	u32 nr_zones;
>> +	sector_t capacity;
>> +
>> +	if (is_power_of_2(ns->zsze))
>> +		return;
>> +
>> +	if (!get_capacity(ns->disk))
>> +		return;
>> +
>> +	blk_mq_freeze_queue(ns->disk->queue);
>> +
>> +	blk_queue_po2_zone_emu(ns->queue, 1);
>> +	ns->zsze_po2 = 1 << get_count_order_long(ns->zsze);
>> +	ns->zsze_diff = ns->zsze_po2 - ns->zsze;
>> +
>> +	nr_zones = get_capacity(ns->disk) / ns->zsze;
>> +	capacity = nr_zones * ns->zsze_po2;
>> +	set_capacity_and_notify(ns->disk, capacity);
>> +	ns->sect_to_lba = __nvme_sect_to_lba_po2;
>> +	ns->update_sector_append = nvme_update_sector_append_po2_zone_emu;
>> +
>> +	blk_mq_unfreeze_queue(ns->disk->queue);
>> +}
>> +
>>  blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>>  		struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>  {
>> @@ -239,5 +291,24 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>>  	if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>  		c->zms.select_all = 1;
>>  
>> +	nvme_verify_sector_value(ns, req);
>> +	return BLK_STS_OK;
>> +}
>> +
>> +blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req)
>> +{
>> +	/*  The spec mentions that read from ZCAP until ZSZE shall behave
>> +	 *  like a deallocated block. Deallocated block reads are
>> +	 *  deterministic, hence we fill zero.
>> +	 * The spec does not clearly define the result for other opreation.
>> +	 */
> 
> Comment style and indentation is weird.
>
Ack.
>> +	if (req_op(req) == REQ_OP_READ) {
>> +		zero_fill_bio(req->bio);
>> +		nvme_req(req)->status = NVME_SC_SUCCESS;
>> +	} else {
>> +		nvme_req(req)->status = NVME_SC_WRITE_FAULT;
>> +	}
> 
> What about requests that straddle the zone capacity ? They need to be
> partially zeroed too, otherwise data from the next zone may be exposed.
>
Good point. I will add this support in the next revision. Thanks.
>> +	blk_mq_set_request_complete(req);
>> +	nvme_complete_rq(req);
>>  	return BLK_STS_OK;
>>  }
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
>> index 3a41d50b85d3..9ec59183efcd 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -57,6 +57,8 @@ typedef __u32 __bitwise req_flags_t;
>>  #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
>>  /* queue has elevator attached */
>>  #define RQF_ELV			((__force req_flags_t)(1 << 22))
>> +/* request to do IO in the emulated area with po2 zone emulation */
>> +#define RQF_ZONE_EMU_VIOLATION	((__force req_flags_t)(1 << 23))
>>  
>>  /* flags that prevent us from merging requests: */
>>  #define RQF_NOMERGE_FLAGS \
> 
>
Damien Le Moal March 9, 2022, 9:43 p.m. UTC | #3
On 3/9/22 23:33, Pankaj Raghav wrote:
> 
> 
> On 2022-03-09 05:04, Damien Le Moal wrote:
>> On 3/9/22 01:53, Pankaj Raghav wrote:
>  
>> Contradiction: reducing the impact of performance regression is not the
>> same as "does not create a performance regression". So which one is it ?
>> Please add performance numbers to this commit message.
> 
>> And also please actually explain what the patch is changing. This commit
>> message is about the why, but nothing on the how.
>>
> I will reword and add a bit more context to the commit log with perf numbers
> in the next revision
>>> +EXPORT_SYMBOL_GPL(nvme_fail_po2_zone_emu_violation);
>>> +
>>
>> This should go in zns.c, not in the core.
>>
> Ok.
> 
>>> +
>>> +static void nvme_npo2_zone_setup(struct gendisk *disk)
>>> +{
>>> +	nvme_ns_po2_zone_emu_setup(disk->private_data);
>>> +}
>>
>> This helper seems useless.
>>
> I tried to retain the pattern with report_zones which is currently like this:
> static int nvme_report_zones(struct gendisk *disk, sector_t sector,
> 		unsigned int nr_zones, report_zones_cb cb, void *data)
> {
> 	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
> 			data);
> }
> 
> But I can just remove this helper and use nvme_ns_po2_zone_emu_setup cb directly
> in nvme_bdev_fops.
> 
>>> +
>>>  /*
>>>   * Convert a 512B sector number to a device logical block number.
>>>   */
>>>  static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
>>>  {
>>> -	return sector >> (ns->lba_shift - SECTOR_SHIFT);
>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>> +	return ns->sect_to_lba(ns, sector);
>>
>> So for a power of 2 zone sized device, you are forcing an indirect call,
>> always. Not acceptable. What is the point of that po2_zone_emu boolean
>> you added to the queue ?
> This is a good point and we had a discussion about this internally.
> Initially I had something like this:
> if (!blk_queue_is_po2_zone_emu(disk))
> 	return sector >> (ns->lba_shift - SECTOR_SHIFT);
> else
> 	return __nvme_sect_to_lba_po2(ns, sec);

No need for the else.
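
I.e. just (roughly):

if (blk_queue_is_po2_zone_emu(ns->queue))
	return __nvme_sect_to_lba_po2(ns, sector);
return sector >> (ns->lba_shift - SECTOR_SHIFT);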

> 
> But @Luis indicated that it was better to set an op, which comes at the cost of an
> indirection, instead of having a runtime check with an if/else in the **hot path**. The
> code also looks clearer with an op.

The indirect call using a function pointer makes the code obscure. And
the cost of that call is far greater, and always present, compared to
an if, where CPU branch prediction will most of the time avoid taking
the wrong branch.

> 
> The performance analysis that we performed did not show any regression while using the indirection
> for po2 zone sizes, at least on the x86_64 architecture.
> So maybe we could use this opportunity to discuss which approach could be used here.

The easiest one that makes the code clear and easy to understand.
Luis Chamberlain March 10, 2022, 8:35 p.m. UTC | #4
On Thu, Mar 10, 2022 at 06:43:47AM +0900, Damien Le Moal wrote:
> On 3/9/22 23:33, Pankaj Raghav wrote:
> > On 2022-03-09 05:04, Damien Le Moal wrote:
> >> So for a power of 2 zone sized device, you are forcing an indirect call,
> >> always. Not acceptable. What is the point of that po2_zone_emu boolean
> >> you added to the queue ?
> > This is a good point and we had a discussion about this internally.
> > Initially I had something like this:
> > if (!blk_queue_is_po2_zone_emu(disk))
> > 	return sector >> (ns->lba_shift - SECTOR_SHIFT);
> > else
> > 	return __nvme_sect_to_lba_po2(ns, sec);
> 
> No need for the else.

If true then great.

> > But @Luis indicated that it was better to set an op, which comes at the cost of an
> > indirection, instead of having a runtime check with an if/else in the **hot path**. The
> > code also looks clearer with an op.
> 
> The indirect call using a function pointer makes the code obscure. And
> the cost of that call is far greater, and always present, compared to
> an if, where CPU branch prediction will most of the time avoid taking
> the wrong branch.

The goal was to ensure no performance impact, and given a hot path
was involved and we simply cannot microbench append as there is no
way / API to do that, we can't be sure. But if you are certain that
there is no perf impact, it would be wonderful to live without it.

Thanks for the suggestion and push!

  Luis
Damien Le Moal March 10, 2022, 11:50 p.m. UTC | #5
On 3/11/22 05:35, Luis Chamberlain wrote:
> On Thu, Mar 10, 2022 at 06:43:47AM +0900, Damien Le Moal wrote:
>> On 3/9/22 23:33, Pankaj Raghav wrote:
>>> On 2022-03-09 05:04, Damien Le Moal wrote:
>>>> So for a power of 2 zone sized device, you are forcing an indirect call,
>>>> always. Not acceptable. What is the point of that po2_zone_emu boolean
>>>> you added to the queue ?
>>> This is a good point and we had a discussion about this internally.
>>> Initially I had something like this:
>>> if (!blk_queue_is_po2_zone_emu(disk))
>>> 	return sector >> (ns->lba_shift - SECTOR_SHIFT);
>>> else
>>> 	return __nvme_sect_to_lba_po2(ns, sec);
>>
>> No need for the else.
> 
> If true then great.

If true ? The else is clearly not needed here. One less line of code.

> 
>>> But @Luis indicated that it was better to set an op, which comes at the cost of an
>>> indirection, instead of having a runtime check with an if/else in the **hot path**. The
>>> code also looks clearer with an op.
>>
>> The indirect call using a function pointer makes the code obscure. And
>> the cost of that call is far greater, and always present, compared to
>> an if, where CPU branch prediction will most of the time avoid taking
>> the wrong branch.
> 
> The goal was to ensure no performance impact, and given a hot path
> was involved and we simply cannot microbench append as there is no
> way / API to do that, we can't be sure. But if you are certain that
> there is no perf impact, it would be wonderful to live without it.

Use zonefs. It uses zone append.
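
E.g. something roughly like this against a sequential zone file (the
path, block size and iteration count are illustrative only, and it
assumes the file/zone starts empty so that writes land on the write
pointer):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	const size_t bs = 4096;	/* multiple of the device LBA size */
	const int nr = 16384;
	struct timespec t0, t1;
	void *buf;
	int fd, i;

	if (posix_memalign(&buf, bs, bs))
		return 1;
	memset(buf, 0xaa, bs);

	fd = open("/mnt/zonefs/seq/0", O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < nr; i++) {
		if (write(fd, buf, bs) != (ssize_t)bs) {
			perror("write");
			return 1;
		}
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%d x %zu B writes in %.3f s\n", nr, bs,
	       (t1.tv_sec - t0.tv_sec) +
	       (t1.tv_nsec - t0.tv_nsec) / 1e9);
	close(fd);
	free(buf);
	return 0;
}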

> 
> Thanks for the suggestion and push!
> 
>   Luis
Luis Chamberlain March 11, 2022, 12:56 a.m. UTC | #6
On Fri, Mar 11, 2022 at 08:50:47AM +0900, Damien Le Moal wrote:
> On 3/11/22 05:35, Luis Chamberlain wrote:
> > On Thu, Mar 10, 2022 at 06:43:47AM +0900, Damien Le Moal wrote:
> >> The indirect call using a function pointer makes the code obscure. And
> >> the cost of that call is far greater, and always present, compared to
> >> an if, where CPU branch prediction will most of the time avoid taking
> >> the wrong branch.
> > 
> > The goal was to ensure no performance impact, and given a hot path
> > was involved and we simply cannot microbench append as there is no
> > way / API to do that, we can't be sure. But if you are certain that
> > there is no perf impact, it would be wonderful to live without it.
> 
> Use zonefs. It uses zone append.

That'd be a microbench on zonefs append, not raw append, and so I
suspect a simple branch cannot make a difference and would get lost
as noise. So I don't think a perf test is required here. Please
let me know.

  Luis

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fd4720d37cc0..c7180d512b08 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -327,14 +327,6 @@  static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req_zoned(struct request *req)
-{
-	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-	    req_op(req) == REQ_OP_ZONE_APPEND)
-		req->__sector = nvme_lba_to_sect(req->q->queuedata,
-			le64_to_cpu(nvme_req(req)->result.u64));
-}
-
 static inline void nvme_end_req(struct request *req)
 {
 	blk_status_t status = nvme_error_status(nvme_req(req)->status);
@@ -676,6 +668,12 @@  blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
 }
 EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
 
+blk_status_t nvme_fail_po2_zone_emu_violation(struct request *req)
+{
+	return nvme_zone_handle_po2_emu_violation(req);
+}
+EXPORT_SYMBOL_GPL(nvme_fail_po2_zone_emu_violation);
+
 bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 		bool queue_live)
 {
@@ -879,6 +877,7 @@  static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	req->special_vec.bv_offset = offset_in_page(range);
 	req->special_vec.bv_len = alloc_size;
 	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+	nvme_verify_sector_value(ns, req);
 
 	return BLK_STS_OK;
 }
@@ -909,6 +908,7 @@  static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 			break;
 		}
 	}
+	nvme_verify_sector_value(ns, req);
 
 	return BLK_STS_OK;
 }
@@ -973,6 +973,7 @@  static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 
 	cmnd->rw.control = cpu_to_le16(control);
 	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+	nvme_verify_sector_value(ns, req);
 	return 0;
 }
 
@@ -2105,8 +2106,14 @@  static int nvme_report_zones(struct gendisk *disk, sector_t sector,
 	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
 			data);
 }
+
+static void nvme_npo2_zone_setup(struct gendisk *disk)
+{
+	nvme_ns_po2_zone_emu_setup(disk->private_data);
+}
 #else
 #define nvme_report_zones	NULL
+#define nvme_npo2_zone_setup	NULL
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 static const struct block_device_operations nvme_bdev_ops = {
@@ -2116,6 +2123,7 @@  static const struct block_device_operations nvme_bdev_ops = {
 	.release	= nvme_release,
 	.getgeo		= nvme_getgeo,
 	.report_zones	= nvme_report_zones,
+	.npo2_zone_setup = nvme_npo2_zone_setup,
 	.pr_ops		= &nvme_pr_ops,
 };
 
@@ -3844,6 +3852,10 @@  static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	ns->disk = disk;
 	ns->queue = disk->queue;
 
+#ifdef CONFIG_BLK_DEV_ZONED
+	ns->sect_to_lba = __nvme_sect_to_lba;
+	ns->update_sector_append = nvme_update_sector_append_noop;
+#endif
 	if (ctrl->opts && ctrl->opts->data_digest)
 		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a162f6c6da6e..f584f760e8cc 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -457,6 +457,10 @@  struct nvme_ns {
 	u8 pi_type;
 #ifdef CONFIG_BLK_DEV_ZONED
 	u64 zsze;
+	u64 zsze_po2;
+	u32 zsze_diff;
+	u64 (*sect_to_lba)(struct nvme_ns *ns, sector_t sector);
+	sector_t (*update_sector_append)(struct nvme_ns *ns, sector_t sector);
 #endif
 	unsigned long features;
 	unsigned long flags;
@@ -562,12 +566,21 @@  static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
 	return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
 }
 
+static inline u64 __nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
+{
+	return sector >> (ns->lba_shift - SECTOR_SHIFT);
+}
+
 /*
  * Convert a 512B sector number to a device logical block number.
  */
 static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
 {
-	return sector >> (ns->lba_shift - SECTOR_SHIFT);
+#ifdef CONFIG_BLK_DEV_ZONED
+	return ns->sect_to_lba(ns, sector);
+#else
+	return __nvme_sect_to_lba(ns, sector);
+#endif
 }
 
 /*
@@ -578,6 +591,83 @@  static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
 	return lba << (ns->lba_shift - SECTOR_SHIFT);
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline u64 __nvme_sect_to_lba_po2(struct nvme_ns *ns, sector_t sector)
+{
+	sector_t zone_idx = sector >> ilog2(ns->zsze_po2);
+
+	sector = sector - (zone_idx * ns->zsze_diff);
+
+	return sector >> (ns->lba_shift - SECTOR_SHIFT);
+}
+
+static inline sector_t
+nvme_update_sector_append_po2_zone_emu(struct nvme_ns *ns, sector_t sector)
+{
+	/* The sector value passed by the drive after a append operation is the
+	 * based on the actual zone layout in the device, hence, use the actual
+	 * zone_size to calculate the zone number from the sector.
+	 */
+	u32 zone_no = sector / ns->zsze;
+
+	sector += ns->zsze_diff * zone_no;
+	return sector;
+}
+
+static inline sector_t nvme_update_sector_append_noop(struct nvme_ns *ns,
+						      sector_t sector)
+{
+	return sector;
+}
+
+static inline void nvme_end_req_zoned(struct request *req)
+{
+	if (req_op(req) == REQ_OP_ZONE_APPEND) {
+		struct nvme_ns *ns = req->q->queuedata;
+		sector_t sector;
+
+		sector = nvme_lba_to_sect(ns,
+					  le64_to_cpu(nvme_req(req)->result.u64));
+
+		sector = ns->update_sector_append(ns, sector);
+
+		req->__sector = sector;
+	}
+}
+
+static inline void nvme_verify_sector_value(struct nvme_ns *ns, struct request *req)
+{
+	if (unlikely(blk_queue_is_po2_zone_emu(ns->queue))) {
+		sector_t sector = blk_rq_pos(req);
+		sector_t zone_idx = sector >> ilog2(ns->zsze_po2);
+		sector_t device_sector = sector - (zone_idx * ns->zsze_diff);
+
+		/* Check if the IO is in the emulated area */
+		if (device_sector - (zone_idx * ns->zsze) > ns->zsze)
+			req->rq_flags |= RQF_ZONE_EMU_VIOLATION;
+	}
+}
+
+static inline bool nvme_po2_zone_emu_violation(struct request *req)
+{
+	return req->rq_flags & RQF_ZONE_EMU_VIOLATION;
+}
+#else
+static inline void nvme_end_req_zoned(struct request *req)
+{
+}
+
+static inline void nvme_verify_sector_value(struct nvme_ns *ns, struct request *req)
+{
+}
+
+static inline bool nvme_po2_zone_emu_violation(struct request *req)
+{
+	return false;
+}
+
+#endif
+
 /*
  * Convert byte length to nvme's 0-based num dwords
  */
@@ -752,6 +842,7 @@  long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+blk_status_t nvme_fail_po2_zone_emu_violation(struct request *req);
 
 extern const struct attribute_group *nvme_ns_id_attr_groups[];
 extern const struct pr_ops nvme_pr_ops;
@@ -873,11 +964,13 @@  static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 int nvme_revalidate_zones(struct nvme_ns *ns);
 int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
 		unsigned int nr_zones, report_zones_cb cb, void *data);
+void nvme_ns_po2_zone_emu_setup(struct nvme_ns *ns);
 #ifdef CONFIG_BLK_DEV_ZONED
 int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
 blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
 				       struct nvme_command *cmnd,
 				       enum nvme_zone_mgmt_action action);
+blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req);
 #else
 static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
 		struct request *req, struct nvme_command *cmnd,
@@ -892,6 +985,11 @@  static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
 	return -EPROTONOSUPPORT;
 }
+
+static inline blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req)
+{
+	return BLK_STS_OK;
+}
 #endif
 
 static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6a99ed680915..fc022df3f98e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -960,6 +960,10 @@  static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		return nvme_fail_nonready_command(&dev->ctrl, req);
 
 	ret = nvme_prep_rq(dev, req);
+
+	if (unlikely(nvme_po2_zone_emu_violation(req)))
+		return nvme_fail_po2_zone_emu_violation(req);
+
 	if (unlikely(ret))
 		return ret;
 	spin_lock(&nvmeq->sq_lock);
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index ad02c61c0b52..25516a5ae7e2 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -3,7 +3,9 @@ 
  * Copyright (C) 2020 Western Digital Corporation or its affiliates.
  */
 
+#include <linux/log2.h>
 #include <linux/blkdev.h>
+#include <linux/math.h>
 #include <linux/vmalloc.h>
 #include "nvme.h"
 
@@ -46,6 +48,18 @@  static int nvme_set_max_append(struct nvme_ctrl *ctrl)
 	return 0;
 }
 
+static sector_t nvme_zone_size(struct nvme_ns *ns)
+{
+	sector_t zone_size;
+
+	if (blk_queue_is_po2_zone_emu(ns->queue))
+		zone_size = ns->zsze_po2;
+	else
+		zone_size = ns->zsze;
+
+	return zone_size;
+}
+
 int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 {
 	struct nvme_effects_log *log = ns->head->effects;
@@ -122,7 +136,7 @@  static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
 				   sizeof(struct nvme_zone_descriptor);
 
 	nr_zones = min_t(unsigned int, nr_zones,
-			 get_capacity(ns->disk) / ns->zsze);
+			 get_capacity(ns->disk) / nvme_zone_size(ns));
 
 	bufsize = sizeof(struct nvme_zone_report) +
 		nr_zones * sizeof(struct nvme_zone_descriptor);
@@ -147,6 +161,8 @@  static int nvme_zone_parse_entry(struct nvme_ns *ns,
 				 void *data)
 {
 	struct blk_zone zone = { };
+	u64 zone_gap = 0;
+	u32 zone_idx;
 
 	if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
 		dev_err(ns->ctrl->device, "invalid zone type %#x\n",
@@ -159,10 +175,19 @@  static int nvme_zone_parse_entry(struct nvme_ns *ns,
 	zone.len = ns->zsze;
 	zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
 	zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
+
+	if (blk_queue_is_po2_zone_emu(ns->queue)) {
+		zone_idx = zone.start / zone.len;
+		zone_gap = zone_idx * ns->zsze_diff;
+		zone.start += zone_gap;
+		zone.len = ns->zsze_po2;
+	}
+
 	if (zone.cond == BLK_ZONE_COND_FULL)
 		zone.wp = zone.start + zone.len;
 	else
-		zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
+		zone.wp =
+			nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)) + zone_gap;
 
 	return cb(&zone, idx, data);
 }
@@ -173,6 +198,7 @@  int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
 	struct nvme_zone_report *report;
 	struct nvme_command c = { };
 	int ret, zone_idx = 0;
+	u64 zone_size = nvme_zone_size(ns);
 	unsigned int nz, i;
 	size_t buflen;
 
@@ -190,7 +216,7 @@  int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
 	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
 	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
 
-	sector &= ~(ns->zsze - 1);
+	sector = rounddown(sector, zone_size);
 	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
 		memset(report, 0, buflen);
 
@@ -214,7 +240,7 @@  int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
 			zone_idx++;
 		}
 
-		sector += ns->zsze * nz;
+		sector += zone_size * nz;
 	}
 
 	if (zone_idx > 0)
@@ -226,6 +252,32 @@  int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
 	return ret;
 }
 
+void nvme_ns_po2_zone_emu_setup(struct nvme_ns *ns)
+{
+	u32 nr_zones;
+	sector_t capacity;
+
+	if (is_power_of_2(ns->zsze))
+		return;
+
+	if (!get_capacity(ns->disk))
+		return;
+
+	blk_mq_freeze_queue(ns->disk->queue);
+
+	blk_queue_po2_zone_emu(ns->queue, 1);
+	ns->zsze_po2 = 1 << get_count_order_long(ns->zsze);
+	ns->zsze_diff = ns->zsze_po2 - ns->zsze;
+
+	nr_zones = get_capacity(ns->disk) / ns->zsze;
+	capacity = nr_zones * ns->zsze_po2;
+	set_capacity_and_notify(ns->disk, capacity);
+	ns->sect_to_lba = __nvme_sect_to_lba_po2;
+	ns->update_sector_append = nvme_update_sector_append_po2_zone_emu;
+
+	blk_mq_unfreeze_queue(ns->disk->queue);
+}
+
 blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *c, enum nvme_zone_mgmt_action action)
 {
@@ -239,5 +291,24 @@  blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
 	if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
 		c->zms.select_all = 1;
 
+	nvme_verify_sector_value(ns, req);
+	return BLK_STS_OK;
+}
+
+blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req)
+{
+	/*  The spec mentions that read from ZCAP until ZSZE shall behave
+	 *  like a deallocated block. Deallocated block reads are
+	 *  deterministic, hence we fill zero.
+	 * The spec does not clearly define the result for other opreation.
+	 */
+	if (req_op(req) == REQ_OP_READ) {
+		zero_fill_bio(req->bio);
+		nvme_req(req)->status = NVME_SC_SUCCESS;
+	} else {
+		nvme_req(req)->status = NVME_SC_WRITE_FAULT;
+	}
+	blk_mq_set_request_complete(req);
+	nvme_complete_rq(req);
 	return BLK_STS_OK;
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3a41d50b85d3..9ec59183efcd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -57,6 +57,8 @@  typedef __u32 __bitwise req_flags_t;
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
 /* queue has elevator attached */
 #define RQF_ELV			((__force req_flags_t)(1 << 22))
+/* request to do IO in the emulated area with po2 zone emulation */
+#define RQF_ZONE_EMU_VIOLATION	((__force req_flags_t)(1 << 23))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \