[5/5] nvme: support for zoned namespaces

Message ID 20200615233424.13458-6-keith.busch@wdc.com (mailing list archive)
State New, archived
Series nvme support for zoned namespace command set

Commit Message

Keith Busch June 15, 2020, 11:34 p.m. UTC
Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
in NVM Express TP4053. Zoned namespaces are discovered based on their
Command Set Identifier reported in the namespace's Namespace
Identification Descriptor list. A successfully discovered Zoned
Namespace will be registered with the block layer as a host-managed
zoned block device with Zone Append command support. A namespace that
does not support append is not supported by the driver.
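
For reference, a Zone Append completion returns the LBA assigned to the
write in the CQE result dword, and the driver converts that LBA to a
512-byte sector when completing the request. A minimal sketch of the
conversion (nvme_lba_to_sect() is the existing helper in
drivers/nvme/host/nvme.h; the sample numbers are illustrative):

    /* Convert a device LBA to a 512B block layer sector.
     * ns->lba_shift is log2 of the LBA size, e.g. 12 for 4K LBAs. */
    static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
    {
        return lba << (ns->lba_shift - SECTOR_SHIFT);
    }

    /* With 4K LBAs, an append placed at LBA 100 completes with
     * req->__sector = 100 << (12 - 9) = 800. */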

Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Keith Busch <keith.busch@wdc.com>
---
 drivers/nvme/host/Makefile |   1 +
 drivers/nvme/host/core.c   |  91 ++++++++++++--
 drivers/nvme/host/nvme.h   |  39 ++++++
 drivers/nvme/host/zns.c    | 238 +++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h       | 111 +++++++++++++++++
 5 files changed, 468 insertions(+), 12 deletions(-)
 create mode 100644 drivers/nvme/host/zns.c

Comments

Javier González June 16, 2020, 10:41 a.m. UTC | #1
On 16.06.2020 08:34, Keith Busch wrote:
>Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>in NVM Express TP4053. Zoned namespaces are discovered based on their
>Command Set Identifier reported in the namespace's Namespace
>Identification Descriptor list. A successfully discovered Zoned
>Namespace will be registered with the block layer as a host-managed
>zoned block device with Zone Append command support. A namespace that
>does not support append is not supported by the driver.

Why are we enforcing the append command? Append is optional in the
current ZNS specification, so we should not make this mandatory in the
implementation. See specifics below.

>
>Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
>Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
>Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
>Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
>Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
>Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
>Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
>Signed-off-by: Keith Busch <keith.busch@wdc.com>
>---
> drivers/nvme/host/Makefile |   1 +
> drivers/nvme/host/core.c   |  91 ++++++++++++--
> drivers/nvme/host/nvme.h   |  39 ++++++
> drivers/nvme/host/zns.c    | 238 +++++++++++++++++++++++++++++++++++++
> include/linux/nvme.h       | 111 +++++++++++++++++
> 5 files changed, 468 insertions(+), 12 deletions(-)
> create mode 100644 drivers/nvme/host/zns.c
>
>diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>index fc7b26be692d..d7f6a87687b8 100644
>--- a/drivers/nvme/host/Makefile
>+++ b/drivers/nvme/host/Makefile
>@@ -13,6 +13,7 @@ nvme-core-y				:= core.o
> nvme-core-$(CONFIG_TRACING)		+= trace.o
> nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
> nvme-core-$(CONFIG_NVM)			+= lightnvm.o
>+nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
> nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
>
>diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>index 58f137b9f2c5..e961910da4ac 100644
>--- a/drivers/nvme/host/core.c
>+++ b/drivers/nvme/host/core.c
>@@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
> static struct class *nvme_class;
> static struct class *nvme_subsys_class;
>
>-static int nvme_revalidate_disk(struct gendisk *disk);
>+static int _nvme_revalidate_disk(struct gendisk *disk);
> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
> 					   unsigned nsid);
>@@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
> 			nvme_retry_req(req);
> 			return;
> 		}
>+	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>+		   req_op(req) == REQ_OP_ZONE_APPEND) {
>+		req->__sector = nvme_lba_to_sect(req->q->queuedata,
>+			le64_to_cpu(nvme_req(req)->result.u64));
> 	}
>
> 	nvme_trace_bio_complete(req, status);
>@@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
> }
>
> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>-		struct request *req, struct nvme_command *cmnd)
>+		struct request *req, struct nvme_command *cmnd,
>+		enum nvme_opcode op)
> {
> 	struct nvme_ctrl *ctrl = ns->ctrl;
> 	u16 control = 0;
>@@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> 	if (req->cmd_flags & REQ_RAHEAD)
> 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>
>-	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
>+	cmnd->rw.opcode = op;
> 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
> 	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
> 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
>@@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> 		case NVME_NS_DPS_PI_TYPE2:
> 			control |= NVME_RW_PRINFO_PRCHK_GUARD |
> 					NVME_RW_PRINFO_PRCHK_REF;
>+			if (op == nvme_cmd_zone_append)
>+				control |= NVME_RW_APPEND_PIREMAP;
> 			cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
> 			break;
> 		}
>@@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
> 	case REQ_OP_FLUSH:
> 		nvme_setup_flush(ns, cmd);
> 		break;
>+	case REQ_OP_ZONE_RESET_ALL:
>+	case REQ_OP_ZONE_RESET:
>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
>+		break;
>+	case REQ_OP_ZONE_OPEN:
>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
>+		break;
>+	case REQ_OP_ZONE_CLOSE:
>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
>+		break;
>+	case REQ_OP_ZONE_FINISH:
>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>+		break;
> 	case REQ_OP_WRITE_ZEROES:
> 		ret = nvme_setup_write_zeroes(ns, req, cmd);
> 		break;
>@@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
> 		ret = nvme_setup_discard(ns, req, cmd);
> 		break;
> 	case REQ_OP_READ:
>+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>+		break;
> 	case REQ_OP_WRITE:
>-		ret = nvme_setup_rw(ns, req, cmd);
>+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>+		break;
>+	case REQ_OP_ZONE_APPEND:
>+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
> 		break;
> 	default:
> 		WARN_ON_ONCE(1);
>@@ -1392,14 +1417,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> 	return effects;
> }
>
>-static void nvme_update_formats(struct nvme_ctrl *ctrl)
>+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
> {
> 	struct nvme_ns *ns;
>
> 	down_read(&ctrl->namespaces_rwsem);
> 	list_for_each_entry(ns, &ctrl->namespaces, list)
>-		if (ns->disk && nvme_revalidate_disk(ns->disk))
>+		if (ns->disk && _nvme_revalidate_disk(ns->disk))
> 			nvme_set_queue_dying(ns);
>+		else if (blk_queue_is_zoned(ns->disk->queue)) {
>+			/*
>+			 * IO commands are required to fully revalidate a zoned
>+			 * device. Force the command effects to trigger rescan
>+			 * work so report zones can run in a context with
>+			 * unfrozen IO queues.
>+			 */
>+			*effects |= NVME_CMD_EFFECTS_NCC;
>+		}
> 	up_read(&ctrl->namespaces_rwsem);
> }
>
>@@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
> 	 * this command.
> 	 */
> 	if (effects & NVME_CMD_EFFECTS_LBCC)
>-		nvme_update_formats(ctrl);
>+		nvme_update_formats(ctrl, &effects);
> 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
> 		nvme_unfreeze(ctrl);
> 		nvme_mpath_unfreeze(ctrl->subsys);
>@@ -1526,7 +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>  * Issue ioctl requests on the first available path.  Note that unlike normal
>  * block layer requests we will not retry failed request on another controller.
>  */
>-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> 		struct nvme_ns_head **head, int *srcu_idx)
> {
> #ifdef CONFIG_NVME_MULTIPATH
>@@ -1546,7 +1580,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
> 	return disk->private_data;
> }
>
>-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
> {
> 	if (head)
> 		srcu_read_unlock(&head->srcu, idx);
>@@ -1939,21 +1973,28 @@ static void nvme_update_disk_info(struct gendisk *disk,
>
> static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> {
>+	unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
> 	struct nvme_ns *ns = disk->private_data;
> 	struct nvme_ctrl *ctrl = ns->ctrl;
>+	int ret;
> 	u32 iob;
>
> 	/*
> 	 * If identify namespace failed, use default 512 byte block size so
> 	 * block layer can use before failing read/write for 0 capacity.
> 	 */
>-	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>+	ns->lba_shift = id->lbaf[lbaf].ds;
> 	if (ns->lba_shift == 0)
> 		ns->lba_shift = 9;
>
> 	switch (ns->head->ids.csi) {
> 	case NVME_CSI_NVM:
> 		break;
>+	case NVME_CSI_ZNS:
>+		ret = nvme_update_zone_info(disk, ns, lbaf);
>+		if (ret)
>+			return ret;
>+		break;
> 	default:
> 		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
> 			ns->head->ids.csi, ns->head->ns_id);
>@@ -1967,7 +2008,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> 		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>
> 	ns->features = 0;
>-	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
>+	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
> 	/* the PI implementation requires metadata equal t10 pi tuple size */
> 	if (ns->ms == sizeof(struct t10_pi_tuple))
> 		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>@@ -2010,7 +2051,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> 	return 0;
> }
>
>-static int nvme_revalidate_disk(struct gendisk *disk)
>+static int _nvme_revalidate_disk(struct gendisk *disk)
> {
> 	struct nvme_ns *ns = disk->private_data;
> 	struct nvme_ctrl *ctrl = ns->ctrl;
>@@ -2058,6 +2099,28 @@ static int nvme_revalidate_disk(struct gendisk *disk)
> 	return ret;
> }
>
>+static int nvme_revalidate_disk(struct gendisk *disk)
>+{
>+	int ret;
>+
>+	ret = _nvme_revalidate_disk(disk);
>+	if (ret)
>+		return ret;
>+
>+#ifdef CONFIG_BLK_DEV_ZONED
>+	if (blk_queue_is_zoned(disk->queue)) {
>+		struct nvme_ns *ns = disk->private_data;
>+		struct nvme_ctrl *ctrl = ns->ctrl;
>+
>+		ret = blk_revalidate_disk_zones(disk, NULL);
>+		if (!ret)
>+			blk_queue_max_zone_append_sectors(disk->queue,
>+							  ctrl->max_zone_append);
>+	}
>+#endif
>+	return ret;
>+}
>+
> static char nvme_pr_type(enum pr_type type)
> {
> 	switch (type) {
>@@ -2188,6 +2251,7 @@ static const struct block_device_operations nvme_fops = {
> 	.release	= nvme_release,
> 	.getgeo		= nvme_getgeo,
> 	.revalidate_disk= nvme_revalidate_disk,
>+	.report_zones	= nvme_report_zones,
> 	.pr_ops		= &nvme_pr_ops,
> };
>
>@@ -2213,6 +2277,7 @@ const struct block_device_operations nvme_ns_head_ops = {
> 	.ioctl		= nvme_ioctl,
> 	.compat_ioctl	= nvme_compat_ioctl,
> 	.getgeo		= nvme_getgeo,
>+	.report_zones	= nvme_report_zones,
> 	.pr_ops		= &nvme_pr_ops,
> };
> #endif /* CONFIG_NVME_MULTIPATH */
>@@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
> 	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
> 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
> 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
>+	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
>+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
> 	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
> 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
> 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>index 58428e3a590e..662f95fbd909 100644
>--- a/drivers/nvme/host/nvme.h
>+++ b/drivers/nvme/host/nvme.h
>@@ -239,6 +239,9 @@ struct nvme_ctrl {
> 	u32 max_hw_sectors;
> 	u32 max_segments;
> 	u32 max_integrity_segments;
>+#ifdef CONFIG_BLK_DEV_ZONED
>+	u32 max_zone_append;
>+#endif
> 	u16 crdt[3];
> 	u16 oncs;
> 	u16 oacs;
>@@ -403,6 +406,9 @@ struct nvme_ns {
> 	u16 sgs;
> 	u32 sws;
> 	u8 pi_type;
>+#ifdef CONFIG_BLK_DEV_ZONED
>+	u64 zsze;
>+#endif
> 	unsigned long features;
> 	unsigned long flags;
> #define NVME_NS_REMOVING	0
>@@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>
> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
> 		void *log, size_t size, u64 offset);
>+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>+		struct nvme_ns_head **head, int *srcu_idx);
>+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>
> extern const struct attribute_group *nvme_ns_id_attr_groups[];
> extern const struct block_device_operations nvme_ns_head_ops;
>@@ -689,6 +698,36 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
> }
> #endif /* CONFIG_NVME_MULTIPATH */
>
>+#ifdef CONFIG_BLK_DEV_ZONED
>+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>+			  unsigned lbaf);
>+
>+int nvme_report_zones(struct gendisk *disk, sector_t sector,
>+		      unsigned int nr_zones, report_zones_cb cb, void *data);
>+
>+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>+				       struct nvme_command *cmnd,
>+				       enum nvme_zone_mgmt_action action);
>+#else
>+#define nvme_report_zones NULL
>+
>+static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
>+		struct request *req, struct nvme_command *cmnd,
>+		enum nvme_zone_mgmt_action action)
>+{
>+	return BLK_STS_NOTSUPP;
>+}
>+
>+static inline int nvme_update_zone_info(struct gendisk *disk,
>+					struct nvme_ns *ns,
>+					unsigned lbaf)
>+{
>+	dev_warn(ns->ctrl->device,
>+		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
>+	return -EPROTONOSUPPORT;
>+}
>+#endif
>+
> #ifdef CONFIG_NVM
> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
> void nvme_nvm_unregister(struct nvme_ns *ns);
>diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>new file mode 100644
>index 000000000000..c08f6281b614
>--- /dev/null
>+++ b/drivers/nvme/host/zns.c
>@@ -0,0 +1,238 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/*
>+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>+ */
>+
>+#include <linux/blkdev.h>
>+#include <linux/vmalloc.h>
>+#include "nvme.h"
>+
>+static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>+{
>+	struct nvme_command c = { };
>+	struct nvme_id_ctrl_zns *id;
>+	int status;
>+
>+	id = kzalloc(sizeof(*id), GFP_KERNEL);
>+	if (!id)
>+		return -ENOMEM;
>+
>+	c.identify.opcode = nvme_admin_identify;
>+	c.identify.cns = NVME_ID_CNS_CS_CTRL;
>+	c.identify.csi = NVME_CSI_ZNS;
>+
>+	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>+	if (status) {
>+		kfree(id);
>+		return status;
>+	}
>+
>+	ctrl->max_zone_append = 1 << (id->zamds + 3);
>+	kfree(id);
>+	return 0;
>+}
>+
>+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>+			  unsigned lbaf)
>+{
>+	struct nvme_effects_log *log = ns->head->effects;
>+	struct request_queue *q = disk->queue;
>+	struct nvme_command c = { };
>+	struct nvme_id_ns_zns *id;
>+	int status;
>+
>+	/* Driver requires zone append support */
>+	if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>+		return -ENODEV;

Following up on the initial comment, this check should go.

>+
>+	/* Lazily query controller append limit for the first zoned namespace */
>+	if (!ns->ctrl->max_zone_append) {
>+		status = nvme_set_max_append(ns->ctrl);
>+		if (status)
>+			return status;
>+	}

This should only be applied if append is supported.
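
Something like this sketch of the suggested reordering (untested):

    /* Probe the controller append limit only when zone append is
     * actually supported. max_zone_append is 1 << (zamds + 3) sectors,
     * i.e. 2^zamds minimum memory pages, assuming a 4K MPSMIN. */
    if (log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP) {
        if (!ns->ctrl->max_zone_append) {
            status = nvme_set_max_append(ns->ctrl);
            if (status)
                return status;
        }
    }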

>+
>+	id = kzalloc(sizeof(*id), GFP_KERNEL);
>+	if (!id)
>+		return -ENOMEM;
>+
>+	c.identify.opcode = nvme_admin_identify;
>+	c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>+	c.identify.cns = NVME_ID_CNS_CS_NS;
>+	c.identify.csi = NVME_CSI_ZNS;
>+
>+	status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
>+	if (status)
>+		goto free_data;
>+
>+	/*
>+	 * We currently do not handle devices requiring any of the zoned
>+	 * operation characteristics.
>+	 */
>+	if (id->zoc) {
>+		status = -EINVAL;
>+		goto free_data;
>+	}

I understand that "Variable Zone Capacity" is not supported as it
requires major changes at this moment, but we should support controllers
that enable "Zone Active Excursions", even when the AER event is not
implemented in this patchset.
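
Assuming the TP4053 ZOC bit assignments (bit 0 = Variable Zone Capacity,
bit 1 = Zone Active Excursions), what I mean is something like the
sketch below, with made-up mask names:

    #define NVME_ZNS_ZOC_VZC	(1 << 0)	/* hypothetical name */
    #define NVME_ZNS_ZOC_ZAE	(1 << 1)	/* hypothetical name */

    /* Reject only variable zone capacity; active excursions could be
     * tolerated once the AER handling is in place. */
    if (le16_to_cpu(id->zoc) & NVME_ZNS_ZOC_VZC) {
        status = -EINVAL;
        goto free_data;
    }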

>+
>+	ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>+	if (!ns->zsze) {
>+		status = -EINVAL;
>+		goto free_data;
>+	}
>+
>+	q->limits.zoned = BLK_ZONED_HM;
>+	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>+free_data:
>+	kfree(id);
>+	return status;
>+}
>+
>+static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>+					  unsigned int nr_zones, size_t *buflen)
>+{
>+	struct request_queue *q = ns->disk->queue;
>+	size_t bufsize;
>+	void *buf;
>+
>+	const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>+				   sizeof(struct nvme_zone_descriptor);
>+
>+	nr_zones = min_t(unsigned int, nr_zones,
>+			 get_capacity(ns->disk) >> ilog2(ns->zsze));
>+
>+	bufsize = sizeof(struct nvme_zone_report) +
>+		nr_zones * sizeof(struct nvme_zone_descriptor);
>+	bufsize = min_t(size_t, bufsize,
>+			queue_max_hw_sectors(q) << SECTOR_SHIFT);
>+	bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
>+
>+	while (bufsize >= min_bufsize) {
>+		buf = __vmalloc(bufsize,
>+				GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>+		if (buf) {
>+			*buflen = bufsize;
>+			return buf;
>+		}
>+		bufsize >>= 1;
>+	}
>+	return NULL;
>+}
>+
>+static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>+				  struct nvme_zone_report *report,
>+				  size_t buflen)
>+{
>+	struct nvme_command c = { };
>+	int ret;
>+
>+	c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>+	c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>+	c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>+	c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>+	c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>+	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>+	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>+
>+	ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>+	if (ret)
>+		return ret;
>+
>+	return le64_to_cpu(report->nr_zones);
>+}
>+
>+static int nvme_zone_parse_entry(struct nvme_ns *ns,
>+				 struct nvme_zone_descriptor *entry,
>+				 unsigned int idx, report_zones_cb cb,
>+				 void *data)
>+{
>+	struct blk_zone zone = { };
>+
>+	if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>+		dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>+				entry->zt);
>+		return -EINVAL;
>+	}
>+
>+	zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>+	zone.cond = entry->zs >> 4;
>+	zone.len = ns->zsze;
>+	zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>+	zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>+	zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>+
>+	return cb(&zone, idx, data);
>+}
>+
>+static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>+			unsigned int nr_zones, report_zones_cb cb, void *data)
>+{
>+	struct nvme_zone_report *report;
>+	int ret, zone_idx = 0;
>+	unsigned int nz, i;
>+	size_t buflen;
>+
>+	report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>+	if (!report)
>+		return -ENOMEM;
>+
>+	sector &= ~(ns->zsze - 1);
>+	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>+		memset(report, 0, buflen);
>+		ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>+		if (ret < 0)
>+			goto out_free;
>+
>+		nz = min_t(unsigned int, ret, nr_zones);
>+		if (!nz)
>+			break;
>+
>+		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>+			ret = nvme_zone_parse_entry(ns, &report->entries[i],
>+						    zone_idx, cb, data);
>+			if (ret)
>+				goto out_free;
>+			zone_idx++;
>+		}
>+
>+		sector += ns->zsze * nz;
>+	}
>+
>+	ret = zone_idx;
>+out_free:
>+	kvfree(report);
>+	return ret;
>+}
>+
>+int nvme_report_zones(struct gendisk *disk, sector_t sector,
>+		      unsigned int nr_zones, report_zones_cb cb, void *data)
>+{
>+	struct nvme_ns_head *head = NULL;
>+	struct nvme_ns *ns;
>+	int srcu_idx, ret;
>+
>+	ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>+	if (unlikely(!ns))
>+		return -EWOULDBLOCK;
>+
>+	if (ns->head->ids.csi == NVME_CSI_ZNS)
>+		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>+	else
>+		ret = -EINVAL;
>+	nvme_put_ns_from_disk(head, srcu_idx);
>+
>+	return ret;
>+}
>+
>+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>+		struct nvme_command *c, enum nvme_zone_mgmt_action action)
>+{
>+	c->zms.opcode = nvme_cmd_zone_mgmt_send;
>+	c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>+	c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>+	c->zms.action = action;
>+
>+	if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>+		c->zms.select = 1;
>+
>+	return BLK_STS_OK;
>+}
>diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>index ea25da572eed..7b3fa7de07bd 100644
>--- a/include/linux/nvme.h
>+++ b/include/linux/nvme.h
>@@ -374,6 +374,30 @@ struct nvme_id_ns {
> 	__u8			vs[3712];
> };
>
>+struct nvme_zns_lbafe {
>+	__le64			zsze;
>+	__u8			zdes;
>+	__u8			rsvd9[7];
>+};
>+
>+struct nvme_id_ns_zns {
>+	__le16			zoc;
>+	__le16			ozcs;
>+	__le32			mar;
>+	__le32			mor;
>+	__le32			rrl;
>+	__le32			frl;
>+	__u8			rsvd20[2796];
>+	struct nvme_zns_lbafe	lbafe[16];
>+	__u8			rsvd3072[768];
>+	__u8			vs[256];
>+};
>+
>+struct nvme_id_ctrl_zns {
>+	__u8	zamds;
>+	__u8	rsvd1[4095];
>+};
>+
> enum {
> 	NVME_ID_CNS_NS			= 0x00,
> 	NVME_ID_CNS_CTRL		= 0x01,
>@@ -392,6 +416,7 @@ enum {
>
> enum {
> 	NVME_CSI_NVM			= 0,
>+	NVME_CSI_ZNS			= 2,
> };
>
> enum {
>@@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
> 	__le16	rsvd10[3];
> };
>
>+struct nvme_zone_descriptor {
>+	__u8		zt;
>+	__u8		zs;
>+	__u8		za;
>+	__u8		rsvd3[5];
>+	__le64		zcap;
>+	__le64		zslba;
>+	__le64		wp;
>+	__u8		rsvd32[32];
>+};
>+
>+enum {
>+	NVME_ZONE_TYPE_SEQWRITE_REQ	= 0x2,
>+};
>+
>+struct nvme_zone_report {
>+	__le64		nr_zones;
>+	__u8		resv8[56];
>+	struct nvme_zone_descriptor entries[];
>+};
>+
> enum {
> 	NVME_SMART_CRIT_SPARE		= 1 << 0,
> 	NVME_SMART_CRIT_TEMPERATURE	= 1 << 1,
>@@ -626,6 +672,9 @@ enum nvme_opcode {
> 	nvme_cmd_resv_report	= 0x0e,
> 	nvme_cmd_resv_acquire	= 0x11,
> 	nvme_cmd_resv_release	= 0x15,
>+	nvme_cmd_zone_mgmt_send	= 0x79,
>+	nvme_cmd_zone_mgmt_recv	= 0x7a,
>+	nvme_cmd_zone_append	= 0x7d,
> };
>
> #define nvme_opcode_name(opcode)	{ opcode, #opcode }
>@@ -764,6 +813,7 @@ struct nvme_rw_command {
> enum {
> 	NVME_RW_LR			= 1 << 15,
> 	NVME_RW_FUA			= 1 << 14,
>+	NVME_RW_APPEND_PIREMAP		= 1 << 9,
> 	NVME_RW_DSM_FREQ_UNSPEC		= 0,
> 	NVME_RW_DSM_FREQ_TYPICAL	= 1,
> 	NVME_RW_DSM_FREQ_RARE		= 2,
>@@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
> 	__le16			appmask;
> };
>
>+enum nvme_zone_mgmt_action {
>+	NVME_ZONE_CLOSE		= 0x1,
>+	NVME_ZONE_FINISH	= 0x2,
>+	NVME_ZONE_OPEN		= 0x3,
>+	NVME_ZONE_RESET		= 0x4,
>+	NVME_ZONE_OFFLINE	= 0x5,
>+	NVME_ZONE_SET_DESC_EXT	= 0x10,
>+};
>+
>+struct nvme_zone_mgmt_send_cmd {
>+	__u8			opcode;
>+	__u8			flags;
>+	__u16			command_id;
>+	__le32			nsid;
>+	__le32			cdw2[2];
>+	__le64			metadata;
>+	union nvme_data_ptr	dptr;
>+	__le64			slba;
>+	__le32			cdw12;
>+	__u8			action;

Why not zsa, to make it easier to match the spec?


>+	__u8			select;

sel_all?

>+	__u8			rsvd13[2];
>+	__le32			cdw14[2];
>+};
>+
>+struct nvme_zone_mgmt_recv_cmd {
>+	__u8			opcode;
>+	__u8			flags;
>+	__u16			command_id;
>+	__le32			nsid;
>+	__le64			rsvd2[2];
>+	union nvme_data_ptr	dptr;
>+	__le64			slba;
>+	__le32			numd;
>+	__u8			zra;
>+	__u8			zrasf;
>+	__u8			pr;

Partial Report is just one bit in the "Zone Receive Action Specific
Features". What about zrasfe?

>+	__u8			rsvd13;
>+	__le32			cdw14[2];
>+};
>+
>+enum {
>+	NVME_ZRA_ZONE_REPORT		= 0,
>+	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
>+	NVME_REPORT_ZONE_PARTIAL	= 1,
>+};
>+
> /* Features */
>
> enum {
>@@ -1300,6 +1397,8 @@ struct nvme_command {
> 		struct nvme_format_cmd format;
> 		struct nvme_dsm_cmd dsm;
> 		struct nvme_write_zeroes_cmd write_zeroes;
>+		struct nvme_zone_mgmt_send_cmd zms;
>+		struct nvme_zone_mgmt_recv_cmd zmr;
> 		struct nvme_abort_cmd abort;
> 		struct nvme_get_log_page_command get_log_page;
> 		struct nvmf_common_command fabrics;
>@@ -1433,6 +1532,18 @@ enum {
> 	NVME_SC_DISCOVERY_RESTART	= 0x190,
> 	NVME_SC_AUTH_REQUIRED		= 0x191,
>
>+	/*
>+	 * I/O Command Set Specific - Zoned commands:
>+	 */
>+	NVME_SC_ZONE_BOUNDARY_ERROR	= 0x1b8,
>+	NVME_SC_ZONE_FULL		= 0x1b9,
>+	NVME_SC_ZONE_READ_ONLY		= 0x1ba,
>+	NVME_SC_ZONE_OFFLINE		= 0x1bb,
>+	NVME_SC_ZONE_INVALID_WRITE	= 0x1bc,
>+	NVME_SC_ZONE_TOO_MANY_ACTIVE	= 0x1bd,
>+	NVME_SC_ZONE_TOO_MANY_OPEN	= 0x1be,
>+	NVME_SC_ZONE_INVALID_TRANSITION	= 0x1bf,
>+
> 	/*
> 	 * Media and Data Integrity Errors:
> 	 */
>-- 
>2.24.1
>
Matias Bjørling June 16, 2020, 11:18 a.m. UTC | #2
On 16/06/2020 12.41, Javier González wrote:
> On 16.06.2020 08:34, Keith Busch wrote:
>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>> Command Set Identifier reported in the namespace's Namespace
>> Identification Descriptor list. A successfully discovered Zoned
>> Namespace will be registered with the block layer as a host-managed
>> zoned block device with Zone Append command support. A namespace that
>> does not support append is not supported by the driver.
>
> Why are we enforcing the append command? Append is optional in the
> current ZNS specification, so we should not make this mandatory in the
> implementation. See specifics below.

There is already general support in the kernel for the zone append 
command. Feel free to submit patches to emulate the support. It is 
outside the scope of this patchset.
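
For reference, the core of such an emulation is serializing writers per
zone and tracking the write pointer on the host side. A conceptual
sketch only, not what the block layer does today:

    struct zone_wp {
        spinlock_t lock;	/* serializes appends to one zone */
        sector_t wp;		/* cached write pointer */
    };

    static sector_t emulated_append_sector(struct zone_wp *z, sector_t nr)
    {
        sector_t start;

        spin_lock(&z->lock);
        start = z->wp;		/* issue the write at the current wp */
        z->wp += nr;		/* advance; roll back on I/O error */
        spin_unlock(&z->lock);
        return start;		/* returned like an append completion LBA */
    }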

>
>> [...]
>>
>> +    /* Driver requires zone append support */
>> +    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>> +        return -ENODEV;
>
> Following up on the initial comment, this check should go.

See first comment.

>
>> +
>> +    /* Lazily query controller append limit for the first zoned namespace */
>> +    if (!ns->ctrl->max_zone_append) {
>> +        status = nvme_set_max_append(ns->ctrl);
>> +        if (status)
>> +            return status;
>> +    }
>
> This should only be applied if append is supported.

See first comment.

>
>> [...]
>>
>> +    /*
>> +     * We currently do not handle devices requiring any of the zoned
>> +     * operation characteristics.
>> +     */
>> +    if (id->zoc) {
>> +        status = -EINVAL;
>> +        goto free_data;
>> +    }
>
> I understand that "Variable Zone Capacity" is not supported as it
> requires major changes at this moment, but we should support controllers
> that enable "Zone Active Excursions", even when the AER event is not
> implemented in this patchset.


NAK. Similarly to VZC, this allows an unsuspecting user to have major 
data loss when a zone is suddenly moved to Full.


>
>> [...]
>>
>> +struct nvme_zone_mgmt_send_cmd {
>> +    __u8            opcode;
>> +    __u8            flags;
>> +    __u16            command_id;
>> +    __le32            nsid;
>> +    __le32            cdw2[2];
>> +    __le64            metadata;
>> +    union nvme_data_ptr    dptr;
>> +    __le64            slba;
>> +    __le32            cdw12;
>> +    __u8            action;
>
> Why not zsa, to make it easier to match the spec?
>
>
>> +    __u8            select;
>
> sel_all?
>
>> +    __u8            rsvd13[2];
>> +    __le32            cdw14[2];
>> +};
>> +
>> +struct nvme_zone_mgmt_recv_cmd {
>> +    __u8            opcode;
>> +    __u8            flags;
>> +    __u16            command_id;
>> +    __le32            nsid;
>> +    __le64            rsvd2[2];
>> +    union nvme_data_ptr    dptr;
>> +    __le64            slba;
>> +    __le32            numd;
>> +    __u8            zra;
>> +    __u8            zrasf;
>> +    __u8            pr;
>
> Partial Report is just one bit in the "Zone Receive Action Specific
> Features". What about zrasfe?

There are currently no users of pr, and bits 1-7 are reserved in the spec.
Users of the pr variable should shift and mask as necessary.

zrasf looks good to me. It is defined as a byte in the spec.
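
E.g. if bits 1-7 gain meanings later, a user would mask within the same
byte, along these lines (sketch):

    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;		/* set bit 0, as here */
    partial = c.zmr.pr & NVME_REPORT_ZONE_PARTIAL;	/* test bit 0 */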

>
>> +    __u8            rsvd13;
>> +    __le32            cdw14[2];
>> +};
>> +
>> +enum {
>> +    NVME_ZRA_ZONE_REPORT        = 0,
>> +    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>> +    NVME_REPORT_ZONE_PARTIAL    = 1,
>> +};
>> +
>> [...]
Javier González June 16, 2020, noon UTC | #3
On 16.06.2020 13:18, Matias Bjørling wrote:
>On 16/06/2020 12.41, Javier González wrote:
>>On 16.06.2020 08:34, Keith Busch wrote:
>>>Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>Command Set Identifier reported in the namespace's Namespace
>>>Identification Descriptor list. A successfully discovered Zoned
>>>Namespace will be registered with the block layer as a host-managed
>>>zoned block device with Zone Append command support. A namespace that
>>>does not support append is not supported by the driver.
>>
>>Why are we enforcing the append command? Append is optional in the
>>current ZNS specification, so we should not make this mandatory in the
>>implementation. See specifics below.

>
>There is already general support in the kernel for the zone append 
>command. Feel free to submit patches to emulate the support. It is 
>outside the scope of this patchset.
>

It is fine that the kernel supports append, but the ZNS specification
does not mandate implementing append, so the driver should not require
it either.

ZNS SSDs that choose to leave append as a non-implemented optional
command should not have to rely on emulated SW support, especially when
traditional writes work just fine for a large part of current ZNS use
cases.

Please remove this artificial constraint.

>>
>>> [...]
>>>        break;
>>>    case REQ_OP_READ:
>>>+        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>>>+        break;
>>>    case REQ_OP_WRITE:
>>>-        ret = nvme_setup_rw(ns, req, cmd);
>>>+        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>>>+        break;
>>>+    case REQ_OP_ZONE_APPEND:
>>>+        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
>>>        break;
>>>    default:
>>>        WARN_ON_ONCE(1);
>>>@@ -1392,14 +1417,23 @@ static u32 nvme_passthru_start(struct 
>>>nvme_ctrl *ctrl, struct nvme_ns *ns,
>>>    return effects;
>>>}
>>>
>>>-static void nvme_update_formats(struct nvme_ctrl *ctrl)
>>>+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
>>>{
>>>    struct nvme_ns *ns;
>>>
>>>    down_read(&ctrl->namespaces_rwsem);
>>>    list_for_each_entry(ns, &ctrl->namespaces, list)
>>>-        if (ns->disk && nvme_revalidate_disk(ns->disk))
>>>+        if (ns->disk && _nvme_revalidate_disk(ns->disk))
>>>            nvme_set_queue_dying(ns);
>>>+        else if (blk_queue_is_zoned(ns->disk->queue)) {
>>>+            /*
>>>+             * IO commands are required to fully revalidate a zoned
>>>+             * device. Force the command effects to trigger rescan
>>>+             * work so report zones can run in a context with
>>>+             * unfrozen IO queues.
>>>+             */
>>>+            *effects |= NVME_CMD_EFFECTS_NCC;
>>>+        }
>>>    up_read(&ctrl->namespaces_rwsem);
>>>}
>>>
>>>@@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct 
>>>nvme_ctrl *ctrl, u32 effects)
>>>     * this command.
>>>     */
>>>    if (effects & NVME_CMD_EFFECTS_LBCC)
>>>-        nvme_update_formats(ctrl);
>>>+        nvme_update_formats(ctrl, &effects);
>>>    if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
>>>        nvme_unfreeze(ctrl);
>>>        nvme_mpath_unfreeze(ctrl->subsys);
>>>@@ -1526,7 +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl 
>>>*ctrl, struct nvme_ns *ns,
>>> * Issue ioctl requests on the first available path.  Note that 
>>>unlike normal
>>> * block layer requests we will not retry failed request on 
>>>another controller.
>>> */
>>>-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>        struct nvme_ns_head **head, int *srcu_idx)
>>>{
>>>#ifdef CONFIG_NVME_MULTIPATH
>>>@@ -1546,7 +1580,7 @@ static struct nvme_ns 
>>>*nvme_get_ns_from_disk(struct gendisk *disk,
>>>    return disk->private_data;
>>>}
>>>
>>>-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>{
>>>    if (head)
>>>        srcu_read_unlock(&head->srcu, idx);
>>>@@ -1939,21 +1973,28 @@ static void nvme_update_disk_info(struct 
>>>gendisk *disk,
>>>
>>>static int __nvme_revalidate_disk(struct gendisk *disk, struct 
>>>nvme_id_ns *id)
>>>{
>>>+    unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
>>>    struct nvme_ns *ns = disk->private_data;
>>>    struct nvme_ctrl *ctrl = ns->ctrl;
>>>+    int ret;
>>>    u32 iob;
>>>
>>>    /*
>>>     * If identify namespace failed, use default 512 byte block size so
>>>     * block layer can use before failing read/write for 0 capacity.
>>>     */
>>>-    ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>>>+    ns->lba_shift = id->lbaf[lbaf].ds;
>>>    if (ns->lba_shift == 0)
>>>        ns->lba_shift = 9;
>>>
>>>    switch (ns->head->ids.csi) {
>>>    case NVME_CSI_NVM:
>>>        break;
>>>+    case NVME_CSI_ZNS:
>>>+        ret = nvme_update_zone_info(disk, ns, lbaf);
>>>+        if (ret)
>>>+            return ret;
>>>+        break;
>>>    default:
>>>        dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
>>>            ns->head->ids.csi, ns->head->ns_id);
>>>@@ -1967,7 +2008,7 @@ static int __nvme_revalidate_disk(struct 
>>>gendisk *disk, struct nvme_id_ns *id)
>>>        iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>>>
>>>    ns->features = 0;
>>>-    ns->ms = le16_to_cpu(id->lbaf[id->flbas & 
>>>NVME_NS_FLBAS_LBA_MASK].ms);
>>>+    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
>>>    /* the PI implementation requires metadata equal t10 pi tuple 
>>>size */
>>>    if (ns->ms == sizeof(struct t10_pi_tuple))
>>>        ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>>>@@ -2010,7 +2051,7 @@ static int __nvme_revalidate_disk(struct 
>>>gendisk *disk, struct nvme_id_ns *id)
>>>    return 0;
>>>}
>>>
>>>-static int nvme_revalidate_disk(struct gendisk *disk)
>>>+static int _nvme_revalidate_disk(struct gendisk *disk)
>>>{
>>>    struct nvme_ns *ns = disk->private_data;
>>>    struct nvme_ctrl *ctrl = ns->ctrl;
>>>@@ -2058,6 +2099,28 @@ static int nvme_revalidate_disk(struct 
>>>gendisk *disk)
>>>    return ret;
>>>}
>>>
>>>+static int nvme_revalidate_disk(struct gendisk *disk)
>>>+{
>>>+    int ret;
>>>+
>>>+    ret = _nvme_revalidate_disk(disk);
>>>+    if (ret)
>>>+        return ret;
>>>+
>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>+    if (blk_queue_is_zoned(disk->queue)) {
>>>+        struct nvme_ns *ns = disk->private_data;
>>>+        struct nvme_ctrl *ctrl = ns->ctrl;
>>>+
>>>+        ret = blk_revalidate_disk_zones(disk, NULL);
>>>+        if (!ret)
>>>+            blk_queue_max_zone_append_sectors(disk->queue,
>>>+                              ctrl->max_zone_append);
>>>+    }
>>>+#endif
>>>+    return ret;
>>>+}
>>>+
>>>static char nvme_pr_type(enum pr_type type)
>>>{
>>>    switch (type) {
>>>@@ -2188,6 +2251,7 @@ static const struct block_device_operations 
>>>nvme_fops = {
>>>    .release    = nvme_release,
>>>    .getgeo        = nvme_getgeo,
>>>    .revalidate_disk= nvme_revalidate_disk,
>>>+    .report_zones    = nvme_report_zones,
>>>    .pr_ops        = &nvme_pr_ops,
>>>};
>>>
>>>@@ -2213,6 +2277,7 @@ const struct block_device_operations 
>>>nvme_ns_head_ops = {
>>>    .ioctl        = nvme_ioctl,
>>>    .compat_ioctl    = nvme_compat_ioctl,
>>>    .getgeo        = nvme_getgeo,
>>>+    .report_zones    = nvme_report_zones,
>>>    .pr_ops        = &nvme_pr_ops,
>>>};
>>>#endif /* CONFIG_NVME_MULTIPATH */
>>>@@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
>>>    BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
>>>    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 
>>>NVME_IDENTIFY_DATA_SIZE);
>>>    BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
>>>+    BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != 
>>>NVME_IDENTIFY_DATA_SIZE);
>>>+    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != 
>>>NVME_IDENTIFY_DATA_SIZE);
>>>    BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>>>    BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
>>>    BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>>>diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>index 58428e3a590e..662f95fbd909 100644
>>>--- a/drivers/nvme/host/nvme.h
>>>+++ b/drivers/nvme/host/nvme.h
>>>@@ -239,6 +239,9 @@ struct nvme_ctrl {
>>>    u32 max_hw_sectors;
>>>    u32 max_segments;
>>>    u32 max_integrity_segments;
>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>+    u32 max_zone_append;
>>>+#endif
>>>    u16 crdt[3];
>>>    u16 oncs;
>>>    u16 oacs;
>>>@@ -403,6 +406,9 @@ struct nvme_ns {
>>>    u16 sgs;
>>>    u32 sws;
>>>    u8 pi_type;
>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>+    u64 zsze;
>>>+#endif
>>>    unsigned long features;
>>>    unsigned long flags;
>>>#define NVME_NS_REMOVING    0
>>>@@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>>>
>>>int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 
>>>lsp, u8 csi,
>>>        void *log, size_t size, u64 offset);
>>>+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>+        struct nvme_ns_head **head, int *srcu_idx);
>>>+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>>>
>>>extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>>extern const struct block_device_operations nvme_ns_head_ops;
>>>@@ -689,6 +698,36 @@ static inline void 
>>>nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>>}
>>>#endif /* CONFIG_NVME_MULTIPATH */
>>>
>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>+              unsigned lbaf);
>>>+
>>>+int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>+              unsigned int nr_zones, report_zones_cb cb, void *data);
>>>+
>>>+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct 
>>>request *req,
>>>+                       struct nvme_command *cmnd,
>>>+                       enum nvme_zone_mgmt_action action);
>>>+#else
>>>+#define nvme_report_zones NULL
>>>+
>>>+static inline blk_status_t nvme_setup_zone_mgmt_send(struct 
>>>nvme_ns *ns,
>>>+        struct request *req, struct nvme_command *cmnd,
>>>+        enum nvme_zone_mgmt_action action)
>>>+{
>>>+    return BLK_STS_NOTSUPP;
>>>+}
>>>+
>>>+static inline int nvme_update_zone_info(struct gendisk *disk,
>>>+                    struct nvme_ns *ns,
>>>+                    unsigned lbaf)
>>>+{
>>>+    dev_warn(ns->ctrl->device,
>>>+         "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
>>>+    return -EPROTONOSUPPORT;
>>>+}
>>>+#endif
>>>+
>>>#ifdef CONFIG_NVM
>>>int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
>>>void nvme_nvm_unregister(struct nvme_ns *ns);
>>>diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>>>new file mode 100644
>>>index 000000000000..c08f6281b614
>>>--- /dev/null
>>>+++ b/drivers/nvme/host/zns.c
>>>@@ -0,0 +1,238 @@
>>>+// SPDX-License-Identifier: GPL-2.0
>>>+/*
>>>+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>>+ */
>>>+
>>>+#include <linux/blkdev.h>
>>>+#include <linux/vmalloc.h>
>>>+#include "nvme.h"
>>>+
>>>+static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>>+{
>>>+    struct nvme_command c = { };
>>>+    struct nvme_id_ctrl_zns *id;
>>>+    int status;
>>>+
>>>+    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>+    if (!id)
>>>+        return -ENOMEM;
>>>+
>>>+    c.identify.opcode = nvme_admin_identify;
>>>+    c.identify.cns = NVME_ID_CNS_CS_CTRL;
>>>+    c.identify.csi = NVME_CSI_ZNS;
>>>+
>>>+    status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>>>+    if (status) {
>>>+        kfree(id);
>>>+        return status;
>>>+    }
>>>+
>>>+    ctrl->max_zone_append = 1 << (id->zamds + 3);
>>>+    kfree(id);
>>>+    return 0;
>>>+}
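
A sanity check on the math here, in case it helps other reviewers:
zamds is a power-of-two exponent in units of the minimum memory page
size, so with a 4 KiB MPSMIN this works out to 1 << (zamds + 3)
512-byte sectors (the +3 being the 4 KiB page to 512-byte sector
conversion). For example, zamds = 5 gives 256 sectors, i.e. 128 KiB.
Note this silently assumes MPSMIN = 0 (4 KiB pages).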
>>>+
>>>+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>+              unsigned lbaf)
>>>+{
>>>+    struct nvme_effects_log *log = ns->head->effects;
>>>+    struct request_queue *q = disk->queue;
>>>+    struct nvme_command c = { };
>>>+    struct nvme_id_ns_zns *id;
>>>+    int status;
>>>+
>>>+    /* Driver requires zone append support */
>>>+    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>>>+        return -ENODEV;
>>
>>Following up on the initial comment, this check should go.
>
>See first comment.

See above and please remove.

>
>>
>>>+
>>>+    /* Lazily query controller append limit for the first zoned 
>>>namespace */
>>>+    if (!ns->ctrl->max_zone_append) {
>>>+        status = nvme_set_max_append(ns->ctrl);
>>>+        if (status)
>>>+            return status;
>>>+    }
>>
>>This should only be applied if append is supported.
>
>See first comment.
>
>>
>>>+
>>>+    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>+    if (!id)
>>>+        return -ENOMEM;
>>>+
>>>+    c.identify.opcode = nvme_admin_identify;
>>>+    c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>>>+    c.identify.cns = NVME_ID_CNS_CS_NS;
>>>+    c.identify.csi = NVME_CSI_ZNS;
>>>+
>>>+    status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, 
>>>sizeof(*id));
>>>+    if (status)
>>>+        goto free_data;
>>>+
>>>+    /*
>>>+     * We currently do not handle devices requiring any of the zoned
>>>+     * operation characteristics.
>>>+     */
>>>+    if (id->zoc) {
>>>+        status = -EINVAL;
>>>+        goto free_data;
>>>+    }
>>
>>I understand that "Variable Zone Capacity" is not supported as it
>>requires major changes at this moment, but we should support controllers
>>that enable "Zone Active Excursions", even when the AER event is not
>>implemented in this patchset.
>
>
>NAK. As with VZC, this allows an unsuspecting user to suffer major 
>data loss when a zone is suddenly moved to Full.

I buy that.
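
For the record, the two ZOC bits in question (the names below are mine,
not from the patch; bit assignments per the ZNS spec):

	enum {
		NVME_ZNS_ZOC_VZC	= 1 << 0,	/* Variable Zone Capacity */
		NVME_ZNS_ZOC_ZAE	= 1 << 1,	/* Zone Active Excursions */
	};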

>
>
>>
>>>+
>>>+    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>>>+    if (!ns->zsze) {
>>>+        status = -EINVAL;
>>>+        goto free_data;
>>>+    }
>>>+
>>>+    q->limits.zoned = BLK_ZONED_HM;
>>>+    blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>>>+free_data:
>>>+    kfree(id);
>>>+    return status;
>>>+}
>>>+
>>>+static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>>+                      unsigned int nr_zones, size_t *buflen)
>>>+{
>>>+    struct request_queue *q = ns->disk->queue;
>>>+    size_t bufsize;
>>>+    void *buf;
>>>+
>>>+    const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>>>+                   sizeof(struct nvme_zone_descriptor);
>>>+
>>>+    nr_zones = min_t(unsigned int, nr_zones,
>>>+             get_capacity(ns->disk) >> ilog2(ns->zsze));
>>>+
>>>+    bufsize = sizeof(struct nvme_zone_report) +
>>>+        nr_zones * sizeof(struct nvme_zone_descriptor);
>>>+    bufsize = min_t(size_t, bufsize,
>>>+            queue_max_hw_sectors(q) << SECTOR_SHIFT);
>>>+    bufsize = min_t(size_t, bufsize, queue_max_segments(q) << 
>>>PAGE_SHIFT);
>>>+
>>>+    while (bufsize >= min_bufsize) {
>>>+        buf = __vmalloc(bufsize,
>>>+                GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>>>+        if (buf) {
>>>+            *buflen = bufsize;
>>>+            return buf;
>>>+        }
>>>+        bufsize >>= 1;
>>>+    }
>>>+    return NULL;
>>>+}
>>>+
>>>+static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>+                  struct nvme_zone_report *report,
>>>+                  size_t buflen)
>>>+{
>>>+    struct nvme_command c = { };
>>>+    int ret;
>>>+
>>>+    c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>>>+    c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>>>+    c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>>>+    c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>>>+    c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>>>+    c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>>+    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>>+
>>>+    ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>>>+    if (ret)
>>>+        return ret;
>>>+
>>>+    return le64_to_cpu(report->nr_zones);
>>>+}
>>>+
>>>+static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>>+                 struct nvme_zone_descriptor *entry,
>>>+                 unsigned int idx, report_zones_cb cb,
>>>+                 void *data)
>>>+{
>>>+    struct blk_zone zone = { };
>>>+
>>>+    if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>>+        dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>>>+                entry->zt);
>>>+        return -EINVAL;
>>>+    }
>>>+
>>>+    zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>>>+    zone.cond = entry->zs >> 4;
>>>+    zone.len = ns->zsze;
>>>+    zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>>+    zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>>>+    zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>>>+
>>>+    return cb(&zone, idx, data);
>>>+}
>>>+
>>>+static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>+            unsigned int nr_zones, report_zones_cb cb, void *data)
>>>+{
>>>+    struct nvme_zone_report *report;
>>>+    int ret, zone_idx = 0;
>>>+    unsigned int nz, i;
>>>+    size_t buflen;
>>>+
>>>+    report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>>>+    if (!report)
>>>+        return -ENOMEM;
>>>+
>>>+    sector &= ~(ns->zsze - 1);
>>>+    while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>>+        memset(report, 0, buflen);
>>>+        ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>>>+        if (ret < 0)
>>>+            goto out_free;
>>>+
>>>+        nz = min_t(unsigned int, ret, nr_zones);
>>>+        if (!nz)
>>>+            break;
>>>+
>>>+        for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>>>+            ret = nvme_zone_parse_entry(ns, &report->entries[i],
>>>+                            zone_idx, cb, data);
>>>+            if (ret)
>>>+                goto out_free;
>>>+            zone_idx++;
>>>+        }
>>>+
>>>+        sector += ns->zsze * nz;
>>>+    }
>>>+
>>>+    ret = zone_idx;
>>>+out_free:
>>>+    kvfree(report);
>>>+    return ret;
>>>+}
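
Worth noting for other readers: the sector &= ~(ns->zsze - 1) rounding
above only works because the block layer requires power-of-2 zone sizes
(blk_revalidate_disk_zones() rejects anything else), which is what
makes zsze usable as a mask here.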
>>>+
>>>+int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>+              unsigned int nr_zones, report_zones_cb cb, void *data)
>>>+{
>>>+    struct nvme_ns_head *head = NULL;
>>>+    struct nvme_ns *ns;
>>>+    int srcu_idx, ret;
>>>+
>>>+    ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>>>+    if (unlikely(!ns))
>>>+        return -EWOULDBLOCK;
>>>+
>>>+    if (ns->head->ids.csi == NVME_CSI_ZNS)
>>>+        ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>>>+    else
>>>+        ret = -EINVAL;
>>>+    nvme_put_ns_from_disk(head, srcu_idx);
>>>+
>>>+    return ret;
>>>+}
>>>+
>>>+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct 
>>>request *req,
>>>+        struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>>+{
>>>+    c->zms.opcode = nvme_cmd_zone_mgmt_send;
>>>+    c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>>>+    c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>+    c->zms.action = action;
>>>+
>>>+    if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>>+        c->zms.select = 1;
>>>+
>>>+    return BLK_STS_OK;
>>>+}
>>>diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>>>index ea25da572eed..7b3fa7de07bd 100644
>>>--- a/include/linux/nvme.h
>>>+++ b/include/linux/nvme.h
>>>@@ -374,6 +374,30 @@ struct nvme_id_ns {
>>>    __u8            vs[3712];
>>>};
>>>
>>>+struct nvme_zns_lbafe {
>>>+    __le64            zsze;
>>>+    __u8            zdes;
>>>+    __u8            rsvd9[7];
>>>+};
>>>+
>>>+struct nvme_id_ns_zns {
>>>+    __le16            zoc;
>>>+    __le16            ozcs;
>>>+    __le32            mar;
>>>+    __le32            mor;
>>>+    __le32            rrl;
>>>+    __le32            frl;
>>>+    __u8            rsvd20[2796];
>>>+    struct nvme_zns_lbafe    lbafe[16];
>>>+    __u8            rsvd3072[768];
>>>+    __u8            vs[256];
>>>+};
>>>+
>>>+struct nvme_id_ctrl_zns {
>>>+    __u8    zamds;
>>>+    __u8    rsvd1[4095];
>>>+};
>>>+
>>>enum {
>>>    NVME_ID_CNS_NS            = 0x00,
>>>    NVME_ID_CNS_CTRL        = 0x01,
>>>@@ -392,6 +416,7 @@ enum {
>>>
>>>enum {
>>>    NVME_CSI_NVM            = 0,
>>>+    NVME_CSI_ZNS            = 2,
>>>};
>>>
>>>enum {
>>>@@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
>>>    __le16    rsvd10[3];
>>>};
>>>
>>>+struct nvme_zone_descriptor {
>>>+    __u8        zt;
>>>+    __u8        zs;
>>>+    __u8        za;
>>>+    __u8        rsvd3[5];
>>>+    __le64        zcap;
>>>+    __le64        zslba;
>>>+    __le64        wp;
>>>+    __u8        rsvd32[32];
>>>+};
>>>+
>>>+enum {
>>>+    NVME_ZONE_TYPE_SEQWRITE_REQ    = 0x2,
>>>+};
>>>+
>>>+struct nvme_zone_report {
>>>+    __le64        nr_zones;
>>>+    __u8        resv8[56];
>>>+    struct nvme_zone_descriptor entries[];
>>>+};
>>>+
>>>enum {
>>>    NVME_SMART_CRIT_SPARE        = 1 << 0,
>>>    NVME_SMART_CRIT_TEMPERATURE    = 1 << 1,
>>>@@ -626,6 +672,9 @@ enum nvme_opcode {
>>>    nvme_cmd_resv_report    = 0x0e,
>>>    nvme_cmd_resv_acquire    = 0x11,
>>>    nvme_cmd_resv_release    = 0x15,
>>>+    nvme_cmd_zone_mgmt_send    = 0x79,
>>>+    nvme_cmd_zone_mgmt_recv    = 0x7a,
>>>+    nvme_cmd_zone_append    = 0x7d,
>>>};
>>>
>>>#define nvme_opcode_name(opcode)    { opcode, #opcode }
>>>@@ -764,6 +813,7 @@ struct nvme_rw_command {
>>>enum {
>>>    NVME_RW_LR            = 1 << 15,
>>>    NVME_RW_FUA            = 1 << 14,
>>>+    NVME_RW_APPEND_PIREMAP        = 1 << 9,
>>>    NVME_RW_DSM_FREQ_UNSPEC        = 0,
>>>    NVME_RW_DSM_FREQ_TYPICAL    = 1,
>>>    NVME_RW_DSM_FREQ_RARE        = 2,
>>>@@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
>>>    __le16            appmask;
>>>};
>>>
>>>+enum nvme_zone_mgmt_action {
>>>+    NVME_ZONE_CLOSE        = 0x1,
>>>+    NVME_ZONE_FINISH    = 0x2,
>>>+    NVME_ZONE_OPEN        = 0x3,
>>>+    NVME_ZONE_RESET        = 0x4,
>>>+    NVME_ZONE_OFFLINE    = 0x5,
>>>+    NVME_ZONE_SET_DESC_EXT    = 0x10,
>>>+};
>>>+
>>>+struct nvme_zone_mgmt_send_cmd {
>>>+    __u8            opcode;
>>>+    __u8            flags;
>>>+    __u16            command_id;
>>>+    __le32            nsid;
>>>+    __le32            cdw2[2];
>>>+    __le64            metadata;
>>>+    union nvme_data_ptr    dptr;
>>>+    __le64            slba;
>>>+    __le32            cdw12;
>>>+    __u8            action;
>>
>>Why not zsa, to make it easier to match to the spec?
>>
>>
>>>+    __u8            select;
>>
>>sel_all?
>>
>>>+    __u8            rsvd13[2];
>>>+    __le32            cdw14[2];
>>>+};
>>>+
>>>+struct nvme_zone_mgmt_recv_cmd {
>>>+    __u8            opcode;
>>>+    __u8            flags;
>>>+    __u16            command_id;
>>>+    __le32            nsid;
>>>+    __le64            rsvd2[2];
>>>+    union nvme_data_ptr    dptr;
>>>+    __le64            slba;
>>>+    __le32            numd;
>>>+    __u8            zra;
>>>+    __u8            zrasf;
>>>+    __u8            pr;
>>
>>Partial Report is just one bit in the "Zone Receive Action Specific
>>Features". What about zrasfe?
>
>There are currently no users of pr, and bits 1-7 are reserved in the spec. 
>Users of the pr variable should shift and mask as necessary.
>
>zrasf looks good to me. It is defined as a byte in the spec.

I meant for the pr variable name. Agree with the rest.
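
For what it's worth, a caller consuming that byte would then shift and
mask as you describe, e.g. (a sketch, reusing the
NVME_REPORT_ZONE_PARTIAL value defined later in the patch):

	if (c.zmr.pr & NVME_REPORT_ZONE_PARTIAL)
		;	/* report->nr_zones counts only the returned entries */

A name like zrasfe would at least make that masking explicit at the
call site.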

>
>>
>>>+    __u8            rsvd13;
>>>+    __le32            cdw14[2];
>>>+};
>>>+
>>>+enum {
>>>+    NVME_ZRA_ZONE_REPORT        = 0,
>>>+    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>>>+    NVME_REPORT_ZONE_PARTIAL    = 1,
>>>+};
>>>+
>>>/* Features */
>>>
>>>enum {
>>>@@ -1300,6 +1397,8 @@ struct nvme_command {
>>>        struct nvme_format_cmd format;
>>>        struct nvme_dsm_cmd dsm;
>>>        struct nvme_write_zeroes_cmd write_zeroes;
>>>+        struct nvme_zone_mgmt_send_cmd zms;
>>>+        struct nvme_zone_mgmt_recv_cmd zmr;
>>>        struct nvme_abort_cmd abort;
>>>        struct nvme_get_log_page_command get_log_page;
>>>        struct nvmf_common_command fabrics;
>>>@@ -1433,6 +1532,18 @@ enum {
>>>    NVME_SC_DISCOVERY_RESTART    = 0x190,
>>>    NVME_SC_AUTH_REQUIRED        = 0x191,
>>>
>>>+    /*
>>>+     * I/O Command Set Specific - Zoned commands:
>>>+     */
>>>+    NVME_SC_ZONE_BOUNDARY_ERROR    = 0x1b8,
>>>+    NVME_SC_ZONE_FULL        = 0x1b9,
>>>+    NVME_SC_ZONE_READ_ONLY        = 0x1ba,
>>>+    NVME_SC_ZONE_OFFLINE        = 0x1bb,
>>>+    NVME_SC_ZONE_INVALID_WRITE    = 0x1bc,
>>>+    NVME_SC_ZONE_TOO_MANY_ACTIVE    = 0x1bd,
>>>+    NVME_SC_ZONE_TOO_MANY_OPEN    = 0x1be,
>>>+    NVME_SC_ZONE_INVALID_TRANSITION    = 0x1bf,
>>>+
>>>    /*
>>>     * Media and Data Integrity Errors:
>>>     */
>>>-- 
>>>2.24.1
>>>
>
Matias Bjorling June 16, 2020, 12:06 p.m. UTC | #4
On 16/06/2020 14.00, Javier González wrote:
> On 16.06.2020 13:18, Matias Bjørling wrote:
>> On 16/06/2020 12.41, Javier González wrote:
>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>> Command Set Identifier reported in the namespaces Namespace
>>>> Identification Descriptor list. A successfully discovered Zoned
>>>> Namespace will be registered with the block layer as a host managed
>>>> zoned block device with Zone Append command support. A namespace that
>>>> does not support append is not supported by the driver.
>>>
>>> Why are we enforcing the append command? Append is optional on the
>>> current ZNS specification, so we should not make this mandatory in the
>>> implementation. See specifics below.
>
>>
>> There is already general support in the kernel for the zone append 
>> command. Feel free to submit patches to emulate the support. It is 
>> outside the scope of this patchset.
>>
>
> It is fine that the kernel supports append, but the ZNS specification
> does not impose the implementation for append, so the driver should not
> do that either.
>
> ZNS SSDs that choose to leave append as a non-implemented optional
> command should not rely on emulated SW support, specially when
> traditional writes work very fine for a large part of current ZNS use
> cases.
>
> Please remove this artificial constraint.

The Zone Append command is mandatory for zoned block devices. Please see 
https://lwn.net/Articles/818709/ for the background. Please submit 
patches if you want to have support for ZNS devices that do not 
implement the Zone Append command. It is outside the scope of this patchset.
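
For background, the reason the zoned block layer leans on it: an append
bio targets the zone start and gets the actual write location back on
completion, so multiple writers need no zone write lock. Roughly:

	bio->bi_opf = REQ_OP_ZONE_APPEND;
	bio->bi_iter.bi_sector = zone_start_sector;	/* zone to append to */
	submit_bio(bio);
	/* on completion, bio->bi_iter.bi_sector holds where the data landed */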

>
>>>
>>>>
>>>> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
>>>> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
>>>> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
>>>> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
>>>> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
>>>> Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
>>>> Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
>>>> Signed-off-by: Keith Busch <keith.busch@wdc.com>
>>>> ---
>>>> drivers/nvme/host/Makefile |   1 +
>>>> drivers/nvme/host/core.c   |  91 ++++++++++++--
>>>> drivers/nvme/host/nvme.h   |  39 ++++++
>>>> drivers/nvme/host/zns.c    | 238 +++++++++++++++++++++++++++++++++++++
>>>> include/linux/nvme.h       | 111 +++++++++++++++++
>>>> 5 files changed, 468 insertions(+), 12 deletions(-)
>>>> create mode 100644 drivers/nvme/host/zns.c
>>>>
>>>> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>>>> index fc7b26be692d..d7f6a87687b8 100644
>>>> --- a/drivers/nvme/host/Makefile
>>>> +++ b/drivers/nvme/host/Makefile
>>>> @@ -13,6 +13,7 @@ nvme-core-y                := core.o
>>>> nvme-core-$(CONFIG_TRACING)        += trace.o
>>>> nvme-core-$(CONFIG_NVME_MULTIPATH)    += multipath.o
>>>> nvme-core-$(CONFIG_NVM)            += lightnvm.o
>>>> +nvme-core-$(CONFIG_BLK_DEV_ZONED)    += zns.o
>>>> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)    += fault_inject.o
>>>> nvme-core-$(CONFIG_NVME_HWMON)        += hwmon.o
>>>>
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index 58f137b9f2c5..e961910da4ac 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
>>>> static struct class *nvme_class;
>>>> static struct class *nvme_subsys_class;
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk);
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk);
>>>> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
>>>> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>>>>                        unsigned nsid);
>>>> @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
>>>>             nvme_retry_req(req);
>>>>             return;
>>>>         }
>>>> +    } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>>>> +           req_op(req) == REQ_OP_ZONE_APPEND) {
>>>> +        req->__sector = nvme_lba_to_sect(req->q->queuedata,
>>>> +            le64_to_cpu(nvme_req(req)->result.u64));
>>>>     }
>>>>
>>>>     nvme_trace_bio_complete(req, status);
>>>> @@ -673,7 +677,8 @@ static inline blk_status_t 
>>>> nvme_setup_write_zeroes(struct nvme_ns *ns,
>>>> }
>>>>
>>>> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>>> -        struct request *req, struct nvme_command *cmnd)
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_opcode op)
>>>> {
>>>>     struct nvme_ctrl *ctrl = ns->ctrl;
>>>>     u16 control = 0;
>>>> @@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct 
>>>> nvme_ns *ns,
>>>>     if (req->cmd_flags & REQ_RAHEAD)
>>>>         dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>>>
>>>> -    cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : 
>>>> nvme_cmd_read);
>>>> +    cmnd->rw.opcode = op;
>>>>     cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
>>>>     cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, 
>>>> blk_rq_pos(req)));
>>>>     cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> 
>>>> ns->lba_shift) - 1);
>>>> @@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct 
>>>> nvme_ns *ns,
>>>>         case NVME_NS_DPS_PI_TYPE2:
>>>>             control |= NVME_RW_PRINFO_PRCHK_GUARD |
>>>>                     NVME_RW_PRINFO_PRCHK_REF;
>>>> +            if (op == nvme_cmd_zone_append)
>>>> +                control |= NVME_RW_APPEND_PIREMAP;
>>>>             cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
>>>>             break;
>>>>         }
>>>> @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns 
>>>> *ns, struct request *req,
>>>>     case REQ_OP_FLUSH:
>>>>         nvme_setup_flush(ns, cmd);
>>>>         break;
>>>> +    case REQ_OP_ZONE_RESET_ALL:
>>>> +    case REQ_OP_ZONE_RESET:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, 
>>>> NVME_ZONE_RESET);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_OPEN:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, 
>>>> NVME_ZONE_OPEN);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_CLOSE:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, 
>>>> NVME_ZONE_CLOSE);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_FINISH:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, 
>>>> NVME_ZONE_FINISH);
>>>> +        break;
>>>>     case REQ_OP_WRITE_ZEROES:
>>>>         ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>         break;
>>>> @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns 
>>>> *ns, struct request *req,
>>>>         ret = nvme_setup_discard(ns, req, cmd);
>>>>         break;
>>>>     case REQ_OP_READ:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>>>> +        break;
>>>>     case REQ_OP_WRITE:
>>>> -        ret = nvme_setup_rw(ns, req, cmd);
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_APPEND:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
>>>>         break;
>>>>     default:
>>>>         WARN_ON_ONCE(1);
>>>> @@ -1392,14 +1417,23 @@ static u32 nvme_passthru_start(struct 
>>>> nvme_ctrl *ctrl, struct nvme_ns *ns,
>>>>     return effects;
>>>> }
>>>>
>>>> -static void nvme_update_formats(struct nvme_ctrl *ctrl)
>>>> +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
>>>> {
>>>>     struct nvme_ns *ns;
>>>>
>>>>     down_read(&ctrl->namespaces_rwsem);
>>>>     list_for_each_entry(ns, &ctrl->namespaces, list)
>>>> -        if (ns->disk && nvme_revalidate_disk(ns->disk))
>>>> +        if (ns->disk && _nvme_revalidate_disk(ns->disk))
>>>>             nvme_set_queue_dying(ns);
>>>> +        else if (blk_queue_is_zoned(ns->disk->queue)) {
>>>> +            /*
>>>> +             * IO commands are required to fully revalidate a zoned
>>>> +             * device. Force the command effects to trigger rescan
>>>> +             * work so report zones can run in a context with
>>>> +             * unfrozen IO queues.
>>>> +             */
>>>> +            *effects |= NVME_CMD_EFFECTS_NCC;
>>>> +        }
>>>>     up_read(&ctrl->namespaces_rwsem);
>>>> }
>>>>
>>>> @@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct 
>>>> nvme_ctrl *ctrl, u32 effects)
>>>>      * this command.
>>>>      */
>>>>     if (effects & NVME_CMD_EFFECTS_LBCC)
>>>> -        nvme_update_formats(ctrl);
>>>> +        nvme_update_formats(ctrl, &effects);
>>>>     if (effects & (NVME_CMD_EFFECTS_LBCC | 
>>>> NVME_CMD_EFFECTS_CSE_MASK)) {
>>>>         nvme_unfreeze(ctrl);
>>>>         nvme_mpath_unfreeze(ctrl->subsys);
>>>> @@ -1526,7 +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl 
>>>> *ctrl, struct nvme_ns *ns,
>>>>  * Issue ioctl requests on the first available path.  Note that 
>>>> unlike normal
>>>>  * block layer requests we will not retry failed request on another 
>>>> controller.
>>>>  */
>>>> -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>         struct nvme_ns_head **head, int *srcu_idx)
>>>> {
>>>> #ifdef CONFIG_NVME_MULTIPATH
>>>> @@ -1546,7 +1580,7 @@ static struct nvme_ns 
>>>> *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>     return disk->private_data;
>>>> }
>>>>
>>>> -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> {
>>>>     if (head)
>>>>         srcu_read_unlock(&head->srcu, idx);
>>>> @@ -1939,21 +1973,28 @@ static void nvme_update_disk_info(struct 
>>>> gendisk *disk,
>>>>
>>>> static int __nvme_revalidate_disk(struct gendisk *disk, struct 
>>>> nvme_id_ns *id)
>>>> {
>>>> +    unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
>>>>     struct nvme_ns *ns = disk->private_data;
>>>>     struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +    int ret;
>>>>     u32 iob;
>>>>
>>>>     /*
>>>>      * If identify namespace failed, use default 512 byte block 
>>>> size so
>>>>      * block layer can use before failing read/write for 0 capacity.
>>>>      */
>>>> -    ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>>>> +    ns->lba_shift = id->lbaf[lbaf].ds;
>>>>     if (ns->lba_shift == 0)
>>>>         ns->lba_shift = 9;
>>>>
>>>>     switch (ns->head->ids.csi) {
>>>>     case NVME_CSI_NVM:
>>>>         break;
>>>> +    case NVME_CSI_ZNS:
>>>> +        ret = nvme_update_zone_info(disk, ns, lbaf);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +        break;
>>>>     default:
>>>>         dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
>>>>             ns->head->ids.csi, ns->head->ns_id);
>>>> @@ -1967,7 +2008,7 @@ static int __nvme_revalidate_disk(struct 
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>         iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>>>>
>>>>     ns->features = 0;
>>>> -    ns->ms = le16_to_cpu(id->lbaf[id->flbas & 
>>>> NVME_NS_FLBAS_LBA_MASK].ms);
>>>> +    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
>>>>     /* the PI implementation requires metadata equal t10 pi tuple 
>>>> size */
>>>>     if (ns->ms == sizeof(struct t10_pi_tuple))
>>>>         ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>>>> @@ -2010,7 +2051,7 @@ static int __nvme_revalidate_disk(struct 
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>     return 0;
>>>> }
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk)
>>>> {
>>>>     struct nvme_ns *ns = disk->private_data;
>>>>     struct nvme_ctrl *ctrl = ns->ctrl;
>>>> @@ -2058,6 +2099,28 @@ static int nvme_revalidate_disk(struct 
>>>> gendisk *disk)
>>>>     return ret;
>>>> }
>>>>
>>>> +static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    ret = _nvme_revalidate_disk(disk);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    if (blk_queue_is_zoned(disk->queue)) {
>>>> +        struct nvme_ns *ns = disk->private_data;
>>>> +        struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +
>>>> +        ret = blk_revalidate_disk_zones(disk, NULL);
>>>> +        if (!ret)
>>>> +            blk_queue_max_zone_append_sectors(disk->queue,
>>>> +                              ctrl->max_zone_append);
>>>> +    }
>>>> +#endif
>>>> +    return ret;
>>>> +}
>>>> +
>>>> static char nvme_pr_type(enum pr_type type)
>>>> {
>>>>     switch (type) {
>>>> @@ -2188,6 +2251,7 @@ static const struct block_device_operations 
>>>> nvme_fops = {
>>>>     .release    = nvme_release,
>>>>     .getgeo        = nvme_getgeo,
>>>>     .revalidate_disk= nvme_revalidate_disk,
>>>> +    .report_zones    = nvme_report_zones,
>>>>     .pr_ops        = &nvme_pr_ops,
>>>> };
>>>>
>>>> @@ -2213,6 +2277,7 @@ const struct block_device_operations 
>>>> nvme_ns_head_ops = {
>>>>     .ioctl        = nvme_ioctl,
>>>>     .compat_ioctl    = nvme_compat_ioctl,
>>>>     .getgeo        = nvme_getgeo,
>>>> +    .report_zones    = nvme_report_zones,
>>>>     .pr_ops        = &nvme_pr_ops,
>>>> };
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>> @@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
>>>>     BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
>>>>     BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>     BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != 
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != 
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>     BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>>>>     BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
>>>>     BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>> index 58428e3a590e..662f95fbd909 100644
>>>> --- a/drivers/nvme/host/nvme.h
>>>> +++ b/drivers/nvme/host/nvme.h
>>>> @@ -239,6 +239,9 @@ struct nvme_ctrl {
>>>>     u32 max_hw_sectors;
>>>>     u32 max_segments;
>>>>     u32 max_integrity_segments;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u32 max_zone_append;
>>>> +#endif
>>>>     u16 crdt[3];
>>>>     u16 oncs;
>>>>     u16 oacs;
>>>> @@ -403,6 +406,9 @@ struct nvme_ns {
>>>>     u16 sgs;
>>>>     u32 sws;
>>>>     u8 pi_type;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u64 zsze;
>>>> +#endif
>>>>     unsigned long features;
>>>>     unsigned long flags;
>>>> #define NVME_NS_REMOVING    0
>>>> @@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>>>>
>>>> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 
>>>> lsp, u8 csi,
>>>>         void *log, size_t size, u64 offset);
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +        struct nvme_ns_head **head, int *srcu_idx);
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>>>>
>>>> extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>>> extern const struct block_device_operations nvme_ns_head_ops;
>>>> @@ -689,6 +698,36 @@ static inline void 
>>>> nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>>> }
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>>
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf);
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data);
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct 
>>>> request *req,
>>>> +                       struct nvme_command *cmnd,
>>>> +                       enum nvme_zone_mgmt_action action);
>>>> +#else
>>>> +#define nvme_report_zones NULL
>>>> +
>>>> +static inline blk_status_t nvme_setup_zone_mgmt_send(struct 
>>>> nvme_ns *ns,
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    return BLK_STS_NOTSUPP;
>>>> +}
>>>> +
>>>> +static inline int nvme_update_zone_info(struct gendisk *disk,
>>>> +                    struct nvme_ns *ns,
>>>> +                    unsigned lbaf)
>>>> +{
>>>> +    dev_warn(ns->ctrl->device,
>>>> +         "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
>>>> +    return -EPROTONOSUPPORT;
>>>> +}
>>>> +#endif
>>>> +
>>>> #ifdef CONFIG_NVM
>>>> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
>>>> void nvme_nvm_unregister(struct nvme_ns *ns);
>>>> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>>>> new file mode 100644
>>>> index 000000000000..c08f6281b614
>>>> --- /dev/null
>>>> +++ b/drivers/nvme/host/zns.c
>>>> @@ -0,0 +1,238 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>>> + */
>>>> +
>>>> +#include <linux/blkdev.h>
>>>> +#include <linux/vmalloc.h>
>>>> +#include "nvme.h"
>>>> +
>>>> +static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ctrl_zns *id;
>>>> +    int status;
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.cns = NVME_ID_CNS_CS_CTRL;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, 
>>>> sizeof(*id));
>>>> +    if (status) {
>>>> +        kfree(id);
>>>> +        return status;
>>>> +    }
>>>> +
>>>> +    ctrl->max_zone_append = 1 << (id->zamds + 3);
>>>> +    kfree(id);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf)
>>>> +{
>>>> +    struct nvme_effects_log *log = ns->head->effects;
>>>> +    struct request_queue *q = disk->queue;
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ns_zns *id;
>>>> +    int status;
>>>> +
>>>> +    /* Driver requires zone append support */
>>>> +    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>>>> +        return -ENODEV;
>>>
>>> Following up on the initial comment, this check should go.
>>
>> See first comment.
>
> See above and please remove.

Please send patches.

>>
>>>
>>>> +
>>>> +    /* Lazily query controller append limit for the first zoned 
>>>> namespace */
>>>> +    if (!ns->ctrl->max_zone_append) {
>>>> +        status = nvme_set_max_append(ns->ctrl);
>>>> +        if (status)
>>>> +            return status;
>>>> +    }
>>>
>>> This should only be applied if append is supported.
>>
>> See first comment.
>>
>>>
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.identify.cns = NVME_ID_CNS_CS_NS;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, 
>>>> sizeof(*id));
>>>> +    if (status)
>>>> +        goto free_data;
>>>> +
>>>> +    /*
>>>> +     * We currently do not handle devices requiring any of the zoned
>>>> +     * operation characteristics.
>>>> +     */
>>>> +    if (id->zoc) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>>
>>> I understand that "Variable Zone Capacity" is not supported as it
>>> requires major changes at this moment, but we should support 
>>> controllers
>>> that enable "Zone Active Excursions", even when the AER event is not
>>> implemented in this patchset.
>>
>>
>> NAK. As with VZC, this allows an unsuspecting user to suffer major 
>> data loss when a zone is suddenly moved to Full.
>
> I buy that.
>
>>
>>
>>>
>>>> +
>>>> +    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>>>> +    if (!ns->zsze) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>>> +
>>>> +    q->limits.zoned = BLK_ZONED_HM;
>>>> +    blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>>>> +free_data:
>>>> +    kfree(id);
>>>> +    return status;
>>>> +}
>>>> +
>>>> +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>>> +                      unsigned int nr_zones, size_t *buflen)
>>>> +{
>>>> +    struct request_queue *q = ns->disk->queue;
>>>> +    size_t bufsize;
>>>> +    void *buf;
>>>> +
>>>> +    const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>>>> +                   sizeof(struct nvme_zone_descriptor);
>>>> +
>>>> +    nr_zones = min_t(unsigned int, nr_zones,
>>>> +             get_capacity(ns->disk) >> ilog2(ns->zsze));
>>>> +
>>>> +    bufsize = sizeof(struct nvme_zone_report) +
>>>> +        nr_zones * sizeof(struct nvme_zone_descriptor);
>>>> +    bufsize = min_t(size_t, bufsize,
>>>> +            queue_max_hw_sectors(q) << SECTOR_SHIFT);
>>>> +    bufsize = min_t(size_t, bufsize, queue_max_segments(q) << 
>>>> PAGE_SHIFT);
>>>> +
>>>> +    while (bufsize >= min_bufsize) {
>>>> +        buf = __vmalloc(bufsize,
>>>> +                GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>>>> +        if (buf) {
>>>> +            *buflen = bufsize;
>>>> +            return buf;
>>>> +        }
>>>> +        bufsize >>= 1;
>>>> +    }
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t 
>>>> sector,
>>>> +                  struct nvme_zone_report *report,
>>>> +                  size_t buflen)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    int ret;
>>>> +
>>>> +    c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>>>> +    c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>>>> +    c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>>>> +    c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>>>> +    c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>>> +    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>>> +
>>>> +    ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    return le64_to_cpu(report->nr_zones);
>>>> +}
>>>> +
>>>> +static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>>> +                 struct nvme_zone_descriptor *entry,
>>>> +                 unsigned int idx, report_zones_cb cb,
>>>> +                 void *data)
>>>> +{
>>>> +    struct blk_zone zone = { };
>>>> +
>>>> +    if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>>> +        dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>>>> +                entry->zt);
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>>>> +    zone.cond = entry->zs >> 4;
>>>> +    zone.len = ns->zsze;
>>>> +    zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>>> +    zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>>>> +    zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>>>> +
>>>> +    return cb(&zone, idx, data);
>>>> +}
>>>> +
>>>> +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +            unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_zone_report *report;
>>>> +    int ret, zone_idx = 0;
>>>> +    unsigned int nz, i;
>>>> +    size_t buflen;
>>>> +
>>>> +    report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>>>> +    if (!report)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    sector &= ~(ns->zsze - 1);
>>>> +    while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>>> +        memset(report, 0, buflen);
>>>> +        ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>>>> +        if (ret < 0)
>>>> +            goto out_free;
>>>> +
>>>> +        nz = min_t(unsigned int, ret, nr_zones);
>>>> +        if (!nz)
>>>> +            break;
>>>> +
>>>> +        for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>>>> +            ret = nvme_zone_parse_entry(ns, &report->entries[i],
>>>> +                            zone_idx, cb, data);
>>>> +            if (ret)
>>>> +                goto out_free;
>>>> +            zone_idx++;
>>>> +        }
>>>> +
>>>> +        sector += ns->zsze * nz;
>>>> +    }
>>>> +
>>>> +    ret = zone_idx;
>>>> +out_free:
>>>> +    kvfree(report);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_ns_head *head = NULL;
>>>> +    struct nvme_ns *ns;
>>>> +    int srcu_idx, ret;
>>>> +
>>>> +    ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>>>> +    if (unlikely(!ns))
>>>> +        return -EWOULDBLOCK;
>>>> +
>>>> +    if (ns->head->ids.csi == NVME_CSI_ZNS)
>>>> +        ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>>>> +    else
>>>> +        ret = -EINVAL;
>>>> +    nvme_put_ns_from_disk(head, srcu_idx);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct 
>>>> request *req,
>>>> +        struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    c->zms.opcode = nvme_cmd_zone_mgmt_send;
>>>> +    c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>> +    c->zms.action = action;
>>>> +
>>>> +    if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>>> +        c->zms.select = 1;
>>>> +
>>>> +    return BLK_STS_OK;
>>>> +}
>>>> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>>>> index ea25da572eed..7b3fa7de07bd 100644
>>>> --- a/include/linux/nvme.h
>>>> +++ b/include/linux/nvme.h
>>>> @@ -374,6 +374,30 @@ struct nvme_id_ns {
>>>>     __u8            vs[3712];
>>>> };
>>>>
>>>> +struct nvme_zns_lbafe {
>>>> +    __le64            zsze;
>>>> +    __u8            zdes;
>>>> +    __u8            rsvd9[7];
>>>> +};
>>>> +
>>>> +struct nvme_id_ns_zns {
>>>> +    __le16            zoc;
>>>> +    __le16            ozcs;
>>>> +    __le32            mar;
>>>> +    __le32            mor;
>>>> +    __le32            rrl;
>>>> +    __le32            frl;
>>>> +    __u8            rsvd20[2796];
>>>> +    struct nvme_zns_lbafe    lbafe[16];
>>>> +    __u8            rsvd3072[768];
>>>> +    __u8            vs[256];
>>>> +};
>>>> +
>>>> +struct nvme_id_ctrl_zns {
>>>> +    __u8    zamds;
>>>> +    __u8    rsvd1[4095];
>>>> +};
>>>> +
>>>> enum {
>>>>     NVME_ID_CNS_NS            = 0x00,
>>>>     NVME_ID_CNS_CTRL        = 0x01,
>>>> @@ -392,6 +416,7 @@ enum {
>>>>
>>>> enum {
>>>>     NVME_CSI_NVM            = 0,
>>>> +    NVME_CSI_ZNS            = 2,
>>>> };
>>>>
>>>> enum {
>>>> @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
>>>>     __le16    rsvd10[3];
>>>> };
>>>>
>>>> +struct nvme_zone_descriptor {
>>>> +    __u8        zt;
>>>> +    __u8        zs;
>>>> +    __u8        za;
>>>> +    __u8        rsvd3[5];
>>>> +    __le64        zcap;
>>>> +    __le64        zslba;
>>>> +    __le64        wp;
>>>> +    __u8        rsvd32[32];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZONE_TYPE_SEQWRITE_REQ    = 0x2,
>>>> +};
>>>> +
>>>> +struct nvme_zone_report {
>>>> +    __le64        nr_zones;
>>>> +    __u8        resv8[56];
>>>> +    struct nvme_zone_descriptor entries[];
>>>> +};
>>>> +
>>>> enum {
>>>>     NVME_SMART_CRIT_SPARE        = 1 << 0,
>>>>     NVME_SMART_CRIT_TEMPERATURE    = 1 << 1,
>>>> @@ -626,6 +672,9 @@ enum nvme_opcode {
>>>>     nvme_cmd_resv_report    = 0x0e,
>>>>     nvme_cmd_resv_acquire    = 0x11,
>>>>     nvme_cmd_resv_release    = 0x15,
>>>> +    nvme_cmd_zone_mgmt_send    = 0x79,
>>>> +    nvme_cmd_zone_mgmt_recv    = 0x7a,
>>>> +    nvme_cmd_zone_append    = 0x7d,
>>>> };
>>>>
>>>> #define nvme_opcode_name(opcode)    { opcode, #opcode }
>>>> @@ -764,6 +813,7 @@ struct nvme_rw_command {
>>>> enum {
>>>>     NVME_RW_LR            = 1 << 15,
>>>>     NVME_RW_FUA            = 1 << 14,
>>>> +    NVME_RW_APPEND_PIREMAP        = 1 << 9,
>>>>     NVME_RW_DSM_FREQ_UNSPEC        = 0,
>>>>     NVME_RW_DSM_FREQ_TYPICAL    = 1,
>>>>     NVME_RW_DSM_FREQ_RARE        = 2,
>>>> @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
>>>>     __le16            appmask;
>>>> };
>>>>
>>>> +enum nvme_zone_mgmt_action {
>>>> +    NVME_ZONE_CLOSE        = 0x1,
>>>> +    NVME_ZONE_FINISH    = 0x2,
>>>> +    NVME_ZONE_OPEN        = 0x3,
>>>> +    NVME_ZONE_RESET        = 0x4,
>>>> +    NVME_ZONE_OFFLINE    = 0x5,
>>>> +    NVME_ZONE_SET_DESC_EXT    = 0x10,
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_send_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le32            cdw2[2];
>>>> +    __le64            metadata;
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            cdw12;
>>>> +    __u8            action;
>>>
>>> Why not zsa, to make it easier to match to the spec?
>>>
>>>
>>>> +    __u8            select;
>>>
>>> sel_all?
>>>
>>>> +    __u8            rsvd13[2];
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_recv_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le64            rsvd2[2];
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            numd;
>>>> +    __u8            zra;
>>>> +    __u8            zrasf;
>>>> +    __u8            pr;
>>>
>>> Partial Report is just one bit in the "Zone Receive Action Specific
>>> Features". What about zrasfe?
>>
>> There are currently no users of pr, and bits 1-7 are reserved in the spec. 
>> Users of the pr variable should shift and mask as necessary.
>>
>> zrasf looks good to me. It is defined as a byte in the spec.
>
> I meant for the pr variable name. Agree with the rest.
>
>>
>>>
>>>> +    __u8            rsvd13;
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZRA_ZONE_REPORT        = 0,
>>>> +    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>>>> +    NVME_REPORT_ZONE_PARTIAL    = 1,
>>>> +};
>>>> +
>>>> /* Features */
>>>>
>>>> enum {
>>>> @@ -1300,6 +1397,8 @@ struct nvme_command {
>>>>         struct nvme_format_cmd format;
>>>>         struct nvme_dsm_cmd dsm;
>>>>         struct nvme_write_zeroes_cmd write_zeroes;
>>>> +        struct nvme_zone_mgmt_send_cmd zms;
>>>> +        struct nvme_zone_mgmt_recv_cmd zmr;
>>>>         struct nvme_abort_cmd abort;
>>>>         struct nvme_get_log_page_command get_log_page;
>>>>         struct nvmf_common_command fabrics;
>>>> @@ -1433,6 +1532,18 @@ enum {
>>>>     NVME_SC_DISCOVERY_RESTART    = 0x190,
>>>>     NVME_SC_AUTH_REQUIRED        = 0x191,
>>>>
>>>> +    /*
>>>> +     * I/O Command Set Specific - Zoned commands:
>>>> +     */
>>>> +    NVME_SC_ZONE_BOUNDARY_ERROR    = 0x1b8,
>>>> +    NVME_SC_ZONE_FULL        = 0x1b9,
>>>> +    NVME_SC_ZONE_READ_ONLY        = 0x1ba,
>>>> +    NVME_SC_ZONE_OFFLINE        = 0x1bb,
>>>> +    NVME_SC_ZONE_INVALID_WRITE    = 0x1bc,
>>>> +    NVME_SC_ZONE_TOO_MANY_ACTIVE    = 0x1bd,
>>>> +    NVME_SC_ZONE_TOO_MANY_OPEN    = 0x1be,
>>>> +    NVME_SC_ZONE_INVALID_TRANSITION    = 0x1bf,
>>>> +
>>>>     /*
>>>>      * Media and Data Integrity Errors:
>>>>      */
>>>> -- 
>>>> 2.24.1
>>>>
>>
Javier González June 16, 2020, 12:24 p.m. UTC | #5
On 16.06.2020 14:06, Matias Bjørling wrote:
>On 16/06/2020 14.00, Javier González wrote:
>>On 16.06.2020 13:18, Matias Bjørling wrote:
>>>On 16/06/2020 12.41, Javier González wrote:
>>>>On 16.06.2020 08:34, Keith Busch wrote:
>>>>>Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>>>in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>>>Command Set Identifier reported in the namespaces Namespace
>>>>>Identification Descriptor list. A successfully discovered Zoned
>>>>>Namespace will be registered with the block layer as a host managed
>>>>>zoned block device with Zone Append command support. A namespace that
>>>>>does not support append is not supported by the driver.
>>>>
>>>>>Why are we enforcing the append command? Append is optional in the
>>>>>current ZNS specification, so we should not make this mandatory in the
>>>>>implementation. See specifics below.
>>
>>>
>>>There is already general support in the kernel for the zone append 
>>>command. Feel free to submit patches to emulate the support. It is 
>>>outside the scope of this patchset.
>>>
>>
>>It is fine that the kernel supports append, but the ZNS specification
>>does not make append mandatory, so the driver should not do so either.
>>
>>ZNS SSDs that choose to leave append as a non-implemented optional
>>command should not rely on emulated SW support, especially when
>>traditional writes work perfectly well for a large part of current ZNS
>>use cases.
>>
>>Please remove this virtual constraint.
>
>The Zone Append command is mandatory for zoned block devices. Please 
>see https://lwn.net/Articles/818709/ for the background.

I do not see anywhere in the block layer that append is mandatory for
zoned devices. Append is emulated on ZBC, but beyond that there are no
mandatory bits. Please explain.

> Please submit patches if you want to have support for ZNS devices that
> do not implement the Zone Append command. It is outside the scope
> of this patchset.

That we will.
Matias Bjørling June 16, 2020, 12:27 p.m. UTC | #6
On 16/06/2020 14.24, Javier González wrote:
> On 16.06.2020 14:06, Matias Bjørling wrote:
>> On 16/06/2020 14.00, Javier González wrote:
>>> On 16.06.2020 13:18, Matias Bjørling wrote:
>>>> On 16/06/2020 12.41, Javier González wrote:
>>>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set 
>>>>>> defined
>>>>>> in NVM Express TP4053. Zoned namespaces are discovered based on 
>>>>>> their
>>>>>> Command Set Identifier reported in the namespaces Namespace
>>>>>> Identification Descriptor list. A successfully discovered Zoned
>>>>>> Namespace will be registered with the block layer as a host managed
>>>>>> zoned block device with Zone Append command support. A namespace 
>>>>>> that
>>>>>> does not support append is not supported by the driver.
>>>>>
>>>>> Why are we enforcing the append command? Append is optional in the
>>>>> current ZNS specification, so we should not make this mandatory in
>>>>> the implementation. See specifics below.
>>>
>>>>
>>>> There is already general support in the kernel for the zone append 
>>>> command. Feel free to submit patches to emulate the support. It is 
>>>> outside the scope of this patchset.
>>>>
>>>
>>> It is fine that the kernel supports append, but the ZNS specification
>>> does not make append mandatory, so the driver should not do so either.
>>>
>>> ZNS SSDs that choose to leave append as a non-implemented optional
>>> command should not rely on emulated SW support, especially when
>>> traditional writes work perfectly well for a large part of current ZNS
>>> use cases.
>>>
>>> Please remove this virtual constraint.
>>
>> The Zone Append command is mandatory for zoned block devices. Please 
>> see https://lwn.net/Articles/818709/ for the background.
>
> I do not see anywhere in the block layer that append is mandatory for
> zoned devices. Append is emulated on ZBC, but beyond that there are no
> mandatory bits. Please explain.
>
>> Please submit patches if you want to have support for ZNS devices that
>> do not implement the Zone Append command. It is outside the scope
>> of this patchset.
>
> That we will.
>
Thanks, appreciate it.

Best, Matias
Judy Brock June 16, 2020, 12:35 p.m. UTC | #7
>>> A namespace that does not support append is not supported by the driver.

I am not that knowledgeable about Linux kernel drivers, so maybe this is a dumb question, but won't the driver in question be the default kernel driver for ZNS devices? If so, why would that driver deliberately reject a device which is 100% compliant with the ZNS spec?
That would seem to favor specific implementations, which seems like an inappropriate thing for a community driver to do unless there is an actual technical reason the driver is unable to function w/o append. Is there any such reason, and if so, what is it? Thanks, and sorry if I've misunderstood.


Judy

-----Original Message-----
From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Javier González
Sent: Tuesday, June 16, 2020 5:00 AM
To: Matias Bjørling
Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces

On 16.06.2020 13:18, Matias Bjørling wrote:
>On 16/06/2020 12.41, Javier González wrote:
>>On 16.06.2020 08:34, Keith Busch wrote:
>>>Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>Command Set Identifier reported in the namespaces Namespace
>>>Identification Descriptor list. A successfully discovered Zoned
>>>Namespace will be registered with the block layer as a host managed
>>>zoned block device with Zone Append command support. A namespace that
>>>does not support append is not supported by the driver.
>>
>>Why are we enforcing the append command? Append is optional in the
>>current ZNS specification, so we should not make this mandatory in the
>>implementation. See specifics below.

>
>There is already general support in the kernel for the zone append 
>command. Feel free to submit patches to emulate the support. It is 
>outside the scope of this patchset.
>

It is fine that the kernel supports append, but the ZNS specification
does not make append mandatory, so the driver should not do so either.

ZNS SSDs that choose to leave append as a non-implemented optional
command should not rely on emulated SW support, especially when
traditional writes work perfectly well for a large part of current ZNS
use cases.

Please remove this virtual constraint.

>>
>>>
>>>Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
>>>Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
>>>Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
>>>Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
>>>Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
>>>Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
>>>Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
>>>Signed-off-by: Keith Busch <keith.busch@wdc.com>
>>>---
>>>drivers/nvme/host/Makefile |   1 +
>>>drivers/nvme/host/core.c   |  91 ++++++++++++--
>>>drivers/nvme/host/nvme.h   |  39 ++++++
>>>drivers/nvme/host/zns.c    | 238 +++++++++++++++++++++++++++++++++++++
>>>include/linux/nvme.h       | 111 +++++++++++++++++
>>>5 files changed, 468 insertions(+), 12 deletions(-)
>>>create mode 100644 drivers/nvme/host/zns.c
>>>
>>>diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>>>index fc7b26be692d..d7f6a87687b8 100644
>>>--- a/drivers/nvme/host/Makefile
>>>+++ b/drivers/nvme/host/Makefile
>>>@@ -13,6 +13,7 @@ nvme-core-y                := core.o
>>>nvme-core-$(CONFIG_TRACING)        += trace.o
>>>nvme-core-$(CONFIG_NVME_MULTIPATH)    += multipath.o
>>>nvme-core-$(CONFIG_NVM)            += lightnvm.o
>>>+nvme-core-$(CONFIG_BLK_DEV_ZONED)    += zns.o
>>>nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)    += fault_inject.o
>>>nvme-core-$(CONFIG_NVME_HWMON)        += hwmon.o
>>>
>>>diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>index 58f137b9f2c5..e961910da4ac 100644
>>>--- a/drivers/nvme/host/core.c
>>>+++ b/drivers/nvme/host/core.c
>>>@@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
>>>static struct class *nvme_class;
>>>static struct class *nvme_subsys_class;
>>>
>>>-static int nvme_revalidate_disk(struct gendisk *disk);
>>>+static int _nvme_revalidate_disk(struct gendisk *disk);
>>>static void nvme_put_subsystem(struct nvme_subsystem *subsys);
>>>static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>>>                       unsigned nsid);
>>>@@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
>>>            nvme_retry_req(req);
>>>            return;
>>>        }
>>>+    } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>>>+           req_op(req) == REQ_OP_ZONE_APPEND) {
>>>+        req->__sector = nvme_lba_to_sect(req->q->queuedata,
>>>+            le64_to_cpu(nvme_req(req)->result.u64));
>>>    }
>>>
>>>    nvme_trace_bio_complete(req, status);
>>>@@ -673,7 +677,8 @@ static inline blk_status_t 
>>>nvme_setup_write_zeroes(struct nvme_ns *ns,
>>>}
>>>
>>>static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>>-        struct request *req, struct nvme_command *cmnd)
>>>+        struct request *req, struct nvme_command *cmnd,
>>>+        enum nvme_opcode op)
>>>{
>>>    struct nvme_ctrl *ctrl = ns->ctrl;
>>>    u16 control = 0;
>>>@@ -687,7 +692,7 @@ static inline blk_status_t 
>>>nvme_setup_rw(struct nvme_ns *ns,
>>>    if (req->cmd_flags & REQ_RAHEAD)
>>>        dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>>
>>>-    cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : 
>>>nvme_cmd_read);
>>>+    cmnd->rw.opcode = op;
>>>    cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
>>>    cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>    cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> 
>>>ns->lba_shift) - 1);
>>>@@ -716,6 +721,8 @@ static inline blk_status_t 
>>>nvme_setup_rw(struct nvme_ns *ns,
>>>        case NVME_NS_DPS_PI_TYPE2:
>>>            control |= NVME_RW_PRINFO_PRCHK_GUARD |
>>>                    NVME_RW_PRINFO_PRCHK_REF;
>>>+            if (op == nvme_cmd_zone_append)
>>>+                control |= NVME_RW_APPEND_PIREMAP;
>>>            cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
>>>            break;
>>>        }
>>>@@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns 
>>>*ns, struct request *req,
>>>    case REQ_OP_FLUSH:
>>>        nvme_setup_flush(ns, cmd);
>>>        break;
>>>+    case REQ_OP_ZONE_RESET_ALL:
>>>+    case REQ_OP_ZONE_RESET:
>>>+        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
>>>+        break;
>>>+    case REQ_OP_ZONE_OPEN:
>>>+        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
>>>+        break;
>>>+    case REQ_OP_ZONE_CLOSE:
>>>+        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
>>>+        break;
>>>+    case REQ_OP_ZONE_FINISH:
>>>+        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, 
>>>NVME_ZONE_FINISH);
>>>+        break;
>>>    case REQ_OP_WRITE_ZEROES:
>>>        ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>        break;
>>>@@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns 
>>>*ns, struct request *req,
>>>        ret = nvme_setup_discard(ns, req, cmd);
>>>        break;
>>>    case REQ_OP_READ:
>>>+        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>>>+        break;
>>>    case REQ_OP_WRITE:
>>>-        ret = nvme_setup_rw(ns, req, cmd);
>>>+        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>>>+        break;
>>>+    case REQ_OP_ZONE_APPEND:
>>>+        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
>>>        break;
>>>    default:
>>>        WARN_ON_ONCE(1);
>>>@@ -1392,14 +1417,23 @@ static u32 nvme_passthru_start(struct 
>>>nvme_ctrl *ctrl, struct nvme_ns *ns,
>>>    return effects;
>>>}
>>>
>>>-static void nvme_update_formats(struct nvme_ctrl *ctrl)
>>>+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
>>>{
>>>    struct nvme_ns *ns;
>>>
>>>    down_read(&ctrl->namespaces_rwsem);
>>>    list_for_each_entry(ns, &ctrl->namespaces, list)
>>>-        if (ns->disk && nvme_revalidate_disk(ns->disk))
>>>+        if (ns->disk && _nvme_revalidate_disk(ns->disk))
>>>            nvme_set_queue_dying(ns);
>>>+        else if (blk_queue_is_zoned(ns->disk->queue)) {
>>>+            /*
>>>+             * IO commands are required to fully revalidate a zoned
>>>+             * device. Force the command effects to trigger rescan
>>>+             * work so report zones can run in a context with
>>>+             * unfrozen IO queues.
>>>+             */
>>>+            *effects |= NVME_CMD_EFFECTS_NCC;
>>>+        }
>>>    up_read(&ctrl->namespaces_rwsem);
>>>}
>>>
>>>@@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct 
>>>nvme_ctrl *ctrl, u32 effects)
>>>     * this command.
>>>     */
>>>    if (effects & NVME_CMD_EFFECTS_LBCC)
>>>-        nvme_update_formats(ctrl);
>>>+        nvme_update_formats(ctrl, &effects);
>>>    if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
>>>        nvme_unfreeze(ctrl);
>>>        nvme_mpath_unfreeze(ctrl->subsys);
>>>@@ -1526,7 +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl 
>>>*ctrl, struct nvme_ns *ns,
>>> * Issue ioctl requests on the first available path.  Note that 
>>>unlike normal
>>> * block layer requests we will not retry failed request on 
>>>another controller.
>>> */
>>>-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>        struct nvme_ns_head **head, int *srcu_idx)
>>>{
>>>#ifdef CONFIG_NVME_MULTIPATH
>>>@@ -1546,7 +1580,7 @@ static struct nvme_ns 
>>>*nvme_get_ns_from_disk(struct gendisk *disk,
>>>    return disk->private_data;
>>>}
>>>
>>>-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>{
>>>    if (head)
>>>        srcu_read_unlock(&head->srcu, idx);
>>>@@ -1939,21 +1973,28 @@ static void nvme_update_disk_info(struct 
>>>gendisk *disk,
>>>
>>>static int __nvme_revalidate_disk(struct gendisk *disk, struct 
>>>nvme_id_ns *id)
>>>{
>>>+    unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
>>>    struct nvme_ns *ns = disk->private_data;
>>>    struct nvme_ctrl *ctrl = ns->ctrl;
>>>+    int ret;
>>>    u32 iob;
>>>
>>>    /*
>>>     * If identify namespace failed, use default 512 byte block size so
>>>     * block layer can use before failing read/write for 0 capacity.
>>>     */
>>>-    ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>>>+    ns->lba_shift = id->lbaf[lbaf].ds;
>>>    if (ns->lba_shift == 0)
>>>        ns->lba_shift = 9;
>>>
>>>    switch (ns->head->ids.csi) {
>>>    case NVME_CSI_NVM:
>>>        break;
>>>+    case NVME_CSI_ZNS:
>>>+        ret = nvme_update_zone_info(disk, ns, lbaf);
>>>+        if (ret)
>>>+            return ret;
>>>+        break;
>>>    default:
>>>        dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
>>>            ns->head->ids.csi, ns->head->ns_id);
>>>@@ -1967,7 +2008,7 @@ static int __nvme_revalidate_disk(struct 
>>>gendisk *disk, struct nvme_id_ns *id)
>>>        iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>>>
>>>    ns->features = 0;
>>>-    ns->ms = le16_to_cpu(id->lbaf[id->flbas & 
>>>NVME_NS_FLBAS_LBA_MASK].ms);
>>>+    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
>>>    /* the PI implementation requires metadata equal t10 pi tuple 
>>>size */
>>>    if (ns->ms == sizeof(struct t10_pi_tuple))
>>>        ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>>>@@ -2010,7 +2051,7 @@ static int __nvme_revalidate_disk(struct 
>>>gendisk *disk, struct nvme_id_ns *id)
>>>    return 0;
>>>}
>>>
>>>-static int nvme_revalidate_disk(struct gendisk *disk)
>>>+static int _nvme_revalidate_disk(struct gendisk *disk)
>>>{
>>>    struct nvme_ns *ns = disk->private_data;
>>>    struct nvme_ctrl *ctrl = ns->ctrl;
>>>@@ -2058,6 +2099,28 @@ static int nvme_revalidate_disk(struct 
>>>gendisk *disk)
>>>    return ret;
>>>}
>>>
>>>+static int nvme_revalidate_disk(struct gendisk *disk)
>>>+{
>>>+    int ret;
>>>+
>>>+    ret = _nvme_revalidate_disk(disk);
>>>+    if (ret)
>>>+        return ret;
>>>+
>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>+    if (blk_queue_is_zoned(disk->queue)) {
>>>+        struct nvme_ns *ns = disk->private_data;
>>>+        struct nvme_ctrl *ctrl = ns->ctrl;
>>>+
>>>+        ret = blk_revalidate_disk_zones(disk, NULL);
>>>+        if (!ret)
>>>+            blk_queue_max_zone_append_sectors(disk->queue,
>>>+                              ctrl->max_zone_append);
>>>+    }
>>>+#endif
>>>+    return ret;
>>>+}
>>>+
>>>static char nvme_pr_type(enum pr_type type)
>>>{
>>>    switch (type) {
>>>@@ -2188,6 +2251,7 @@ static const struct block_device_operations 
>>>nvme_fops = {
>>>    .release    = nvme_release,
>>>    .getgeo        = nvme_getgeo,
>>>    .revalidate_disk= nvme_revalidate_disk,
>>>+    .report_zones    = nvme_report_zones,
>>>    .pr_ops        = &nvme_pr_ops,
>>>};
>>>
>>>@@ -2213,6 +2277,7 @@ const struct block_device_operations 
>>>nvme_ns_head_ops = {
>>>    .ioctl        = nvme_ioctl,
>>>    .compat_ioctl    = nvme_compat_ioctl,
>>>    .getgeo        = nvme_getgeo,
>>>+    .report_zones    = nvme_report_zones,
>>>    .pr_ops        = &nvme_pr_ops,
>>>};
>>>#endif /* CONFIG_NVME_MULTIPATH */
>>>@@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
>>>    BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
>>>    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 
>>>NVME_IDENTIFY_DATA_SIZE);
>>>    BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
>>>+    BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != 
>>>NVME_IDENTIFY_DATA_SIZE);
>>>+    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != 
>>>NVME_IDENTIFY_DATA_SIZE);
>>>    BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>>>    BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
>>>    BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>>>diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>index 58428e3a590e..662f95fbd909 100644
>>>--- a/drivers/nvme/host/nvme.h
>>>+++ b/drivers/nvme/host/nvme.h
>>>@@ -239,6 +239,9 @@ struct nvme_ctrl {
>>>    u32 max_hw_sectors;
>>>    u32 max_segments;
>>>    u32 max_integrity_segments;
>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>+    u32 max_zone_append;
>>>+#endif
>>>    u16 crdt[3];
>>>    u16 oncs;
>>>    u16 oacs;
>>>@@ -403,6 +406,9 @@ struct nvme_ns {
>>>    u16 sgs;
>>>    u32 sws;
>>>    u8 pi_type;
>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>+    u64 zsze;
>>>+#endif
>>>    unsigned long features;
>>>    unsigned long flags;
>>>#define NVME_NS_REMOVING    0
>>>@@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>>>
>>>int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 
>>>lsp, u8 csi,
>>>        void *log, size_t size, u64 offset);
>>>+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>+        struct nvme_ns_head **head, int *srcu_idx);
>>>+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>>>
>>>extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>>extern const struct block_device_operations nvme_ns_head_ops;
>>>@@ -689,6 +698,36 @@ static inline void 
>>>nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>>}
>>>#endif /* CONFIG_NVME_MULTIPATH */
>>>
>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>+              unsigned lbaf);
>>>+
>>>+int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>+              unsigned int nr_zones, report_zones_cb cb, void *data);
>>>+
>>>+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct 
>>>request *req,
>>>+                       struct nvme_command *cmnd,
>>>+                       enum nvme_zone_mgmt_action action);
>>>+#else
>>>+#define nvme_report_zones NULL
>>>+
>>>+static inline blk_status_t nvme_setup_zone_mgmt_send(struct 
>>>nvme_ns *ns,
>>>+        struct request *req, struct nvme_command *cmnd,
>>>+        enum nvme_zone_mgmt_action action)
>>>+{
>>>+    return BLK_STS_NOTSUPP;
>>>+}
>>>+
>>>+static inline int nvme_update_zone_info(struct gendisk *disk,
>>>+                    struct nvme_ns *ns,
>>>+                    unsigned lbaf)
>>>+{
>>>+    dev_warn(ns->ctrl->device,
>>>+         "Please enable CONFIG_BLK_DEV_ZONED to support ZNS 
>>>devices\n");
>>>+    return -EPROTONOSUPPORT;
>>>+}
>>>+#endif
>>>+
>>>#ifdef CONFIG_NVM
>>>int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
>>>void nvme_nvm_unregister(struct nvme_ns *ns);
>>>diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>>>new file mode 100644
>>>index 000000000000..c08f6281b614
>>>--- /dev/null
>>>+++ b/drivers/nvme/host/zns.c
>>>@@ -0,0 +1,238 @@
>>>+// SPDX-License-Identifier: GPL-2.0
>>>+/*
>>>+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>>+ */
>>>+
>>>+#include <linux/blkdev.h>
>>>+#include <linux/vmalloc.h>
>>>+#include "nvme.h"
>>>+
>>>+static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>>+{
>>>+    struct nvme_command c = { };
>>>+    struct nvme_id_ctrl_zns *id;
>>>+    int status;
>>>+
>>>+    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>+    if (!id)
>>>+        return -ENOMEM;
>>>+
>>>+    c.identify.opcode = nvme_admin_identify;
>>>+    c.identify.cns = NVME_ID_CNS_CS_CTRL;
>>>+    c.identify.csi = NVME_CSI_ZNS;
>>>+
>>>+    status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>>>+    if (status) {
>>>+        kfree(id);
>>>+        return status;
>>>+    }
>>>+
>>>+    ctrl->max_zone_append = 1 << (id->zamds + 3);
>>>+    kfree(id);
>>>+    return 0;
>>>+}
>>>+
>>>+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>+              unsigned lbaf)
>>>+{
>>>+    struct nvme_effects_log *log = ns->head->effects;
>>>+    struct request_queue *q = disk->queue;
>>>+    struct nvme_command c = { };
>>>+    struct nvme_id_ns_zns *id;
>>>+    int status;
>>>+
>>>+    /* Driver requires zone append support */
>>>+    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>>>+        return -ENODEV;
>>
>>Following up on the initial comment, this check should go.
>
>See first comment.

See above and please remove.

>
>>
>>>+
>>>+    /* Lazily query controller append limit for the first zoned 
>>>namespace */
>>>+    if (!ns->ctrl->max_zone_append) {
>>>+        status = nvme_set_max_append(ns->ctrl);
>>>+        if (status)
>>>+            return status;
>>>+    }
>>
>>This should only be applied if append is supported.
>
>See first comment.
>
>>
>>>+
>>>+    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>+    if (!id)
>>>+        return -ENOMEM;
>>>+
>>>+    c.identify.opcode = nvme_admin_identify;
>>>+    c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>>>+    c.identify.cns = NVME_ID_CNS_CS_NS;
>>>+    c.identify.csi = NVME_CSI_ZNS;
>>>+
>>>+    status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, 
>>>sizeof(*id));
>>>+    if (status)
>>>+        goto free_data;
>>>+
>>>+    /*
>>>+     * We currently do not handle devices requiring any of the zoned
>>>+     * operation characteristics.
>>>+     */
>>>+    if (id->zoc) {
>>>+        status = -EINVAL;
>>>+        goto free_data;
>>>+    }
>>
>>I understand that "Variable Zone Capacity" is not supported as it
>>requires major changes at this moment, but we should support controllers
>>that enable "Zone Active Excursions", even when the AER event is not
>>implemented in this patchset.
>
>
>NAK. Similarly to VZC, this allows an unsuspecting user to have major 
>data loss when a zone is suddenly moved to Full.

I buy that.

>
>
>>
>>>+
>>>+    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>>>+    if (!ns->zsze) {
>>>+        status = -EINVAL;
>>>+        goto free_data;
>>>+    }
>>>+
>>>+    q->limits.zoned = BLK_ZONED_HM;
>>>+    blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>>>+free_data:
>>>+    kfree(id);
>>>+    return status;
>>>+}
>>>+
>>>+static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>>+                      unsigned int nr_zones, size_t *buflen)
>>>+{
>>>+    struct request_queue *q = ns->disk->queue;
>>>+    size_t bufsize;
>>>+    void *buf;
>>>+
>>>+    const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>>>+                   sizeof(struct nvme_zone_descriptor);
>>>+
>>>+    nr_zones = min_t(unsigned int, nr_zones,
>>>+             get_capacity(ns->disk) >> ilog2(ns->zsze));
>>>+
>>>+    bufsize = sizeof(struct nvme_zone_report) +
>>>+        nr_zones * sizeof(struct nvme_zone_descriptor);
>>>+    bufsize = min_t(size_t, bufsize,
>>>+            queue_max_hw_sectors(q) << SECTOR_SHIFT);
>>>+    bufsize = min_t(size_t, bufsize, queue_max_segments(q) << 
>>>PAGE_SHIFT);
>>>+
>>>+    while (bufsize >= min_bufsize) {
>>>+        buf = __vmalloc(bufsize,
>>>+                GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>>>+        if (buf) {
>>>+            *buflen = bufsize;
>>>+            return buf;
>>>+        }
>>>+        bufsize >>= 1;
>>>+    }
>>>+    return NULL;
>>>+}
>>>+
>>>+static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>+                  struct nvme_zone_report *report,
>>>+                  size_t buflen)
>>>+{
>>>+    struct nvme_command c = { };
>>>+    int ret;
>>>+
>>>+    c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>>>+    c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>>>+    c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>>>+    c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>>>+    c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>>>+    c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>>+    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>>+
>>>+    ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>>>+    if (ret)
>>>+        return ret;
>>>+
>>>+    return le64_to_cpu(report->nr_zones);
>>>+}
>>>+
>>>+static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>>+                 struct nvme_zone_descriptor *entry,
>>>+                 unsigned int idx, report_zones_cb cb,
>>>+                 void *data)
>>>+{
>>>+    struct blk_zone zone = { };
>>>+
>>>+    if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>>+        dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>>>+                entry->zt);
>>>+        return -EINVAL;
>>>+    }
>>>+
>>>+    zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>>>+    zone.cond = entry->zs >> 4;
>>>+    zone.len = ns->zsze;
>>>+    zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>>+    zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>>>+    zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>>>+
>>>+    return cb(&zone, idx, data);
>>>+}
>>>+
>>>+static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>+            unsigned int nr_zones, report_zones_cb cb, void *data)
>>>+{
>>>+    struct nvme_zone_report *report;
>>>+    int ret, zone_idx = 0;
>>>+    unsigned int nz, i;
>>>+    size_t buflen;
>>>+
>>>+    report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>>>+    if (!report)
>>>+        return -ENOMEM;
>>>+
>>>+    sector &= ~(ns->zsze - 1);
>>>+    while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>>+        memset(report, 0, buflen);
>>>+        ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>>>+        if (ret < 0)
>>>+            goto out_free;
>>>+
>>>+        nz = min_t(unsigned int, ret, nr_zones);
>>>+        if (!nz)
>>>+            break;
>>>+
>>>+        for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>>>+            ret = nvme_zone_parse_entry(ns, &report->entries[i],
>>>+                            zone_idx, cb, data);
>>>+            if (ret)
>>>+                goto out_free;
>>>+            zone_idx++;
>>>+        }
>>>+
>>>+        sector += ns->zsze * nz;
>>>+    }
>>>+
>>>+    ret = zone_idx;
>>>+out_free:
>>>+    kvfree(report);
>>>+    return ret;
>>>+}
>>>+
>>>+int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>+              unsigned int nr_zones, report_zones_cb cb, void *data)
>>>+{
>>>+    struct nvme_ns_head *head = NULL;
>>>+    struct nvme_ns *ns;
>>>+    int srcu_idx, ret;
>>>+
>>>+    ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>>>+    if (unlikely(!ns))
>>>+        return -EWOULDBLOCK;
>>>+
>>>+    if (ns->head->ids.csi == NVME_CSI_ZNS)
>>>+        ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>>>+    else
>>>+        ret = -EINVAL;
>>>+    nvme_put_ns_from_disk(head, srcu_idx);
>>>+
>>>+    return ret;
>>>+}
>>>+
>>>+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct 
>>>request *req,
>>>+        struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>>+{
>>>+    c->zms.opcode = nvme_cmd_zone_mgmt_send;
>>>+    c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>>>+    c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>+    c->zms.action = action;
>>>+
>>>+    if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>>+        c->zms.select = 1;
>>>+
>>>+    return BLK_STS_OK;
>>>+}
>>>diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>>>index ea25da572eed..7b3fa7de07bd 100644
>>>--- a/include/linux/nvme.h
>>>+++ b/include/linux/nvme.h
>>>@@ -374,6 +374,30 @@ struct nvme_id_ns {
>>>    __u8            vs[3712];
>>>};
>>>
>>>+struct nvme_zns_lbafe {
>>>+    __le64            zsze;
>>>+    __u8            zdes;
>>>+    __u8            rsvd9[7];
>>>+};
>>>+
>>>+struct nvme_id_ns_zns {
>>>+    __le16            zoc;
>>>+    __le16            ozcs;
>>>+    __le32            mar;
>>>+    __le32            mor;
>>>+    __le32            rrl;
>>>+    __le32            frl;
>>>+    __u8            rsvd20[2796];
>>>+    struct nvme_zns_lbafe    lbafe[16];
>>>+    __u8            rsvd3072[768];
>>>+    __u8            vs[256];
>>>+};
>>>+
>>>+struct nvme_id_ctrl_zns {
>>>+    __u8    zamds;
>>>+    __u8    rsvd1[4095];
>>>+};
>>>+
>>>enum {
>>>    NVME_ID_CNS_NS            = 0x00,
>>>    NVME_ID_CNS_CTRL        = 0x01,
>>>@@ -392,6 +416,7 @@ enum {
>>>
>>>enum {
>>>    NVME_CSI_NVM            = 0,
>>>+    NVME_CSI_ZNS            = 2,
>>>};
>>>
>>>enum {
>>>@@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
>>>    __le16    rsvd10[3];
>>>};
>>>
>>>+struct nvme_zone_descriptor {
>>>+    __u8        zt;
>>>+    __u8        zs;
>>>+    __u8        za;
>>>+    __u8        rsvd3[5];
>>>+    __le64        zcap;
>>>+    __le64        zslba;
>>>+    __le64        wp;
>>>+    __u8        rsvd32[32];
>>>+};
>>>+
>>>+enum {
>>>+    NVME_ZONE_TYPE_SEQWRITE_REQ    = 0x2,
>>>+};
>>>+
>>>+struct nvme_zone_report {
>>>+    __le64        nr_zones;
>>>+    __u8        resv8[56];
>>>+    struct nvme_zone_descriptor entries[];
>>>+};
>>>+
>>>enum {
>>>    NVME_SMART_CRIT_SPARE        = 1 << 0,
>>>    NVME_SMART_CRIT_TEMPERATURE    = 1 << 1,
>>>@@ -626,6 +672,9 @@ enum nvme_opcode {
>>>    nvme_cmd_resv_report    = 0x0e,
>>>    nvme_cmd_resv_acquire    = 0x11,
>>>    nvme_cmd_resv_release    = 0x15,
>>>+    nvme_cmd_zone_mgmt_send    = 0x79,
>>>+    nvme_cmd_zone_mgmt_recv    = 0x7a,
>>>+    nvme_cmd_zone_append    = 0x7d,
>>>};
>>>
>>>#define nvme_opcode_name(opcode)    { opcode, #opcode }
>>>@@ -764,6 +813,7 @@ struct nvme_rw_command {
>>>enum {
>>>    NVME_RW_LR            = 1 << 15,
>>>    NVME_RW_FUA            = 1 << 14,
>>>+    NVME_RW_APPEND_PIREMAP        = 1 << 9,
>>>    NVME_RW_DSM_FREQ_UNSPEC        = 0,
>>>    NVME_RW_DSM_FREQ_TYPICAL    = 1,
>>>    NVME_RW_DSM_FREQ_RARE        = 2,
>>>@@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
>>>    __le16            appmask;
>>>};
>>>
>>>+enum nvme_zone_mgmt_action {
>>>+    NVME_ZONE_CLOSE        = 0x1,
>>>+    NVME_ZONE_FINISH    = 0x2,
>>>+    NVME_ZONE_OPEN        = 0x3,
>>>+    NVME_ZONE_RESET        = 0x4,
>>>+    NVME_ZONE_OFFLINE    = 0x5,
>>>+    NVME_ZONE_SET_DESC_EXT    = 0x10,
>>>+};
>>>+
>>>+struct nvme_zone_mgmt_send_cmd {
>>>+    __u8            opcode;
>>>+    __u8            flags;
>>>+    __u16            command_id;
>>>+    __le32            nsid;
>>>+    __le32            cdw2[2];
>>>+    __le64            metadata;
>>>+    union nvme_data_ptr    dptr;
>>>+    __le64            slba;
>>>+    __le32            cdw12;
>>>+    __u8            action;
>>
>>Why not zsa, to make it easier to match the spec?
>>
>>
>>>+    __u8            select;
>>
>>sel_all?
>>
>>>+    __u8            rsvd13[2];
>>>+    __le32            cdw14[2];
>>>+};
>>>+
>>>+struct nvme_zone_mgmt_recv_cmd {
>>>+    __u8            opcode;
>>>+    __u8            flags;
>>>+    __u16            command_id;
>>>+    __le32            nsid;
>>>+    __le64            rsvd2[2];
>>>+    union nvme_data_ptr    dptr;
>>>+    __le64            slba;
>>>+    __le32            numd;
>>>+    __u8            zra;
>>>+    __u8            zrasf;
>>>+    __u8            pr;
>>
>>Partial Report is just one bit in the "Zone Receive Action Specific
>>Features". What about zrasfe?
>
>There are currently no users of pr, and bits 1-7 are reserved in the
>spec. Users of the pr variable should shift and mask as necessary.
>
>zrasf looks good to me. It is defined as a byte in the spec.

I meant the name of the pr variable. Agree with the rest.

>
>>
>>>+    __u8            rsvd13;
>>>+    __le32            cdw14[2];
>>>+};
>>>+
>>>+enum {
>>>+    NVME_ZRA_ZONE_REPORT        = 0,
>>>+    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>>>+    NVME_REPORT_ZONE_PARTIAL    = 1,
>>>+};
>>>+
>>>/* Features */
>>>
>>>enum {
>>>@@ -1300,6 +1397,8 @@ struct nvme_command {
>>>        struct nvme_format_cmd format;
>>>        struct nvme_dsm_cmd dsm;
>>>        struct nvme_write_zeroes_cmd write_zeroes;
>>>+        struct nvme_zone_mgmt_send_cmd zms;
>>>+        struct nvme_zone_mgmt_recv_cmd zmr;
>>>        struct nvme_abort_cmd abort;
>>>        struct nvme_get_log_page_command get_log_page;
>>>        struct nvmf_common_command fabrics;
>>>@@ -1433,6 +1532,18 @@ enum {
>>>    NVME_SC_DISCOVERY_RESTART    = 0x190,
>>>    NVME_SC_AUTH_REQUIRED        = 0x191,
>>>
>>>+    /*
>>>+     * I/O Command Set Specific - Zoned commands:
>>>+     */
>>>+    NVME_SC_ZONE_BOUNDARY_ERROR    = 0x1b8,
>>>+    NVME_SC_ZONE_FULL        = 0x1b9,
>>>+    NVME_SC_ZONE_READ_ONLY        = 0x1ba,
>>>+    NVME_SC_ZONE_OFFLINE        = 0x1bb,
>>>+    NVME_SC_ZONE_INVALID_WRITE    = 0x1bc,
>>>+    NVME_SC_ZONE_TOO_MANY_ACTIVE    = 0x1bd,
>>>+    NVME_SC_ZONE_TOO_MANY_OPEN    = 0x1be,
>>>+    NVME_SC_ZONE_INVALID_TRANSITION    = 0x1bf,
>>>+
>>>    /*
>>>     * Media and Data Integrity Errors:
>>>     */
>>>-- 
>>>2.24.1
>>>
>
Damien Le Moal June 16, 2020, 12:35 p.m. UTC | #8
On 2020/06/16 21:24, Javier González wrote:
> On 16.06.2020 14:06, Matias Bjørling wrote:
>> On 16/06/2020 14.00, Javier González wrote:
>>> On 16.06.2020 13:18, Matias Bjørling wrote:
>>>> On 16/06/2020 12.41, Javier González wrote:
>>>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>>>> Command Set Identifier reported in the namespaces Namespace
>>>>>> Identification Descriptor list. A successfully discovered Zoned
>>>>>> Namespace will be registered with the block layer as a host managed
>>>>>> zoned block device with Zone Append command support. A namespace that
>>>>>> does not support append is not supported by the driver.
>>>>>
>>>>> Why are we enforcing the append command? Append is optional in the
>>>>> current ZNS specification, so we should not make this mandatory in the
>>>>> implementation. See specifics below.
>>>
>>>>
>>>> There is already general support in the kernel for the zone append 
>>>> command. Feel free to submit patches to emulate the support. It is 
>>>> outside the scope of this patchset.
>>>>
>>>
>>> It is fine that the kernel supports append, but the ZNS specification
>>> does not make append mandatory, so the driver should not do so either.
>>>
>>> ZNS SSDs that choose to leave append as a non-implemented optional
>>> command should not rely on emulated SW support, especially when
>>> traditional writes work perfectly well for a large part of current ZNS
>>> use cases.
>>>
>>> Please remove this virtual constraint.
>>
>> The Zone Append command is mandatory for zoned block devices. Please 
>> see https://lwn.net/Articles/818709/ for the background.
> 
> I do not see anywhere in the block layer that append is mandatory for
> zoned devices. Append is emulated on ZBC, but beyond that there are no
> mandatory bits. Please explain.

This is to allow higher layers, e.g. file systems, to use a single write IO
path for all types of zoned block devices. The ongoing rework of btrfs zone
support, for instance, now relies 100% on zone append being supported. That
significantly simplifies the file system support and, more importantly, removes
the need for locking around block allocation and BIO issuing, preserving a
fully asynchronous write path that can include workqueues for efficient CPU
usage of things like encryption and compression. Without zone append, file
systems would either (1) have to reject drives that do not support zone append,
or (2) implement two different write IO paths (slower regular writes and zone
append). Neither of these options is ideal, to say the least.
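
As a concrete illustration of that single path, here is a minimal sketch (not
part of this patch; the example_* names are made up) of how an in-kernel user
such as a file system issues a Zone Append and learns where the data landed.
On completion, the block layer copies req->__sector, which nvme_complete_rq()
fills in from the command result, into the bio:

static void example_append_endio(struct bio *bio)
{
	/*
	 * For REQ_OP_ZONE_APPEND, bi_sector now holds the sector the
	 * device actually wrote the data to.
	 */
	sector_t written = bio->bi_iter.bi_sector;

	pr_info("zone append landed at sector %llu\n",
		(unsigned long long)written);
	bio_put(bio);
}

static void example_zone_append(struct block_device *bdev, struct page *page,
				sector_t zone_start)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio_set_dev(bio, bdev);
	/* The caller only names the zone; the device picks the location. */
	bio->bi_iter.bi_sector = zone_start;
	bio->bi_opf = REQ_OP_ZONE_APPEND;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = example_append_endio;
	submit_bio(bio);
}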

So the approach is: mandate zone append support for ZNS devices. To allow other
ZNS drives, an emulation similar to the SCSI one can be implemented, with that
emulation ideally shared so it works for both types of drives if possible. And
note that this emulation would require the drive to be operated with mq-deadline
(e.g. by writing "mq-deadline" to /sys/block/<dev>/queue/scheduler) to enable
zone write locking and preserve write command order. While on an HDD the
performance penalty is minimal, it will likely be significant on an SSD.

> 
>> Please submit patches if you want to have support for ZNS devices that
>> do not implement the Zone Append command. It is outside the scope
>> of this patchset.
> 
> That we will.
> 
> 
Damien Le Moal June 16, 2020, 12:37 p.m. UTC | #9
On 2020/06/16 21:35, Judy Brock wrote:
> >>> A namespace that does not support append is not supported by the driver.
> 
> I am not that knowledgeable about Linux kernel drivers, so maybe this is a
> dumb question, but won't the driver in question be the default kernel driver
> for ZNS devices? If so, why would that driver deliberately reject a device
> which is 100% compliant with the ZNS spec? That would seem to favor specific
> implementations, which seems like an inappropriate thing for a community
> driver to do unless there is an actual technical reason the driver is unable
> to function w/o append. Is there any such reason, and if so, what is it?
> Thanks, and sorry if I've misunderstood.

Judy,

please see my answer to Javier for the rationale behind this software design
decision.

> 
> 
> Judy
> 
> -----Original Message----- From: linux-nvme
> [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Javier González 
> Sent: Tuesday, June 16, 2020 5:00 AM To: Matias Bjørling Cc: Jens Axboe;
> Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry
> Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org;
> linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias
> Bjørling Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
> 
> On 16.06.2020 13:18, Matias Bjørling wrote:
>> On 16/06/2020 12.41, Javier González wrote:
>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined 
>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their 
>>>> Command Set Identifier reported in the namespaces Namespace 
>>>> Identification Descriptor list. A successfully discovered Zoned 
>>>> Namespace will be registered with the block layer as a host managed 
>>>> zoned block device with Zone Append command support. A namespace that 
>>>> does not support append is not supported by the driver.
>>> 
>>> Why are we enforcing the append command? Append is optional on the 
>>> current ZNS specification, so we should not make this mandatory in the 
>>> implementation. See specifics below.
> 
>> 
>> There is already general support in the kernel for the zone append command.
>> Feel free to submit patches to emulate the support. It is outside the scope
>> of this patchset.
>> 
> 
> It is fine that the kernel supports append, but the ZNS specification does
> not impose the implementation for append, so the driver should not do that
> either.
> 
> ZNS SSDs that choose to leave append as a non-implemented optional command
> should not rely on emulated SW support, specially when traditional writes
> work very fine for a large part of current ZNS use cases.
> 
> Please, remove this virtual constraint.
> 
>>> 
>>>> 
>>>> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com> Signed-off-by:
>>>> Dmitry Fomichev <dmitry.fomichev@wdc.com> Signed-off-by: Ajay Joshi
>>>> <ajay.joshi@wdc.com> Signed-off-by: Aravind Ramesh
>>>> <aravind.ramesh@wdc.com> Signed-off-by: Niklas Cassel
>>>> <niklas.cassel@wdc.com> Signed-off-by: Matias Bjørling
>>>> <matias.bjorling@wdc.com> Signed-off-by: Damien Le Moal
>>>> <damien.lemoal@wdc.com> Signed-off-by: Keith Busch
>>>> <keith.busch@wdc.com> --- drivers/nvme/host/Makefile |   1 + 
>>>> drivers/nvme/host/core.c   |  91 ++++++++++++-- 
>>>> drivers/nvme/host/nvme.h   |  39 ++++++ drivers/nvme/host/zns.c    |
>>>> 238 +++++++++++++++++++++++++++++++++++++ include/linux/nvme.h       |
>>>> 111 +++++++++++++++++ 5 files changed, 468 insertions(+), 12
>>>> deletions(-) create mode 100644 drivers/nvme/host/zns.c
>>>> 
>>>> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile 
>>>> index fc7b26be692d..d7f6a87687b8 100644 ---
>>>> a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -13,6
>>>> +13,7 @@ nvme-core-y                := core.o 
>>>> nvme-core-$(CONFIG_TRACING)        += trace.o 
>>>> nvme-core-$(CONFIG_NVME_MULTIPATH)    += multipath.o 
>>>> nvme-core-$(CONFIG_NVM)            += lightnvm.o 
>>>> +nvme-core-$(CONFIG_BLK_DEV_ZONED)    += zns.o 
>>>> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)    += fault_inject.o 
>>>> nvme-core-$(CONFIG_NVME_HWMON)        += hwmon.o
>>>> 
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index
>>>> 58f137b9f2c5..e961910da4ac 100644 --- a/drivers/nvme/host/core.c +++
>>>> b/drivers/nvme/host/core.c @@ -89,7 +89,7 @@ static dev_t
>>>> nvme_chr_devt; static struct class *nvme_class; static struct class
>>>> *nvme_subsys_class;
>>>> 
>>>> -static int nvme_revalidate_disk(struct gendisk *disk); +static int
>>>> _nvme_revalidate_disk(struct gendisk *disk); static void
>>>> nvme_put_subsystem(struct nvme_subsystem *subsys); static void
>>>> nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); 
>>>> @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req) 
>>>> nvme_retry_req(req); return; } +    } else if
>>>> (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && +           req_op(req) ==
>>>> REQ_OP_ZONE_APPEND) { +        req->__sector =
>>>> nvme_lba_to_sect(req->q->queuedata, +
>>>> le64_to_cpu(nvme_req(req)->result.u64)); }
>>>> 
>>>> nvme_trace_bio_complete(req, status); @@ -673,7 +677,8 @@ static inline
>>>> blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, }
>>>> 
>>>> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, -
>>>> struct request *req, struct nvme_command *cmnd) +        struct request
>>>> *req, struct nvme_command *cmnd, +        enum nvme_opcode op) { struct
>>>> nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; @@ -687,7 +692,7 @@ static
>>>> inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if
>>>> (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>>> 
>>>> -    cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : 
>>>> nvme_cmd_read); +    cmnd->rw.opcode = op; cmnd->rw.nsid =
>>>> cpu_to_le32(ns->head->ns_id); cmnd->rw.slba =
>>>> cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length =
>>>> cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -716,6 +721,8
>>>> @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case
>>>> NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | 
>>>> NVME_RW_PRINFO_PRCHK_REF; +            if (op == nvme_cmd_zone_append) 
>>>> +                control |= NVME_RW_APPEND_PIREMAP; cmnd->rw.reftag =
>>>> cpu_to_le32(t10_pi_ref_tag(req)); break; } @@ -756,6 +763,19 @@
>>>> blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 
>>>> case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); break; +    case
>>>> REQ_OP_ZONE_RESET_ALL: +    case REQ_OP_ZONE_RESET: +        ret =
>>>> nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); +
>>>> break; +    case REQ_OP_ZONE_OPEN: +        ret =
>>>> nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); +
>>>> break; +    case REQ_OP_ZONE_CLOSE: +        ret =
>>>> nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); +
>>>> break; +    case REQ_OP_ZONE_FINISH: +        ret =
>>>> nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); +
>>>> break; case REQ_OP_WRITE_ZEROES: ret = nvme_setup_write_zeroes(ns, req,
>>>> cmd); break; @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct
>>>> nvme_ns *ns, struct request *req, ret = nvme_setup_discard(ns, req,
>>>> cmd); break; case REQ_OP_READ: +        ret = nvme_setup_rw(ns, req,
>>>> cmd, nvme_cmd_read); +        break; case REQ_OP_WRITE: -        ret =
>>>> nvme_setup_rw(ns, req, cmd); +        ret = nvme_setup_rw(ns, req, cmd,
>>>> nvme_cmd_write); +        break; +    case REQ_OP_ZONE_APPEND: +
>>>> ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; 
>>>> default: WARN_ON_ONCE(1); @@ -1392,14 +1417,23 @@ static u32
>>>> nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return
>>>> effects; }
>>>> 
>>>> -static void nvme_update_formats(struct nvme_ctrl *ctrl) +static void
>>>> nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) { struct
>>>> nvme_ns *ns;
>>>> 
>>>> down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns,
>>>> &ctrl->namespaces, list) -        if (ns->disk &&
>>>> nvme_revalidate_disk(ns->disk)) +        if (ns->disk &&
>>>> _nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); +
>>>> else if (blk_queue_is_zoned(ns->disk->queue)) { +            /* +
>>>> * IO commands are required to fully revalidate a zoned +             *
>>>> device. Force the command effects to trigger rescan +             *
>>>> work so report zones can run in a context with +             * unfrozen
>>>> IO queues. +             */ +            *effects |=
>>>> NVME_CMD_EFFECTS_NCC; +        } up_read(&ctrl->namespaces_rwsem); }
>>>> 
>>>> @@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct nvme_ctrl
>>>> *ctrl, u32 effects) * this command. */ if (effects &
>>>> NVME_CMD_EFFECTS_LBCC) -        nvme_update_formats(ctrl); +
>>>> nvme_update_formats(ctrl, &effects); if (effects &
>>>> (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { 
>>>> nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); @@ -1526,7
>>>> +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct
>>>> nvme_ns *ns, * Issue ioctl requests on the first available path.  Note
>>>> that unlike normal * block layer requests we will not retry failed
>>>> request on another controller. */ -static struct nvme_ns
>>>> *nvme_get_ns_from_disk(struct gendisk *disk, +struct nvme_ns
>>>> *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head
>>>> **head, int *srcu_idx) { #ifdef CONFIG_NVME_MULTIPATH @@ -1546,7
>>>> +1580,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk
>>>> *disk, return disk->private_data; }
>>>> 
>>>> -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) 
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) { if
>>>> (head) srcu_read_unlock(&head->srcu, idx); @@ -1939,21 +1973,28 @@
>>>> static void nvme_update_disk_info(struct gendisk *disk,
>>>> 
>>>> static int __nvme_revalidate_disk(struct gendisk *disk, struct 
>>>> nvme_id_ns *id) { +    unsigned lbaf = id->flbas &
>>>> NVME_NS_FLBAS_LBA_MASK; struct nvme_ns *ns = disk->private_data; struct
>>>> nvme_ctrl *ctrl = ns->ctrl; +    int ret; u32 iob;
>>>> 
>>>> /* * If identify namespace failed, use default 512 byte block size so *
>>>> block layer can use before failing read/write for 0 capacity. */ -
>>>> ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; +
>>>> ns->lba_shift = id->lbaf[lbaf].ds; if (ns->lba_shift == 0) 
>>>> ns->lba_shift = 9;
>>>> 
>>>> switch (ns->head->ids.csi) { case NVME_CSI_NVM: break; +    case
>>>> NVME_CSI_ZNS: +        ret = nvme_update_zone_info(disk, ns, lbaf); +
>>>> if (ret) +            return ret; +        break; default: 
>>>> dev_warn(ctrl->device, "unknown csi:%d ns:%d\n", ns->head->ids.csi,
>>>> ns->head->ns_id); @@ -1967,7 +2008,7 @@ static int
>>>> __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) iob
>>>> = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>>>> 
>>>> ns->features = 0; -    ns->ms = le16_to_cpu(id->lbaf[id->flbas & 
>>>> NVME_NS_FLBAS_LBA_MASK].ms); +    ns->ms =
>>>> le16_to_cpu(id->lbaf[lbaf].ms); /* the PI implementation requires
>>>> metadata equal t10 pi tuple size */ if (ns->ms == sizeof(struct
>>>> t10_pi_tuple)) ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; @@ -2010,7
>>>> +2051,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk,
>>>> struct nvme_id_ns *id) return 0; }
>>>> 
>>>> -static int nvme_revalidate_disk(struct gendisk *disk) +static int
>>>> _nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns =
>>>> disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; @@ -2058,6
>>>> +2099,28 @@ static int nvme_revalidate_disk(struct gendisk *disk) 
>>>> return ret; }
>>>> 
>>>> +static int nvme_revalidate_disk(struct gendisk *disk) +{ +    int
>>>> ret; + +    ret = _nvme_revalidate_disk(disk); +    if (ret) +
>>>> return ret; + +#ifdef CONFIG_BLK_DEV_ZONED +    if
>>>> (blk_queue_is_zoned(disk->queue)) { +        struct nvme_ns *ns =
>>>> disk->private_data; +        struct nvme_ctrl *ctrl = ns->ctrl; + +
>>>> ret = blk_revalidate_disk_zones(disk, NULL); +        if (!ret) +
>>>> blk_queue_max_zone_append_sectors(disk->queue, +
>>>> ctrl->max_zone_append); +    } +#endif +    return ret; +} + static
>>>> char nvme_pr_type(enum pr_type type) { switch (type) { @@ -2188,6
>>>> +2251,7 @@ static const struct block_device_operations nvme_fops = { 
>>>> .release    = nvme_release, .getgeo        = nvme_getgeo, 
>>>> .revalidate_disk= nvme_revalidate_disk, +    .report_zones    =
>>>> nvme_report_zones, .pr_ops        = &nvme_pr_ops, };
>>>> 
>>>> @@ -2213,6 +2277,7 @@ const struct block_device_operations 
>>>> nvme_ns_head_ops = { .ioctl        = nvme_ioctl, .compat_ioctl    =
>>>> nvme_compat_ioctl, .getgeo        = nvme_getgeo, +    .report_zones
>>>> = nvme_report_zones, .pr_ops        = &nvme_pr_ops, }; #endif /*
>>>> CONFIG_NVME_MULTIPATH */ @@ -4439,6 +4504,8 @@ static inline void
>>>> _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_command) !=
>>>> 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 
>>>> NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns) !=
>>>> NVME_IDENTIFY_DATA_SIZE); +    BUILD_BUG_ON(sizeof(struct
>>>> nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); +
>>>> BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>> index 58428e3a590e..662f95fbd909 100644
>>>> --- a/drivers/nvme/host/nvme.h
>>>> +++ b/drivers/nvme/host/nvme.h
>>>> @@ -239,6 +239,9 @@ struct nvme_ctrl {
>>>>      u32 max_hw_sectors;
>>>>      u32 max_segments;
>>>>      u32 max_integrity_segments;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u32 max_zone_append;
>>>> +#endif
>>>>      u16 crdt[3];
>>>>      u16 oncs;
>>>>      u16 oacs;
>>>> @@ -403,6 +406,9 @@ struct nvme_ns {
>>>>      u16 sgs;
>>>>      u32 sws;
>>>>      u8 pi_type;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u64 zsze;
>>>> +#endif
>>>>      unsigned long features;
>>>>      unsigned long flags;
>>>> #define NVME_NS_REMOVING    0
>>>> @@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>>>>
>>>> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
>>>>          void *log, size_t size, u64 offset);
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +        struct nvme_ns_head **head, int *srcu_idx);
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>>>>
>>>> extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>>> extern const struct block_device_operations nvme_ns_head_ops;
>>>> @@ -689,6 +698,36 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>>> }
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>>
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf);
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data);
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>>>> +                       struct nvme_command *cmnd,
>>>> +                       enum nvme_zone_mgmt_action action);
>>>> +#else
>>>> +#define nvme_report_zones NULL
>>>> +
>>>> +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    return BLK_STS_NOTSUPP;
>>>> +}
>>>> +
>>>> +static inline int nvme_update_zone_info(struct gendisk *disk,
>>>> +                    struct nvme_ns *ns,
>>>> +                    unsigned lbaf)
>>>> +{
>>>> +    dev_warn(ns->ctrl->device,
>>>> +         "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
>>>> +    return -EPROTONOSUPPORT;
>>>> +}
>>>> +#endif
>>>> +
>>>> #ifdef CONFIG_NVM
>>>> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
>>>> void nvme_nvm_unregister(struct nvme_ns *ns);
>>>> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>>>> new file mode 100644
>>>> index 000000000000..c08f6281b614
>>>> --- /dev/null
>>>> +++ b/drivers/nvme/host/zns.c
>>>> @@ -0,0 +1,238 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>>> + */
>>>> +
>>>> +#include <linux/blkdev.h>
>>>> +#include <linux/vmalloc.h>
>>>> +#include "nvme.h"
>>>> +
>>>> +static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ctrl_zns *id;
>>>> +    int status;
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.cns = NVME_ID_CNS_CS_CTRL;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>>>> +    if (status) {
>>>> +        kfree(id);
>>>> +        return status;
>>>> +    }
>>>> +
>>>> +    ctrl->max_zone_append = 1 << (id->zamds + 3);
>>>> +    kfree(id);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf)
>>>> +{
>>>> +    struct nvme_effects_log *log = ns->head->effects;
>>>> +    struct request_queue *q = disk->queue;
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ns_zns *id;
>>>> +    int status;
>>>> +
>>>> +    /* Driver requires zone append support */
>>>> +    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>>>> +        return -ENODEV;
>>> 
>>> Following up on the initial comment, this check should go.
>> 
>> See first comment.
> 
> See above and please remove.
> 
>> 
>>> 
>>>> +
>>>> +    /* Lazily query controller append limit for the first zoned namespace */
>>>> +    if (!ns->ctrl->max_zone_append) {
>>>> +        status = nvme_set_max_append(ns->ctrl);
>>>> +        if (status)
>>>> +            return status;
>>>> +    }
>>> 
>>> This should only be applied if append is supported.
>> 
>> See first comment.
>> 
>>> 
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.identify.cns = NVME_ID_CNS_CS_NS;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
>>>> +    if (status)
>>>> +        goto free_data;
>>>> +
>>>> +    /*
>>>> +     * We currently do not handle devices requiring any of the zoned
>>>> +     * operation characteristics.
>>>> +     */
>>>> +    if (id->zoc) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>> 
>>> I understand that "Variable Zone Capacity" is not supported as it 
>>> requires major changes at this moment, but we should support controllers 
>>> that enable "Zone Active Excursions", even when the AER event is not 
>>> implemented in this patchset.
>> 
>> 
>> NAK. Similarly to VZC, this allows an unsuspecting user to have major data
>> loss when a zone is suddenly moved to Full.
> 
> I buy that.
> 
>> 
>> 
>>> 
>>>> +
>>>> +    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>>>> +    if (!ns->zsze) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>>> +
>>>> +    q->limits.zoned = BLK_ZONED_HM;
>>>> +    blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>>>> +free_data:
>>>> +    kfree(id);
>>>> +    return status;
>>>> +}
>>>> +
>>>> +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>>> +                      unsigned int nr_zones, size_t *buflen)
>>>> +{
>>>> +    struct request_queue *q = ns->disk->queue;
>>>> +    size_t bufsize;
>>>> +    void *buf;
>>>> +
>>>> +    const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>>>> +                   sizeof(struct nvme_zone_descriptor);
>>>> +
>>>> +    nr_zones = min_t(unsigned int, nr_zones,
>>>> +             get_capacity(ns->disk) >> ilog2(ns->zsze));
>>>> +
>>>> +    bufsize = sizeof(struct nvme_zone_report) +
>>>> +        nr_zones * sizeof(struct nvme_zone_descriptor);
>>>> +    bufsize = min_t(size_t, bufsize,
>>>> +            queue_max_hw_sectors(q) << SECTOR_SHIFT);
>>>> +    bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
>>>> +
>>>> +    while (bufsize >= min_bufsize) {
>>>> +        buf = __vmalloc(bufsize,
>>>> +                GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>>>> +        if (buf) {
>>>> +            *buflen = bufsize;
>>>> +            return buf;
>>>> +        }
>>>> +        bufsize >>= 1;
>>>> +    }
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +                  struct nvme_zone_report *report,
>>>> +                  size_t buflen)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    int ret;
>>>> +
>>>> +    c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>>>> +    c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>>>> +    c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>>>> +    c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>>>> +    c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>>> +    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>>> +
>>>> +    ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    return le64_to_cpu(report->nr_zones);
>>>> +}
>>>> +
>>>> +static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>>> +                 struct nvme_zone_descriptor *entry,
>>>> +                 unsigned int idx, report_zones_cb cb,
>>>> +                 void *data)
>>>> +{
>>>> +    struct blk_zone zone = { };
>>>> +
>>>> +    if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>>> +        dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>>>> +                entry->zt);
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>>>> +    zone.cond = entry->zs >> 4;
>>>> +    zone.len = ns->zsze;
>>>> +    zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>>> +    zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>>>> +    zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>>>> +
>>>> +    return cb(&zone, idx, data);
>>>> +}
>>>> +
>>>> +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +            unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_zone_report *report;
>>>> +    int ret, zone_idx = 0;
>>>> +    unsigned int nz, i;
>>>> +    size_t buflen;
>>>> +
>>>> +    report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>>>> +    if (!report)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    sector &= ~(ns->zsze - 1);
>>>> +    while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>>> +        memset(report, 0, buflen);
>>>> +        ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>>>> +        if (ret < 0)
>>>> +            goto out_free;
>>>> +
>>>> +        nz = min_t(unsigned int, ret, nr_zones);
>>>> +        if (!nz)
>>>> +            break;
>>>> +
>>>> +        for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>>>> +            ret = nvme_zone_parse_entry(ns, &report->entries[i],
>>>> +                            zone_idx, cb, data);
>>>> +            if (ret)
>>>> +                goto out_free;
>>>> +            zone_idx++;
>>>> +        }
>>>> +
>>>> +        sector += ns->zsze * nz;
>>>> +    }
>>>> +
>>>> +    ret = zone_idx;
>>>> +out_free:
>>>> +    kvfree(report);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_ns_head *head = NULL;
>>>> +    struct nvme_ns *ns;
>>>> +    int srcu_idx, ret;
>>>> +
>>>> +    ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>>>> +    if (unlikely(!ns))
>>>> +        return -EWOULDBLOCK;
>>>> +
>>>> +    if (ns->head->ids.csi == NVME_CSI_ZNS)
>>>> +        ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>>>> +    else
>>>> +        ret = -EINVAL;
>>>> +    nvme_put_ns_from_disk(head, srcu_idx);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>>>> +        struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    c->zms.opcode = nvme_cmd_zone_mgmt_send;
>>>> +    c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>> +    c->zms.action = action;
>>>> +
>>>> +    if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>>> +        c->zms.select = 1;
>>>> +
>>>> +    return BLK_STS_OK;
>>>> +}
>>>> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>>>> index ea25da572eed..7b3fa7de07bd 100644
>>>> --- a/include/linux/nvme.h
>>>> +++ b/include/linux/nvme.h
>>>> @@ -374,6 +374,30 @@ struct nvme_id_ns {
>>>>      __u8            vs[3712];
>>>> };
>>>>
>>>> +struct nvme_zns_lbafe {
>>>> +    __le64            zsze;
>>>> +    __u8            zdes;
>>>> +    __u8            rsvd9[7];
>>>> +};
>>>> +
>>>> +struct nvme_id_ns_zns {
>>>> +    __le16            zoc;
>>>> +    __le16            ozcs;
>>>> +    __le32            mar;
>>>> +    __le32            mor;
>>>> +    __le32            rrl;
>>>> +    __le32            frl;
>>>> +    __u8            rsvd20[2796];
>>>> +    struct nvme_zns_lbafe    lbafe[16];
>>>> +    __u8            rsvd3072[768];
>>>> +    __u8            vs[256];
>>>> +};
>>>> +
>>>> +struct nvme_id_ctrl_zns {
>>>> +    __u8    zamds;
>>>> +    __u8    rsvd1[4095];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_ID_CNS_NS            = 0x00,
>>>>      NVME_ID_CNS_CTRL        = 0x01,
>>>> @@ -392,6 +416,7 @@ enum {
>>>>
>>>> enum {
>>>>      NVME_CSI_NVM            = 0,
>>>> +    NVME_CSI_ZNS            = 2,
>>>> };
>>>>
>>>> enum {
>>>> @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
>>>>      __le16    rsvd10[3];
>>>> };
>>>>
>>>> +struct nvme_zone_descriptor {
>>>> +    __u8        zt;
>>>> +    __u8        zs;
>>>> +    __u8        za;
>>>> +    __u8        rsvd3[5];
>>>> +    __le64        zcap;
>>>> +    __le64        zslba;
>>>> +    __le64        wp;
>>>> +    __u8        rsvd32[32];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZONE_TYPE_SEQWRITE_REQ    = 0x2,
>>>> +};
>>>> +
>>>> +struct nvme_zone_report {
>>>> +    __le64        nr_zones;
>>>> +    __u8        resv8[56];
>>>> +    struct nvme_zone_descriptor entries[];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_SMART_CRIT_SPARE        = 1 << 0,
>>>>      NVME_SMART_CRIT_TEMPERATURE    = 1 << 1,
>>>> @@ -626,6 +672,9 @@ enum nvme_opcode {
>>>>      nvme_cmd_resv_report    = 0x0e,
>>>>      nvme_cmd_resv_acquire    = 0x11,
>>>>      nvme_cmd_resv_release    = 0x15,
>>>> +    nvme_cmd_zone_mgmt_send    = 0x79,
>>>> +    nvme_cmd_zone_mgmt_recv    = 0x7a,
>>>> +    nvme_cmd_zone_append    = 0x7d,
>>>> };
>>>>
>>>> #define nvme_opcode_name(opcode)    { opcode, #opcode }
>>>> @@ -764,6 +813,7 @@ struct nvme_rw_command {
>>>> enum {
>>>>      NVME_RW_LR            = 1 << 15,
>>>>      NVME_RW_FUA            = 1 << 14,
>>>> +    NVME_RW_APPEND_PIREMAP        = 1 << 9,
>>>>      NVME_RW_DSM_FREQ_UNSPEC        = 0,
>>>>      NVME_RW_DSM_FREQ_TYPICAL    = 1,
>>>>      NVME_RW_DSM_FREQ_RARE        = 2,
>>>> @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
>>>>      __le16            appmask;
>>>> };
>>>>
>>>> +enum nvme_zone_mgmt_action {
>>>> +    NVME_ZONE_CLOSE        = 0x1,
>>>> +    NVME_ZONE_FINISH    = 0x2,
>>>> +    NVME_ZONE_OPEN        = 0x3,
>>>> +    NVME_ZONE_RESET        = 0x4,
>>>> +    NVME_ZONE_OFFLINE    = 0x5,
>>>> +    NVME_ZONE_SET_DESC_EXT    = 0x10,
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_send_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le32            cdw2[2];
>>>> +    __le64            metadata;
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            cdw12;
>>>> +    __u8            action;
>>> 
>>> Why not zsa to make it easier to match to the spec
>>> 
>>> 
>>>> +    __u8            select;
>>> 
>>> sel_all?
>>> 
>>>> +    __u8            rsvd13[2];
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_recv_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le64            rsvd2[2];
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            numd;
>>>> +    __u8            zra;
>>>> +    __u8            zrasf;
>>>> +    __u8            pr;
>>> 
>>> Partial Report is just one bit in the "Zone Receive Action Specific 
>>> Features". What about zrasfe?
>> 
>> There are currently no users of pr, and bits 1-7 are reserved in the spec.
>> Users of the pr variable should shift and mask as necessary.
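>> For example, a caller checking the Partial Report bit (an illustrative
>> line, not code in this patch) would mask bit 0:
>>
>>     partial = c.zmr.pr & NVME_REPORT_ZONE_PARTIAL;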
>> 
>> zrasf looks good to me. It is defined as a byte in the spec.
> 
> I meant for the pr variable name. Agree with the rest.
> 
>> 
>>> 
>>>> +    __u8            rsvd13;
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZRA_ZONE_REPORT        = 0,
>>>> +    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>>>> +    NVME_REPORT_ZONE_PARTIAL    = 1,
>>>> +};
>>>> +
>>>> /* Features */
>>>>
>>>> enum {
>>>> @@ -1300,6 +1397,8 @@ struct nvme_command {
>>>>          struct nvme_format_cmd format;
>>>>          struct nvme_dsm_cmd dsm;
>>>>          struct nvme_write_zeroes_cmd write_zeroes;
>>>> +        struct nvme_zone_mgmt_send_cmd zms;
>>>> +        struct nvme_zone_mgmt_recv_cmd zmr;
>>>>          struct nvme_abort_cmd abort;
>>>>          struct nvme_get_log_page_command get_log_page;
>>>>          struct nvmf_common_command fabrics;
>>>> @@ -1433,6 +1532,18 @@ enum {
>>>>      NVME_SC_DISCOVERY_RESTART    = 0x190,
>>>>      NVME_SC_AUTH_REQUIRED        = 0x191,
>>>>
>>>> +    /*
>>>> +     * I/O Command Set Specific - Zoned commands:
>>>> +     */
>>>> +    NVME_SC_ZONE_BOUNDARY_ERROR    = 0x1b8,
>>>> +    NVME_SC_ZONE_FULL        = 0x1b9,
>>>> +    NVME_SC_ZONE_READ_ONLY        = 0x1ba,
>>>> +    NVME_SC_ZONE_OFFLINE        = 0x1bb,
>>>> +    NVME_SC_ZONE_INVALID_WRITE    = 0x1bc,
>>>> +    NVME_SC_ZONE_TOO_MANY_ACTIVE    = 0x1bd,
>>>> +    NVME_SC_ZONE_TOO_MANY_OPEN    = 0x1be,
>>>> +    NVME_SC_ZONE_INVALID_TRANSITION    = 0x1bf,
>>>> +
>>>>      /*
>>>>       * Media and Data Integrity Errors:
>>>>       */
>>>> -- 
>>>> 2.24.1
>>>>
>> 
> 
> _______________________________________________
> linux-nvme mailing list
> linux-nvme@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
> 
>
Matias Bjorling June 16, 2020, 12:37 p.m. UTC | #10
On 16/06/2020 14.35, Judy Brock wrote:
> 	>>> A namespace that does not support append is not supported by the driver.
>
> I am not that knowledgeable about Linux kernel drivers so maybe this is a dumb question but won't the driver in question be the default kernel driver for ZNS devices? If so, why would that driver deliberately reject a device which is 100% compliant with the ZNS spec?
> That would seem to favor specific implementations which seems like an inappropriate thing for a community driver to do unless there is an actual technical reason the driver is unable to function w/o append. Is there any such reason and if so what is it? Thanks and sorry if I've misunderstood.

Hi Judy,

This has been solved. Javier has said he will send patches that support
the above use-case. It is outside the scope of this patchset.

Best, Matias

> Judy
>
> -----Original Message-----
> From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Javier González
> Sent: Tuesday, June 16, 2020 5:00 AM
> To: Matias Bjørling
> Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
>
> On 16.06.2020 13:18, Matias Bjørling wrote:
>> On 16/06/2020 12.41, Javier González wrote:
>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>> Command Set Identifier reported in the namespaces Namespace
>>>> Identification Descriptor list. A successfully discovered Zoned
>>>> Namespace will be registered with the block layer as a host managed
>>>> zoned block device with Zone Append command support. A namespace that
>>>> does not support append is not supported by the driver.
>>> Why are we enforcing the append command? Append is optional on the
>>> current ZNS specification, so we should not make this mandatory in the
>>> implementation. See specifics below.
>> There is already general support in the kernel for the zone append
>> command. Feel free to submit patches to emulate the support. It is
>> outside the scope of this patchset.
>>
> It is fine that the kernel supports append, but the ZNS specification
> does not impose the implementation for append, so the driver should not
> do that either.
>
> ZNS SSDs that choose to leave append as a non-implemented optional
> command should not rely on emulated SW support, especially when
> traditional writes work very fine for a large part of current ZNS use
> cases.
>
> Please, remove this virtual constraint.
>
>>>> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
>>>> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
>>>> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
>>>> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
>>>> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
>>>> Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
>>>> Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
>>>> Signed-off-by: Keith Busch <keith.busch@wdc.com>
>>>> ---
>>>> drivers/nvme/host/Makefile |   1 +
>>>> drivers/nvme/host/core.c   |  91 ++++++++++++--
>>>> drivers/nvme/host/nvme.h   |  39 ++++++
>>>> drivers/nvme/host/zns.c    | 238 +++++++++++++++++++++++++++++++++++++
>>>> include/linux/nvme.h       | 111 +++++++++++++++++
>>>> 5 files changed, 468 insertions(+), 12 deletions(-)
>>>> create mode 100644 drivers/nvme/host/zns.c
>>>>
>>>> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>>>> index fc7b26be692d..d7f6a87687b8 100644
>>>> --- a/drivers/nvme/host/Makefile
>>>> +++ b/drivers/nvme/host/Makefile
>>>> @@ -13,6 +13,7 @@ nvme-core-y                := core.o
>>>> nvme-core-$(CONFIG_TRACING)        += trace.o
>>>> nvme-core-$(CONFIG_NVME_MULTIPATH)    += multipath.o
>>>> nvme-core-$(CONFIG_NVM)            += lightnvm.o
>>>> +nvme-core-$(CONFIG_BLK_DEV_ZONED)    += zns.o
>>>> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)    += fault_inject.o
>>>> nvme-core-$(CONFIG_NVME_HWMON)        += hwmon.o
>>>>
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index 58f137b9f2c5..e961910da4ac 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
>>>> static struct class *nvme_class;
>>>> static struct class *nvme_subsys_class;
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk);
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk);
>>>> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
>>>> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>>>>                         unsigned nsid);
>>>> @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
>>>>              nvme_retry_req(req);
>>>>              return;
>>>>          }
>>>> +    } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>>>> +           req_op(req) == REQ_OP_ZONE_APPEND) {
>>>> +        req->__sector = nvme_lba_to_sect(req->q->queuedata,
>>>> +            le64_to_cpu(nvme_req(req)->result.u64));
>>>>      }
>>>>
>>>>      nvme_trace_bio_complete(req, status);
>>>> @@ -673,7 +677,8 @@ static inline blk_status_t
>>>> nvme_setup_write_zeroes(struct nvme_ns *ns,
>>>> }
>>>>
>>>> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>>> -        struct request *req, struct nvme_command *cmnd)
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_opcode op)
>>>> {
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>>      u16 control = 0;
>>>> @@ -687,7 +692,7 @@ static inline blk_status_t
>>>> nvme_setup_rw(struct nvme_ns *ns,
>>>>      if (req->cmd_flags & REQ_RAHEAD)
>>>>          dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>>>
>>>> -    cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write :
>>>> nvme_cmd_read);
>>>> +    cmnd->rw.opcode = op;
>>>>      cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
>>>>      cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>>      cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >>
>>>> ns->lba_shift) - 1);
>>>> @@ -716,6 +721,8 @@ static inline blk_status_t
>>>> nvme_setup_rw(struct nvme_ns *ns,
>>>>          case NVME_NS_DPS_PI_TYPE2:
>>>>              control |= NVME_RW_PRINFO_PRCHK_GUARD |
>>>>                      NVME_RW_PRINFO_PRCHK_REF;
>>>> +            if (op == nvme_cmd_zone_append)
>>>> +                control |= NVME_RW_APPEND_PIREMAP;
>>>>              cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
>>>>              break;
>>>>          }
>>>> @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns
>>>> *ns, struct request *req,
>>>>      case REQ_OP_FLUSH:
>>>>          nvme_setup_flush(ns, cmd);
>>>>          break;
>>>> +    case REQ_OP_ZONE_RESET_ALL:
>>>> +    case REQ_OP_ZONE_RESET:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_OPEN:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_CLOSE:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_FINISH:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd,
>>>> NVME_ZONE_FINISH);
>>>> +        break;
>>>>      case REQ_OP_WRITE_ZEROES:
>>>>          ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>          break;
>>>> @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns
>>>> *ns, struct request *req,
>>>>          ret = nvme_setup_discard(ns, req, cmd);
>>>>          break;
>>>>      case REQ_OP_READ:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>>>> +        break;
>>>>      case REQ_OP_WRITE:
>>>> -        ret = nvme_setup_rw(ns, req, cmd);
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_APPEND:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
>>>>          break;
>>>>      default:
>>>>          WARN_ON_ONCE(1);
>>>> @@ -1392,14 +1417,23 @@ static u32 nvme_passthru_start(struct
>>>> nvme_ctrl *ctrl, struct nvme_ns *ns,
>>>>      return effects;
>>>> }
>>>>
>>>> -static void nvme_update_formats(struct nvme_ctrl *ctrl)
>>>> +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
>>>> {
>>>>      struct nvme_ns *ns;
>>>>
>>>>      down_read(&ctrl->namespaces_rwsem);
>>>>      list_for_each_entry(ns, &ctrl->namespaces, list)
>>>> -        if (ns->disk && nvme_revalidate_disk(ns->disk))
>>>> +        if (ns->disk && _nvme_revalidate_disk(ns->disk))
>>>>              nvme_set_queue_dying(ns);
>>>> +        else if (blk_queue_is_zoned(ns->disk->queue)) {
>>>> +            /*
>>>> +             * IO commands are required to fully revalidate a zoned
>>>> +             * device. Force the command effects to trigger rescan
>>>> +             * work so report zones can run in a context with
>>>> +             * unfrozen IO queues.
>>>> +             */
>>>> +            *effects |= NVME_CMD_EFFECTS_NCC;
>>>> +        }
>>>>      up_read(&ctrl->namespaces_rwsem);
>>>> }
>>>>
>>>> @@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct
>>>> nvme_ctrl *ctrl, u32 effects)
>>>>       * this command.
>>>>       */
>>>>      if (effects & NVME_CMD_EFFECTS_LBCC)
>>>> -        nvme_update_formats(ctrl);
>>>> +        nvme_update_formats(ctrl, &effects);
>>>>      if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
>>>>          nvme_unfreeze(ctrl);
>>>>          nvme_mpath_unfreeze(ctrl->subsys);
>>>> @@ -1526,7 +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl
>>>> *ctrl, struct nvme_ns *ns,
>>>>   * Issue ioctl requests on the first available path.  Note that
>>>> unlike normal
>>>>   * block layer requests we will not retry failed request on
>>>> another controller.
>>>>   */
>>>> -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>          struct nvme_ns_head **head, int *srcu_idx)
>>>> {
>>>> #ifdef CONFIG_NVME_MULTIPATH
>>>> @@ -1546,7 +1580,7 @@ static struct nvme_ns
>>>> *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>      return disk->private_data;
>>>> }
>>>>
>>>> -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> {
>>>>      if (head)
>>>>          srcu_read_unlock(&head->srcu, idx);
>>>> @@ -1939,21 +1973,28 @@ static void nvme_update_disk_info(struct
>>>> gendisk *disk,
>>>>
>>>> static int __nvme_revalidate_disk(struct gendisk *disk, struct
>>>> nvme_id_ns *id)
>>>> {
>>>> +    unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
>>>>      struct nvme_ns *ns = disk->private_data;
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +    int ret;
>>>>      u32 iob;
>>>>
>>>>      /*
>>>>       * If identify namespace failed, use default 512 byte block size so
>>>>       * block layer can use before failing read/write for 0 capacity.
>>>>       */
>>>> -    ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>>>> +    ns->lba_shift = id->lbaf[lbaf].ds;
>>>>      if (ns->lba_shift == 0)
>>>>          ns->lba_shift = 9;
>>>>
>>>>      switch (ns->head->ids.csi) {
>>>>      case NVME_CSI_NVM:
>>>>          break;
>>>> +    case NVME_CSI_ZNS:
>>>> +        ret = nvme_update_zone_info(disk, ns, lbaf);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +        break;
>>>>      default:
>>>>          dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
>>>>              ns->head->ids.csi, ns->head->ns_id);
>>>> @@ -1967,7 +2008,7 @@ static int __nvme_revalidate_disk(struct
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>          iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>>>>
>>>>      ns->features = 0;
>>>> -    ns->ms = le16_to_cpu(id->lbaf[id->flbas &
>>>> NVME_NS_FLBAS_LBA_MASK].ms);
>>>> +    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
>>>>      /* the PI implementation requires metadata equal t10 pi tuple
>>>> size */
>>>>      if (ns->ms == sizeof(struct t10_pi_tuple))
>>>>          ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>>>> @@ -2010,7 +2051,7 @@ static int __nvme_revalidate_disk(struct
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>      return 0;
>>>> }
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk)
>>>> {
>>>>      struct nvme_ns *ns = disk->private_data;
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>> @@ -2058,6 +2099,28 @@ static int nvme_revalidate_disk(struct
>>>> gendisk *disk)
>>>>      return ret;
>>>> }
>>>>
>>>> +static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    ret = _nvme_revalidate_disk(disk);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    if (blk_queue_is_zoned(disk->queue)) {
>>>> +        struct nvme_ns *ns = disk->private_data;
>>>> +        struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +
>>>> +        ret = blk_revalidate_disk_zones(disk, NULL);
>>>> +        if (!ret)
>>>> +            blk_queue_max_zone_append_sectors(disk->queue,
>>>> +                              ctrl->max_zone_append);
>>>> +    }
>>>> +#endif
>>>> +    return ret;
>>>> +}
>>>> +
>>>> static char nvme_pr_type(enum pr_type type)
>>>> {
>>>>      switch (type) {
>>>> @@ -2188,6 +2251,7 @@ static const struct block_device_operations
>>>> nvme_fops = {
>>>>      .release    = nvme_release,
>>>>      .getgeo        = nvme_getgeo,
>>>>      .revalidate_disk= nvme_revalidate_disk,
>>>> +    .report_zones    = nvme_report_zones,
>>>>      .pr_ops        = &nvme_pr_ops,
>>>> };
>>>>
>>>> @@ -2213,6 +2277,7 @@ const struct block_device_operations
>>>> nvme_ns_head_ops = {
>>>>      .ioctl        = nvme_ioctl,
>>>>      .compat_ioctl    = nvme_compat_ioctl,
>>>>      .getgeo        = nvme_getgeo,
>>>> +    .report_zones    = nvme_report_zones,
>>>>      .pr_ops        = &nvme_pr_ops,
>>>> };
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>> @@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
>>>>      BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>> index 58428e3a590e..662f95fbd909 100644
>>>> --- a/drivers/nvme/host/nvme.h
>>>> +++ b/drivers/nvme/host/nvme.h
>>>> @@ -239,6 +239,9 @@ struct nvme_ctrl {
>>>>      u32 max_hw_sectors;
>>>>      u32 max_segments;
>>>>      u32 max_integrity_segments;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u32 max_zone_append;
>>>> +#endif
>>>>      u16 crdt[3];
>>>>      u16 oncs;
>>>>      u16 oacs;
>>>> @@ -403,6 +406,9 @@ struct nvme_ns {
>>>>      u16 sgs;
>>>>      u32 sws;
>>>>      u8 pi_type;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u64 zsze;
>>>> +#endif
>>>>      unsigned long features;
>>>>      unsigned long flags;
>>>> #define NVME_NS_REMOVING    0
>>>> @@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>>>>
>>>> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8
>>>> lsp, u8 csi,
>>>>          void *log, size_t size, u64 offset);
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +        struct nvme_ns_head **head, int *srcu_idx);
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>>>>
>>>> extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>>> extern const struct block_device_operations nvme_ns_head_ops;
>>>> @@ -689,6 +698,36 @@ static inline void
>>>> nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>>> }
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>>
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf);
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data);
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct
>>>> request *req,
>>>> +                       struct nvme_command *cmnd,
>>>> +                       enum nvme_zone_mgmt_action action);
>>>> +#else
>>>> +#define nvme_report_zones NULL
>>>> +
>>>> +static inline blk_status_t nvme_setup_zone_mgmt_send(struct
>>>> nvme_ns *ns,
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    return BLK_STS_NOTSUPP;
>>>> +}
>>>> +
>>>> +static inline int nvme_update_zone_info(struct gendisk *disk,
>>>> +                    struct nvme_ns *ns,
>>>> +                    unsigned lbaf)
>>>> +{
>>>> +    dev_warn(ns->ctrl->device,
>>>> +         "Please enable CONFIG_BLK_DEV_ZONED to support ZNS
>>>> devices\n");
>>>> +    return -EPROTONOSUPPORT;
>>>> +}
>>>> +#endif
>>>> +
>>>> #ifdef CONFIG_NVM
>>>> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
>>>> void nvme_nvm_unregister(struct nvme_ns *ns);
>>>> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>>>> new file mode 100644
>>>> index 000000000000..c08f6281b614
>>>> --- /dev/null
>>>> +++ b/drivers/nvme/host/zns.c
>>>> @@ -0,0 +1,238 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>>> + */
>>>> +
>>>> +#include <linux/blkdev.h>
>>>> +#include <linux/vmalloc.h>
>>>> +#include "nvme.h"
>>>> +
>>>> +static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ctrl_zns *id;
>>>> +    int status;
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.cns = NVME_ID_CNS_CS_CTRL;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>>>> +    if (status) {
>>>> +        kfree(id);
>>>> +        return status;
>>>> +    }
>>>> +
>>>> +    ctrl->max_zone_append = 1 << (id->zamds + 3);
>>>> +    kfree(id);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf)
>>>> +{
>>>> +    struct nvme_effects_log *log = ns->head->effects;
>>>> +    struct request_queue *q = disk->queue;
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ns_zns *id;
>>>> +    int status;
>>>> +
>>>> +    /* Driver requires zone append support */
>>>> +    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>>>> +        return -ENODEV;
>>> Following up on the initial comment, this check should go.
>> See first comment.
> See above and please remove.
>
>>>> +
>>>> +    /* Lazily query controller append limit for the first zoned
>>>> namespace */
>>>> +    if (!ns->ctrl->max_zone_append) {
>>>> +        status = nvme_set_max_append(ns->ctrl);
>>>> +        if (status)
>>>> +            return status;
>>>> +    }
>>> This should only be applied if append is supported.
>> See first comment.
>>
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.identify.cns = NVME_ID_CNS_CS_NS;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id,
>>>> sizeof(*id));
>>>> +    if (status)
>>>> +        goto free_data;
>>>> +
>>>> +    /*
>>>> +     * We currently do not handle devices requiring any of the zoned
>>>> +     * operation characteristics.
>>>> +     */
>>>> +    if (id->zoc) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>> I understand that "Variable Zone Capacity" is not supported as it
>>> requires major changes at this moment, but we should support controllers
>>> that enable "Zone Active Excursions", even when the AER event is not
>>> implemented in this patchset.
>>
>> NAK. Similarly to VZC, this allows an unsuspecting user to have major
>> data loss when a zone is suddenly moved to Full.
> I buy that.
>
>>
>>>> +
>>>> +    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>>>> +    if (!ns->zsze) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>>> +
>>>> +    q->limits.zoned = BLK_ZONED_HM;
>>>> +    blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>>>> +free_data:
>>>> +    kfree(id);
>>>> +    return status;
>>>> +}
>>>> +
>>>> +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>>> +                      unsigned int nr_zones, size_t *buflen)
>>>> +{
>>>> +    struct request_queue *q = ns->disk->queue;
>>>> +    size_t bufsize;
>>>> +    void *buf;
>>>> +
>>>> +    const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>>>> +                   sizeof(struct nvme_zone_descriptor);
>>>> +
>>>> +    nr_zones = min_t(unsigned int, nr_zones,
>>>> +             get_capacity(ns->disk) >> ilog2(ns->zsze));
>>>> +
>>>> +    bufsize = sizeof(struct nvme_zone_report) +
>>>> +        nr_zones * sizeof(struct nvme_zone_descriptor);
>>>> +    bufsize = min_t(size_t, bufsize,
>>>> +            queue_max_hw_sectors(q) << SECTOR_SHIFT);
>>>> +    bufsize = min_t(size_t, bufsize, queue_max_segments(q) <<
>>>> PAGE_SHIFT);
>>>> +
>>>> +    while (bufsize >= min_bufsize) {
>>>> +        buf = __vmalloc(bufsize,
>>>> +                GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>>>> +        if (buf) {
>>>> +            *buflen = bufsize;
>>>> +            return buf;
>>>> +        }
>>>> +        bufsize >>= 1;
>>>> +    }
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +                  struct nvme_zone_report *report,
>>>> +                  size_t buflen)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    int ret;
>>>> +
>>>> +    c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>>>> +    c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>>>> +    c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>>>> +    c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>>>> +    c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>>> +    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>>> +
>>>> +    ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    return le64_to_cpu(report->nr_zones);
>>>> +}
>>>> +
>>>> +static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>>> +                 struct nvme_zone_descriptor *entry,
>>>> +                 unsigned int idx, report_zones_cb cb,
>>>> +                 void *data)
>>>> +{
>>>> +    struct blk_zone zone = { };
>>>> +
>>>> +    if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>>> +        dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>>>> +                entry->zt);
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>>>> +    zone.cond = entry->zs >> 4;
>>>> +    zone.len = ns->zsze;
>>>> +    zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>>> +    zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>>>> +    zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>>>> +
>>>> +    return cb(&zone, idx, data);
>>>> +}
>>>> +
>>>> +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +            unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_zone_report *report;
>>>> +    int ret, zone_idx = 0;
>>>> +    unsigned int nz, i;
>>>> +    size_t buflen;
>>>> +
>>>> +    report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>>>> +    if (!report)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    sector &= ~(ns->zsze - 1);
>>>> +    while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>>> +        memset(report, 0, buflen);
>>>> +        ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>>>> +        if (ret < 0)
>>>> +            goto out_free;
>>>> +
>>>> +        nz = min_t(unsigned int, ret, nr_zones);
>>>> +        if (!nz)
>>>> +            break;
>>>> +
>>>> +        for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>>>> +            ret = nvme_zone_parse_entry(ns, &report->entries[i],
>>>> +                            zone_idx, cb, data);
>>>> +            if (ret)
>>>> +                goto out_free;
>>>> +            zone_idx++;
>>>> +        }
>>>> +
>>>> +        sector += ns->zsze * nz;
>>>> +    }
>>>> +
>>>> +    ret = zone_idx;
>>>> +out_free:
>>>> +    kvfree(report);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_ns_head *head = NULL;
>>>> +    struct nvme_ns *ns;
>>>> +    int srcu_idx, ret;
>>>> +
>>>> +    ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>>>> +    if (unlikely(!ns))
>>>> +        return -EWOULDBLOCK;
>>>> +
>>>> +    if (ns->head->ids.csi == NVME_CSI_ZNS)
>>>> +        ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>>>> +    else
>>>> +        ret = -EINVAL;
>>>> +    nvme_put_ns_from_disk(head, srcu_idx);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct
>>>> request *req,
>>>> +        struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    c->zms.opcode = nvme_cmd_zone_mgmt_send;
>>>> +    c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>> +    c->zms.action = action;
>>>> +
>>>> +    if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>>> +        c->zms.select = 1;
>>>> +
>>>> +    return BLK_STS_OK;
>>>> +}
>>>> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>>>> index ea25da572eed..7b3fa7de07bd 100644
>>>> --- a/include/linux/nvme.h
>>>> +++ b/include/linux/nvme.h
>>>> @@ -374,6 +374,30 @@ struct nvme_id_ns {
>>>>      __u8            vs[3712];
>>>> };
>>>>
>>>> +struct nvme_zns_lbafe {
>>>> +    __le64            zsze;
>>>> +    __u8            zdes;
>>>> +    __u8            rsvd9[7];
>>>> +};
>>>> +
>>>> +struct nvme_id_ns_zns {
>>>> +    __le16            zoc;
>>>> +    __le16            ozcs;
>>>> +    __le32            mar;
>>>> +    __le32            mor;
>>>> +    __le32            rrl;
>>>> +    __le32            frl;
>>>> +    __u8            rsvd20[2796];
>>>> +    struct nvme_zns_lbafe    lbafe[16];
>>>> +    __u8            rsvd3072[768];
>>>> +    __u8            vs[256];
>>>> +};
>>>> +
>>>> +struct nvme_id_ctrl_zns {
>>>> +    __u8    zamds;
>>>> +    __u8    rsvd1[4095];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_ID_CNS_NS            = 0x00,
>>>>      NVME_ID_CNS_CTRL        = 0x01,
>>>> @@ -392,6 +416,7 @@ enum {
>>>>
>>>> enum {
>>>>      NVME_CSI_NVM            = 0,
>>>> +    NVME_CSI_ZNS            = 2,
>>>> };
>>>>
>>>> enum {
>>>> @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
>>>>      __le16    rsvd10[3];
>>>> };
>>>>
>>>> +struct nvme_zone_descriptor {
>>>> +    __u8        zt;
>>>> +    __u8        zs;
>>>> +    __u8        za;
>>>> +    __u8        rsvd3[5];
>>>> +    __le64        zcap;
>>>> +    __le64        zslba;
>>>> +    __le64        wp;
>>>> +    __u8        rsvd32[32];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZONE_TYPE_SEQWRITE_REQ    = 0x2,
>>>> +};
>>>> +
>>>> +struct nvme_zone_report {
>>>> +    __le64        nr_zones;
>>>> +    __u8        resv8[56];
>>>> +    struct nvme_zone_descriptor entries[];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_SMART_CRIT_SPARE        = 1 << 0,
>>>>      NVME_SMART_CRIT_TEMPERATURE    = 1 << 1,
>>>> @@ -626,6 +672,9 @@ enum nvme_opcode {
>>>>      nvme_cmd_resv_report    = 0x0e,
>>>>      nvme_cmd_resv_acquire    = 0x11,
>>>>      nvme_cmd_resv_release    = 0x15,
>>>> +    nvme_cmd_zone_mgmt_send    = 0x79,
>>>> +    nvme_cmd_zone_mgmt_recv    = 0x7a,
>>>> +    nvme_cmd_zone_append    = 0x7d,
>>>> };
>>>>
>>>> #define nvme_opcode_name(opcode)    { opcode, #opcode }
>>>> @@ -764,6 +813,7 @@ struct nvme_rw_command {
>>>> enum {
>>>>      NVME_RW_LR            = 1 << 15,
>>>>      NVME_RW_FUA            = 1 << 14,
>>>> +    NVME_RW_APPEND_PIREMAP        = 1 << 9,
>>>>      NVME_RW_DSM_FREQ_UNSPEC        = 0,
>>>>      NVME_RW_DSM_FREQ_TYPICAL    = 1,
>>>>      NVME_RW_DSM_FREQ_RARE        = 2,
>>>> @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
>>>>      __le16            appmask;
>>>> };
>>>>
>>>> +enum nvme_zone_mgmt_action {
>>>> +    NVME_ZONE_CLOSE        = 0x1,
>>>> +    NVME_ZONE_FINISH    = 0x2,
>>>> +    NVME_ZONE_OPEN        = 0x3,
>>>> +    NVME_ZONE_RESET        = 0x4,
>>>> +    NVME_ZONE_OFFLINE    = 0x5,
>>>> +    NVME_ZONE_SET_DESC_EXT    = 0x10,
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_send_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le32            cdw2[2];
>>>> +    __le64            metadata;
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            cdw12;
>>>> +    __u8            action;
>>> Why not zsa to make it easier to match to the spec
>>>
>>>
>>>> +    __u8            select;
>>> sel_all?
>>>
>>>> +    __u8            rsvd13[2];
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_recv_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le64            rsvd2[2];
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            numd;
>>>> +    __u8            zra;
>>>> +    __u8            zrasf;
>>>> +    __u8            pr;
>>> Partial Report is just one bit in the "Zone Receive Action Specific
>>> Features". What about zrasfe?
>> There are currently no users of pr, and bits 1-7 are reserved in the spec.
>> Users of the pr variable should shift and mask as necessary.
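>> For example, a caller checking the Partial Report bit (an illustrative
>> line, not code in this patch) would mask bit 0:
>>
>>     partial = c.zmr.pr & NVME_REPORT_ZONE_PARTIAL;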
>>
>> zrasf looks good to me. It is defined as a byte in the spec.
> I meant for the pr variable name. Agree with the rest.
>
>>>> +    __u8            rsvd13;
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZRA_ZONE_REPORT        = 0,
>>>> +    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>>>> +    NVME_REPORT_ZONE_PARTIAL    = 1,
>>>> +};
>>>> +
>>>> /* Features */
>>>>
>>>> enum {
>>>> @@ -1300,6 +1397,8 @@ struct nvme_command {
>>>>          struct nvme_format_cmd format;
>>>>          struct nvme_dsm_cmd dsm;
>>>>          struct nvme_write_zeroes_cmd write_zeroes;
>>>> +        struct nvme_zone_mgmt_send_cmd zms;
>>>> +        struct nvme_zone_mgmt_recv_cmd zmr;
>>>>          struct nvme_abort_cmd abort;
>>>>          struct nvme_get_log_page_command get_log_page;
>>>>          struct nvmf_common_command fabrics;
>>>> @@ -1433,6 +1532,18 @@ enum {
>>>>      NVME_SC_DISCOVERY_RESTART    = 0x190,
>>>>      NVME_SC_AUTH_REQUIRED        = 0x191,
>>>>
>>>> +    /*
>>>> +     * I/O Command Set Specific - Zoned commands:
>>>> +     */
>>>> +    NVME_SC_ZONE_BOUNDARY_ERROR    = 0x1b8,
>>>> +    NVME_SC_ZONE_FULL        = 0x1b9,
>>>> +    NVME_SC_ZONE_READ_ONLY        = 0x1ba,
>>>> +    NVME_SC_ZONE_OFFLINE        = 0x1bb,
>>>> +    NVME_SC_ZONE_INVALID_WRITE    = 0x1bc,
>>>> +    NVME_SC_ZONE_TOO_MANY_ACTIVE    = 0x1bd,
>>>> +    NVME_SC_ZONE_TOO_MANY_OPEN    = 0x1be,
>>>> +    NVME_SC_ZONE_INVALID_TRANSITION    = 0x1bf,
>>>> +
>>>>      /*
>>>>       * Media and Data Integrity Errors:
>>>>       */
>>>> -- 
>>>> 2.24.1
>>>>
> _______________________________________________
> linux-nvme mailing list
> linux-nvme@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
Judy Brock June 16, 2020, 1:08 p.m. UTC | #11
"The on-going re-work of btrfs zone support for instance now relies 100% on zone append being supported.... So the approach is: mandate zone append support for ZNS devices.... To allow other ZNS drives, an emulation similar to SCSI can be implemented, ...  While on a HDD the  performance penalty is minimal, it will likely be *significant* on a SSD."

Wow. Well as I said, I don't know much about Linux but it sounds like the ongoing re-work of btrfs zone support mandating zone append should be revisited.

The reality is there will be flavors of ZNS drives in the market that do not support Append. As many of you know, the ZRWA technical proposal is well underway in the NVMe ZNS WG.

Ensuring that the entire Linux zone support ecosystem deliberately locks these devices out, or at best consigns them to a severely performance-penalized path, especially given the MULTIPLE statements that have been made in the NVMe ZNS WG by multiple companies regarding the use cases for which Zone Append is an absolute disaster (not my words), seems pretty darn inappropriate.





-----Original Message-----
From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Damien Le Moal
Sent: Tuesday, June 16, 2020 5:36 AM
To: Javier González; Matias Bjørling
Cc: Jens Axboe; Niklas Cassel; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjorling
Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces

On 2020/06/16 21:24, Javier González wrote:
> On 16.06.2020 14:06, Matias Bjørling wrote:
>> On 16/06/2020 14.00, Javier González wrote:
>>> On 16.06.2020 13:18, Matias Bjørling wrote:
>>>> On 16/06/2020 12.41, Javier González wrote:
>>>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>>>> Command Set Identifier reported in the namespaces Namespace
>>>>>> Identification Descriptor list. A successfully discovered Zoned
>>>>>> Namespace will be registered with the block layer as a host managed
>>>>>> zoned block device with Zone Append command support. A namespace that
>>>>>> does not support append is not supported by the driver.
>>>>>
>>>>> Why are we enforcing the append command? Append is optional on the
>>>>> current ZNS specification, so we should not make this mandatory in the
>>>>> implementation. See specifics below.
>>>
>>>>
>>>> There is already general support in the kernel for the zone append 
>>>> command. Feel free to submit patches to emulate the support. It is 
>>>> outside the scope of this patchset.
>>>>
>>>
>>> It is fine that the kernel supports append, but the ZNS specification
>>> does not impose the implementation for append, so the driver should not
>>> do that either.
>>>
>>> ZNS SSDs that choose to leave append as a non-implemented optional
>>> command should not rely on emulated SW support, specially when
>>> traditional writes work very fine for a large part of current ZNS use
>>> cases.
>>>
>>> Please, remove this virtual constraint.
>>
>> The Zone Append command is mandatory for zoned block devices. Please 
>> see https://lwn.net/Articles/818709/ for the background.
> 
> I do not see anywhere in the block layer that append is mandatory for
> zoned devices. Append is emulated on ZBC, but beyond that there is no
> mandatory bits. Please explain.

This is to allow a single write IO path for all types of zoned block device for
higher layers, e.g file systems. The on-going re-work of btrfs zone support for
instance now relies 100% on zone append being supported. That significantly
simplifies the file system support and more importantly remove the need for
locking around block allocation and BIO issuing, allowing to preserve a fully
asynchronous write path that can include workqueues for efficient CPU usage of
things like encryption and compression. Without zone append, file systems would
either (1) have to reject drives that do not support zone append, or (2)
implement two different write IO paths (slower regular write and zone append).
Neither option is ideal, to say the least.

So the approach is: mandate zone append support for ZNS devices. To allow other
ZNS drives, an emulation similar to SCSI can be implemented, with that emulation
ideally combined to work for both types of drives if possible. And note that
this emulation would require the drive to be operated with mq-deadline to enable
zone write locking for preserving write command order. While on a HDD the
performance penalty is minimal, it will likely be significant on a SSD.
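
To make the ordering argument concrete, here is a minimal user-space sketch
(hypothetical names only, not kernel or NVMe driver APIs) of why regular zoned
writes need per-zone serialization while zone append does not:

#include <stdint.h>
#include <stdio.h>

/* Toy model of one zone; a real device maintains the write pointer. */
struct zone {
	uint64_t wp;	/* current write pointer (LBA) */
};

/*
 * Regular zoned write: must land exactly on the write pointer, so
 * concurrent writers to the same zone have to be serialized by the
 * host (what mq-deadline zone write locking provides).
 */
static int zone_write(struct zone *z, uint64_t slba, uint64_t nlb)
{
	if (slba != z->wp)
		return -1;	/* Zone Invalid Write */
	z->wp += nlb;
	return 0;
}

/*
 * Zone append: the host targets the zone, the device picks the LBA
 * and returns it in the completion, so issue order does not matter.
 */
static uint64_t zone_append(struct zone *z, uint64_t nlb)
{
	uint64_t alba = z->wp;	/* LBA actually written */
	z->wp += nlb;
	return alba;		/* returned to the host on completion */
}

int main(void)
{
	struct zone z = { .wp = 0 };

	/* Two in-flight appends succeed regardless of order... */
	printf("append A landed at LBA %llu\n",
	       (unsigned long long)zone_append(&z, 8));
	printf("append B landed at LBA %llu\n",
	       (unsigned long long)zone_append(&z, 8));

	/* ...while a regular write built against a stale write pointer
	 * fails, hence the per-zone write lock for plain writes. */
	if (zone_write(&z, 0, 8) < 0)
		printf("stale regular write rejected, wp is now %llu\n",
		       (unsigned long long)z.wp);
	return 0;
}

This is also why the completion path in the patch records the LBA returned by
the device into req->__sector for REQ_OP_ZONE_APPEND.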

> 
>> Please submit patches if you want to have support for ZNS devices that
>> do not implement the Zone Append command. It is outside the scope
>> of this patchset.
> 
> That we will.
> 
> 
> _______________________________________________
> linux-nvme mailing list
> linux-nvme@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
>
Judy Brock June 16, 2020, 1:12 p.m. UTC | #12
Sorry again then if I overreacted. But it seemed that Damien essentially said that, at best, non-Zone-Append ZNS SSDs will be at a large performance disadvantage. I assume that is even with the patches Javier creates, since the whole stack is mandating Append from the top down and anything that doesn't natively support that can't really be integrated seamlessly no matter what.

If that's true, then how will patches solve it?

-----Original Message-----
From: Matias Bjørling [mailto:mb@lightnvm.io] 
Sent: Tuesday, June 16, 2020 5:38 AM
To: Judy Brock; Javier González
Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces

On 16/06/2020 14.35, Judy Brock wrote:
> 	>>> A namespace that does not support append is not supported by the driver.
>
> I am not that knowledgeable about Linux kernel drivers so maybe this is a dumb question but won't the driver in question be the default kernel driver for ZNS devices? If so, why would that driver deliberately reject a device which is 100% compliant with the ZNS spec?
> That would seem to favor specific implementations which seems like an inappropriate thing for a community driver to do unless there is an actual technical reason the driver is unable to function w/o append. Is there any such reason and if so what is it? Thanks and sorry if I've misunderstood.

Hi Judy,

This has been solved. Javier has said he will send patches that support
the above use-case. It is outside the scope of this patchset.

Best, Matias

> Judy
>
> -----Original Message-----
> From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Javier González
> Sent: Tuesday, June 16, 2020 5:00 AM
> To: Matias Bjørling
> Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
>
> On 16.06.2020 13:18, Matias Bjørling wrote:
>> On 16/06/2020 12.41, Javier González wrote:
>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>> Command Set Identifier reported in the namespaces Namespace
>>>> Identification Descriptor list. A successfully discovered Zoned
>>>> Namespace will be registered with the block layer as a host managed
>>>> zoned block device with Zone Append command support. A namespace that
>>>> does not support append is not supported by the driver.
>>> Why are we enforcing the append command? Append is optional on the
>>> current ZNS specification, so we should not make this mandatory in the
>>> implementation. See specifics below.
>> There is already general support in the kernel for the zone append
>> command. Feel free to submit patches to emulate the support. It is
>> outside the scope of this patchset.
>>
> It is fine that the kernel supports append, but the ZNS specification
> does not impose the implementation for append, so the driver should not
> do that either.
>
> ZNS SSDs that choose to leave append as a non-implemented optional
> command should not rely on emulated SW support, especially when
> traditional writes work very fine for a large part of current ZNS use
> cases.
>
> Please, remove this virtual constraint.
>
>>>> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
>>>> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
>>>> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
>>>> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
>>>> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
>>>> Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
>>>> Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
>>>> Signed-off-by: Keith Busch <keith.busch@wdc.com>
>>>> ---
>>>> drivers/nvme/host/Makefile |   1 +
>>>> drivers/nvme/host/core.c   |  91 ++++++++++++--
>>>> drivers/nvme/host/nvme.h   |  39 ++++++
>>>> drivers/nvme/host/zns.c    | 238 +++++++++++++++++++++++++++++++++++++
>>>> include/linux/nvme.h       | 111 +++++++++++++++++
>>>> 5 files changed, 468 insertions(+), 12 deletions(-)
>>>> create mode 100644 drivers/nvme/host/zns.c
>>>>
>>>> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>>>> index fc7b26be692d..d7f6a87687b8 100644
>>>> --- a/drivers/nvme/host/Makefile
>>>> +++ b/drivers/nvme/host/Makefile
>>>> @@ -13,6 +13,7 @@ nvme-core-y                := core.o
>>>> nvme-core-$(CONFIG_TRACING)        += trace.o
>>>> nvme-core-$(CONFIG_NVME_MULTIPATH)    += multipath.o
>>>> nvme-core-$(CONFIG_NVM)            += lightnvm.o
>>>> +nvme-core-$(CONFIG_BLK_DEV_ZONED)    += zns.o
>>>> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)    += fault_inject.o
>>>> nvme-core-$(CONFIG_NVME_HWMON)        += hwmon.o
>>>>
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index 58f137b9f2c5..e961910da4ac 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
>>>> static struct class *nvme_class;
>>>> static struct class *nvme_subsys_class;
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk);
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk);
>>>> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
>>>> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>>>>                         unsigned nsid);
>>>> @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
>>>>              nvme_retry_req(req);
>>>>              return;
>>>>          }
>>>> +    } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>>>> +           req_op(req) == REQ_OP_ZONE_APPEND) {
>>>> +        req->__sector = nvme_lba_to_sect(req->q->queuedata,
>>>> +            le64_to_cpu(nvme_req(req)->result.u64));
>>>>      }
>>>>
>>>>      nvme_trace_bio_complete(req, status);
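(An aside on the hunk above: on successful completion of a Zone Append, the controller returns the LBA it actually wrote in dwords 0/1 of the completion entry; the driver reads it back from nvme_req(req)->result.u64 and converts it to a 512-byte sector so the block layer can report where the data landed.)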
>>>> @@ -673,7 +677,8 @@ static inline blk_status_t
>>>> nvme_setup_write_zeroes(struct nvme_ns *ns,
>>>> }
>>>>
>>>> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>>> -        struct request *req, struct nvme_command *cmnd)
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_opcode op)
>>>> {
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>>      u16 control = 0;
>>>> @@ -687,7 +692,7 @@ static inline blk_status_t
>>>> nvme_setup_rw(struct nvme_ns *ns,
>>>>      if (req->cmd_flags & REQ_RAHEAD)
>>>>          dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>>>
>>>> -    cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write :
>>>> nvme_cmd_read);
>>>> +    cmnd->rw.opcode = op;
>>>>      cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
>>>>      cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>>      cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >>
>>>> ns->lba_shift) - 1);
>>>> @@ -716,6 +721,8 @@ static inline blk_status_t
>>>> nvme_setup_rw(struct nvme_ns *ns,
>>>>          case NVME_NS_DPS_PI_TYPE2:
>>>>              control |= NVME_RW_PRINFO_PRCHK_GUARD |
>>>>                      NVME_RW_PRINFO_PRCHK_REF;
>>>> +            if (op == nvme_cmd_zone_append)
>>>> +                control |= NVME_RW_APPEND_PIREMAP;
>>>>              cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
>>>>              break;
>>>>          }
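(Aside: the NVME_RW_APPEND_PIREMAP bit set above is the Protection Information Remap flag from the ZNS spec. With Type 1/2 protection the reference tag is tied to the LBA, but for an append the host does not know the final LBA at submission time, so this bit asks the controller to remap the reference tag to the LBA it ends up assigning.)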
>>>> @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns
>>>> *ns, struct request *req,
>>>>      case REQ_OP_FLUSH:
>>>>          nvme_setup_flush(ns, cmd);
>>>>          break;
>>>> +    case REQ_OP_ZONE_RESET_ALL:
>>>> +    case REQ_OP_ZONE_RESET:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_OPEN:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_CLOSE:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_FINISH:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd,
>>>> NVME_ZONE_FINISH);
>>>> +        break;
>>>>      case REQ_OP_WRITE_ZEROES:
>>>>          ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>          break;
>>>> @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns
>>>> *ns, struct request *req,
>>>>          ret = nvme_setup_discard(ns, req, cmd);
>>>>          break;
>>>>      case REQ_OP_READ:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>>>> +        break;
>>>>      case REQ_OP_WRITE:
>>>> -        ret = nvme_setup_rw(ns, req, cmd);
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_APPEND:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
>>>>          break;
>>>>      default:
>>>>          WARN_ON_ONCE(1);
>>>> @@ -1392,14 +1417,23 @@ static u32 nvme_passthru_start(struct
>>>> nvme_ctrl *ctrl, struct nvme_ns *ns,
>>>>      return effects;
>>>> }
>>>>
>>>> -static void nvme_update_formats(struct nvme_ctrl *ctrl)
>>>> +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
>>>> {
>>>>      struct nvme_ns *ns;
>>>>
>>>>      down_read(&ctrl->namespaces_rwsem);
>>>>      list_for_each_entry(ns, &ctrl->namespaces, list)
>>>> -        if (ns->disk && nvme_revalidate_disk(ns->disk))
>>>> +        if (ns->disk && _nvme_revalidate_disk(ns->disk))
>>>>              nvme_set_queue_dying(ns);
>>>> +        else if (blk_queue_is_zoned(ns->disk->queue)) {
>>>> +            /*
>>>> +             * IO commands are required to fully revalidate a zoned
>>>> +             * device. Force the command effects to trigger rescan
>>>> +             * work so report zones can run in a context with
>>>> +             * unfrozen IO queues.
>>>> +             */
>>>> +            *effects |= NVME_CMD_EFFECTS_NCC;
>>>> +        }
>>>>      up_read(&ctrl->namespaces_rwsem);
>>>> }
>>>>
>>>> @@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct
>>>> nvme_ctrl *ctrl, u32 effects)
>>>>       * this command.
>>>>       */
>>>>      if (effects & NVME_CMD_EFFECTS_LBCC)
>>>> -        nvme_update_formats(ctrl);
>>>> +        nvme_update_formats(ctrl, &effects);
>>>>      if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
>>>>          nvme_unfreeze(ctrl);
>>>>          nvme_mpath_unfreeze(ctrl->subsys);
>>>> @@ -1526,7 +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl
>>>> *ctrl, struct nvme_ns *ns,
>>>>   * Issue ioctl requests on the first available path.  Note that
>>>> unlike normal
>>>>   * block layer requests we will not retry failed request on
>>>> another controller.
>>>>   */
>>>> -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>          struct nvme_ns_head **head, int *srcu_idx)
>>>> {
>>>> #ifdef CONFIG_NVME_MULTIPATH
>>>> @@ -1546,7 +1580,7 @@ static struct nvme_ns
>>>> *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>      return disk->private_data;
>>>> }
>>>>
>>>> -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> {
>>>>      if (head)
>>>>          srcu_read_unlock(&head->srcu, idx);
>>>> @@ -1939,21 +1973,28 @@ static void nvme_update_disk_info(struct
>>>> gendisk *disk,
>>>>
>>>> static int __nvme_revalidate_disk(struct gendisk *disk, struct
>>>> nvme_id_ns *id)
>>>> {
>>>> +    unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
>>>>      struct nvme_ns *ns = disk->private_data;
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +    int ret;
>>>>      u32 iob;
>>>>
>>>>      /*
>>>>       * If identify namespace failed, use default 512 byte block size so
>>>>       * block layer can use before failing read/write for 0 capacity.
>>>>       */
>>>> -    ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>>>> +    ns->lba_shift = id->lbaf[lbaf].ds;
>>>>      if (ns->lba_shift == 0)
>>>>          ns->lba_shift = 9;
>>>>
>>>>      switch (ns->head->ids.csi) {
>>>>      case NVME_CSI_NVM:
>>>>          break;
>>>> +    case NVME_CSI_ZNS:
>>>> +        ret = nvme_update_zone_info(disk, ns, lbaf);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +        break;
>>>>      default:
>>>>          dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
>>>>              ns->head->ids.csi, ns->head->ns_id);
>>>> @@ -1967,7 +2008,7 @@ static int __nvme_revalidate_disk(struct
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>          iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>>>>
>>>>      ns->features = 0;
>>>> -    ns->ms = le16_to_cpu(id->lbaf[id->flbas &
>>>> NVME_NS_FLBAS_LBA_MASK].ms);
>>>> +    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
>>>>      /* the PI implementation requires metadata equal t10 pi tuple
>>>> size */
>>>>      if (ns->ms == sizeof(struct t10_pi_tuple))
>>>>          ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>>>> @@ -2010,7 +2051,7 @@ static int __nvme_revalidate_disk(struct
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>      return 0;
>>>> }
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk)
>>>> {
>>>>      struct nvme_ns *ns = disk->private_data;
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>> @@ -2058,6 +2099,28 @@ static int nvme_revalidate_disk(struct
>>>> gendisk *disk)
>>>>      return ret;
>>>> }
>>>>
>>>> +static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    ret = _nvme_revalidate_disk(disk);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    if (blk_queue_is_zoned(disk->queue)) {
>>>> +        struct nvme_ns *ns = disk->private_data;
>>>> +        struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +
>>>> +        ret = blk_revalidate_disk_zones(disk, NULL);
>>>> +        if (!ret)
>>>> +            blk_queue_max_zone_append_sectors(disk->queue,
>>>> +                              ctrl->max_zone_append);
>>>> +    }
>>>> +#endif
>>>> +    return ret;
>>>> +}
>>>> +
>>>> static char nvme_pr_type(enum pr_type type)
>>>> {
>>>>      switch (type) {
>>>> @@ -2188,6 +2251,7 @@ static const struct block_device_operations
>>>> nvme_fops = {
>>>>      .release    = nvme_release,
>>>>      .getgeo        = nvme_getgeo,
>>>>      .revalidate_disk= nvme_revalidate_disk,
>>>> +    .report_zones    = nvme_report_zones,
>>>>      .pr_ops        = &nvme_pr_ops,
>>>> };
>>>>
>>>> @@ -2213,6 +2277,7 @@ const struct block_device_operations
>>>> nvme_ns_head_ops = {
>>>>      .ioctl        = nvme_ioctl,
>>>>      .compat_ioctl    = nvme_compat_ioctl,
>>>>      .getgeo        = nvme_getgeo,
>>>> +    .report_zones    = nvme_report_zones,
>>>>      .pr_ops        = &nvme_pr_ops,
>>>> };
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>> @@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
>>>>      BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>> index 58428e3a590e..662f95fbd909 100644
>>>> --- a/drivers/nvme/host/nvme.h
>>>> +++ b/drivers/nvme/host/nvme.h
>>>> @@ -239,6 +239,9 @@ struct nvme_ctrl {
>>>>      u32 max_hw_sectors;
>>>>      u32 max_segments;
>>>>      u32 max_integrity_segments;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u32 max_zone_append;
>>>> +#endif
>>>>      u16 crdt[3];
>>>>      u16 oncs;
>>>>      u16 oacs;
>>>> @@ -403,6 +406,9 @@ struct nvme_ns {
>>>>      u16 sgs;
>>>>      u32 sws;
>>>>      u8 pi_type;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u64 zsze;
>>>> +#endif
>>>>      unsigned long features;
>>>>      unsigned long flags;
>>>> #define NVME_NS_REMOVING    0
>>>> @@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>>>>
>>>> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8
>>>> lsp, u8 csi,
>>>>          void *log, size_t size, u64 offset);
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +        struct nvme_ns_head **head, int *srcu_idx);
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>>>>
>>>> extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>>> extern const struct block_device_operations nvme_ns_head_ops;
>>>> @@ -689,6 +698,36 @@ static inline void
>>>> nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>>> }
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>>
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf);
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data);
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct
>>>> request *req,
>>>> +                       struct nvme_command *cmnd,
>>>> +                       enum nvme_zone_mgmt_action action);
>>>> +#else
>>>> +#define nvme_report_zones NULL
>>>> +
>>>> +static inline blk_status_t nvme_setup_zone_mgmt_send(struct
>>>> nvme_ns *ns,
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    return BLK_STS_NOTSUPP;
>>>> +}
>>>> +
>>>> +static inline int nvme_update_zone_info(struct gendisk *disk,
>>>> +                    struct nvme_ns *ns,
>>>> +                    unsigned lbaf)
>>>> +{
>>>> +    dev_warn(ns->ctrl->device,
>>>> +         "Please enable CONFIG_BLK_DEV_ZONED to support ZNS
>>>> devices\n");
>>>> +    return -EPROTONOSUPPORT;
>>>> +}
>>>> +#endif
>>>> +
>>>> #ifdef CONFIG_NVM
>>>> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
>>>> void nvme_nvm_unregister(struct nvme_ns *ns);
>>>> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>>>> new file mode 100644
>>>> index 000000000000..c08f6281b614
>>>> --- /dev/null
>>>> +++ b/drivers/nvme/host/zns.c
>>>> @@ -0,0 +1,238 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>>> + */
>>>> +
>>>> +#include <linux/blkdev.h>
>>>> +#include <linux/vmalloc.h>
>>>> +#include "nvme.h"
>>>> +
>>>> +static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ctrl_zns *id;
>>>> +    int status;
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.cns = NVME_ID_CNS_CS_CTRL;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>>>> +    if (status) {
>>>> +        kfree(id);
>>>> +        return status;
>>>> +    }
>>>> +
>>>> +    ctrl->max_zone_append = 1 << (id->zamds + 3);
>>>> +    kfree(id);
>>>> +    return 0;
>>>> +}
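(A note on the shift above, assuming ZAMDS reports the append size limit as a power of two in units of the minimum memory page size, here taken as 4 KiB: the limit in 512-byte sectors is 2^zamds * 4096 / 512 = 2^(zamds + 3), hence 1 << (id->zamds + 3). For example, zamds = 5 gives 1 << 8 = 256 sectors, i.e. 128 KiB.)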
>>>> +
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf)
>>>> +{
>>>> +    struct nvme_effects_log *log = ns->head->effects;
>>>> +    struct request_queue *q = disk->queue;
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ns_zns *id;
>>>> +    int status;
>>>> +
>>>> +    /* Driver requires zone append support */
>>>> +    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>>>> +        return -ENODEV;
>>> Following up on the initial comment, this check should go.
>> See first comment.
> See above and please remove.
>
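To make the disputed alternative concrete: instead of failing with -ENODEV, the gate could in principle record append support as a per-namespace capability and leave non-append devices to a (future) emulation path. A rough sketch, not part of the posted patch, with NVME_NS_ZONE_APPEND as an invented flag and nvme_zns_setup_append_emulation as a hypothetical helper:

            if (log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP)
                    ns->features |= NVME_NS_ZONE_APPEND;
            else
                    return nvme_zns_setup_append_emulation(ns);

Whether that is desirable is exactly what the thread is debating.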
>>>> +
>>>> +    /* Lazily query controller append limit for the first zoned
>>>> namespace */
>>>> +    if (!ns->ctrl->max_zone_append) {
>>>> +        status = nvme_set_max_append(ns->ctrl);
>>>> +        if (status)
>>>> +            return status;
>>>> +    }
>>> This should only be applied if append is supported.
>> See first comment.
>>
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.identify.cns = NVME_ID_CNS_CS_NS;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id,
>>>> sizeof(*id));
>>>> +    if (status)
>>>> +        goto free_data;
>>>> +
>>>> +    /*
>>>> +     * We currently do not handle devices requiring any of the zoned
>>>> +     * operation characteristics.
>>>> +     */
>>>> +    if (id->zoc) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>> I understand that "Variable Zone Capacity" is not supported as it
>>> requires major changes at this moment, but we should support controllers
>>> that enable "Zone Active Excursions", even when the AER event is not
>>> implemented in this patchset.
>>
>> NAK. Similarly to VZC, this allows an unsuspecting user to have major
>> data loss when a zone is suddenly moved to Full.
> I buy that.
>
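For reference, the two characteristics in question are separate bits of the zoc field, so a finer-grained policy than if (id->zoc) would mask them individually (bit 0 is Variable Zone Capacity and bit 1 is Zone Active Excursions per TP4053; the macro names below are invented):

    #define NVME_ZNS_ZOC_VARIABLE_CAP  (1 << 0)  /* Variable Zone Capacity */
    #define NVME_ZNS_ZOC_EXCURSIONS    (1 << 1)  /* Zone Active Excursions */

            if (le16_to_cpu(id->zoc) & NVME_ZNS_ZOC_VARIABLE_CAP) {
                    status = -EINVAL;
                    goto free_data;
            }

The posted code rejects both bits, which, per the reply above, is deliberate.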
>>
>>>> +
>>>> +    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>>>> +    if (!ns->zsze) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>>> +
>>>> +    q->limits.zoned = BLK_ZONED_HM;
>>>> +    blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>>>> +free_data:
>>>> +    kfree(id);
>>>> +    return status;
>>>> +}
>>>> +
>>>> +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>>> +                      unsigned int nr_zones, size_t *buflen)
>>>> +{
>>>> +    struct request_queue *q = ns->disk->queue;
>>>> +    size_t bufsize;
>>>> +    void *buf;
>>>> +
>>>> +    const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>>>> +                   sizeof(struct nvme_zone_descriptor);
>>>> +
>>>> +    nr_zones = min_t(unsigned int, nr_zones,
>>>> +             get_capacity(ns->disk) >> ilog2(ns->zsze));
>>>> +
>>>> +    bufsize = sizeof(struct nvme_zone_report) +
>>>> +        nr_zones * sizeof(struct nvme_zone_descriptor);
>>>> +    bufsize = min_t(size_t, bufsize,
>>>> +            queue_max_hw_sectors(q) << SECTOR_SHIFT);
>>>> +    bufsize = min_t(size_t, bufsize, queue_max_segments(q) <<
>>>> PAGE_SHIFT);
>>>> +
>>>> +    while (bufsize >= min_bufsize) {
>>>> +        buf = __vmalloc(bufsize,
>>>> +                GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>>>> +        if (buf) {
>>>> +            *buflen = bufsize;
>>>> +            return buf;
>>>> +        }
>>>> +        bufsize >>= 1;
>>>> +    }
>>>> +    return NULL;
>>>> +}
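(Sizing note, derived from the structure definitions later in the patch: the report header is 64 bytes and each zone descriptor is another 64 bytes, so min_bufsize = 128 bytes covers exactly one descriptor, the smallest useful report. The loop halves the allocation under memory pressure rather than failing outright, and __GFP_NORETRY keeps each vmalloc attempt from triggering heavy reclaim.)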
>>>> +
>>>> +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +                  struct nvme_zone_report *report,
>>>> +                  size_t buflen)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    int ret;
>>>> +
>>>> +    c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>>>> +    c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>>>> +    c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>>>> +    c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>>>> +    c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>>> +    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>>> +
>>>> +    ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    return le64_to_cpu(report->nr_zones);
>>>> +}
>>>> +
>>>> +static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>>> +                 struct nvme_zone_descriptor *entry,
>>>> +                 unsigned int idx, report_zones_cb cb,
>>>> +                 void *data)
>>>> +{
>>>> +    struct blk_zone zone = { };
>>>> +
>>>> +    if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>>> +        dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>>>> +                entry->zt);
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>>>> +    zone.cond = entry->zs >> 4;
>>>> +    zone.len = ns->zsze;
>>>> +    zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>>> +    zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>>>> +    zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>>>> +
>>>> +    return cb(&zone, idx, data);
>>>> +}
>>>> +
>>>> +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +            unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_zone_report *report;
>>>> +    int ret, zone_idx = 0;
>>>> +    unsigned int nz, i;
>>>> +    size_t buflen;
>>>> +
>>>> +    report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>>>> +    if (!report)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    sector &= ~(ns->zsze - 1);
>>>> +    while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>>> +        memset(report, 0, buflen);
>>>> +        ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>>>> +        if (ret < 0)
>>>> +            goto out_free;
>>>> +
>>>> +        nz = min_t(unsigned int, ret, nr_zones);
>>>> +        if (!nz)
>>>> +            break;
>>>> +
>>>> +        for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>>>> +            ret = nvme_zone_parse_entry(ns, &report->entries[i],
>>>> +                            zone_idx, cb, data);
>>>> +            if (ret)
>>>> +                goto out_free;
>>>> +            zone_idx++;
>>>> +        }
>>>> +
>>>> +        sector += ns->zsze * nz;
>>>> +    }
>>>> +
>>>> +    ret = zone_idx;
>>>> +out_free:
>>>> +    kvfree(report);
>>>> +    return ret;
>>>> +}
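(Aside: the sector &= ~(ns->zsze - 1) rounding assumes the zone size is a power of two, which the block layer independently requires for zoned devices. For example, with zsze = 0x10000 sectors, a report requested at sector 0x10123 is aligned down to 0x10000, so the loop always queries on a zone boundary.)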
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_ns_head *head = NULL;
>>>> +    struct nvme_ns *ns;
>>>> +    int srcu_idx, ret;
>>>> +
>>>> +    ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>>>> +    if (unlikely(!ns))
>>>> +        return -EWOULDBLOCK;
>>>> +
>>>> +    if (ns->head->ids.csi == NVME_CSI_ZNS)
>>>> +        ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>>>> +    else
>>>> +        ret = -EINVAL;
>>>> +    nvme_put_ns_from_disk(head, srcu_idx);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct
>>>> request *req,
>>>> +        struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    c->zms.opcode = nvme_cmd_zone_mgmt_send;
>>>> +    c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>> +    c->zms.action = action;
>>>> +
>>>> +    if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>>> +        c->zms.select = 1;
>>>> +
>>>> +    return BLK_STS_OK;
>>>> +}
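(Aside: when Select All is set for REQ_OP_ZONE_RESET_ALL, the controller applies the action to every zone and, per the spec, ignores the SLBA field, so leaving the blk_rq_pos() value in place above is harmless. From user space this path can be exercised with, for example, util-linux's "blkzone reset /dev/nvme0n1", assuming a zoned namespace and kernel support for translating a whole-device reset into REQ_OP_ZONE_RESET_ALL.)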
>>>> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>>>> index ea25da572eed..7b3fa7de07bd 100644
>>>> --- a/include/linux/nvme.h
>>>> +++ b/include/linux/nvme.h
>>>> @@ -374,6 +374,30 @@ struct nvme_id_ns {
>>>>      __u8            vs[3712];
>>>> };
>>>>
>>>> +struct nvme_zns_lbafe {
>>>> +    __le64            zsze;
>>>> +    __u8            zdes;
>>>> +    __u8            rsvd9[7];
>>>> +};
>>>> +
>>>> +struct nvme_id_ns_zns {
>>>> +    __le16            zoc;
>>>> +    __le16            ozcs;
>>>> +    __le32            mar;
>>>> +    __le32            mor;
>>>> +    __le32            rrl;
>>>> +    __le32            frl;
>>>> +    __u8            rsvd20[2796];
>>>> +    struct nvme_zns_lbafe    lbafe[16];
>>>> +    __u8            rsvd3072[768];
>>>> +    __u8            vs[256];
>>>> +};
>>>> +
>>>> +struct nvme_id_ctrl_zns {
>>>> +    __u8    zamds;
>>>> +    __u8    rsvd1[4095];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_ID_CNS_NS            = 0x00,
>>>>      NVME_ID_CNS_CTRL        = 0x01,
>>>> @@ -392,6 +416,7 @@ enum {
>>>>
>>>> enum {
>>>>      NVME_CSI_NVM            = 0,
>>>> +    NVME_CSI_ZNS            = 2,
>>>> };
>>>>
>>>> enum {
>>>> @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
>>>>      __le16    rsvd10[3];
>>>> };
>>>>
>>>> +struct nvme_zone_descriptor {
>>>> +    __u8        zt;
>>>> +    __u8        zs;
>>>> +    __u8        za;
>>>> +    __u8        rsvd3[5];
>>>> +    __le64        zcap;
>>>> +    __le64        zslba;
>>>> +    __le64        wp;
>>>> +    __u8        rsvd32[32];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZONE_TYPE_SEQWRITE_REQ    = 0x2,
>>>> +};
>>>> +
>>>> +struct nvme_zone_report {
>>>> +    __le64        nr_zones;
>>>> +    __u8        resv8[56];
>>>> +    struct nvme_zone_descriptor entries[];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_SMART_CRIT_SPARE        = 1 << 0,
>>>>      NVME_SMART_CRIT_TEMPERATURE    = 1 << 1,
>>>> @@ -626,6 +672,9 @@ enum nvme_opcode {
>>>>      nvme_cmd_resv_report    = 0x0e,
>>>>      nvme_cmd_resv_acquire    = 0x11,
>>>>      nvme_cmd_resv_release    = 0x15,
>>>> +    nvme_cmd_zone_mgmt_send    = 0x79,
>>>> +    nvme_cmd_zone_mgmt_recv    = 0x7a,
>>>> +    nvme_cmd_zone_append    = 0x7d,
>>>> };
>>>>
>>>> #define nvme_opcode_name(opcode)    { opcode, #opcode }
>>>> @@ -764,6 +813,7 @@ struct nvme_rw_command {
>>>> enum {
>>>>      NVME_RW_LR            = 1 << 15,
>>>>      NVME_RW_FUA            = 1 << 14,
>>>> +    NVME_RW_APPEND_PIREMAP        = 1 << 9,
>>>>      NVME_RW_DSM_FREQ_UNSPEC        = 0,
>>>>      NVME_RW_DSM_FREQ_TYPICAL    = 1,
>>>>      NVME_RW_DSM_FREQ_RARE        = 2,
>>>> @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
>>>>      __le16            appmask;
>>>> };
>>>>
>>>> +enum nvme_zone_mgmt_action {
>>>> +    NVME_ZONE_CLOSE        = 0x1,
>>>> +    NVME_ZONE_FINISH    = 0x2,
>>>> +    NVME_ZONE_OPEN        = 0x3,
>>>> +    NVME_ZONE_RESET        = 0x4,
>>>> +    NVME_ZONE_OFFLINE    = 0x5,
>>>> +    NVME_ZONE_SET_DESC_EXT    = 0x10,
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_send_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le32            cdw2[2];
>>>> +    __le64            metadata;
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            cdw12;
>>>> +    __u8            action;
>>> Why not zsa to make it easier to match to the spec
>>>
>>>
>>>> +    __u8            select;
>>> sel_all?
>>>
>>>> +    __u8            rsvd13[2];
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_recv_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le64            rsvd2[2];
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            numd;
>>>> +    __u8            zra;
>>>> +    __u8            zrasf;
>>>> +    __u8            pr;
>>> Partial Report is just one bit in the "Zone Receive Action Specific
>>> Features". What about zrasfe?
>> There are currently no users of pr, and bits 1-7 are reserved in the spec.
>> Users of the pr variable should shift and mask as necessary.
>>
>> zrasf looks good to me. It is defined as a byte in the spec.
> I meant for the pr variable name. Agree with the rest.
>
>>>> +    __u8            rsvd13;
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZRA_ZONE_REPORT        = 0,
>>>> +    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>>>> +    NVME_REPORT_ZONE_PARTIAL    = 1,
>>>> +};
>>>> +
>>>> /* Features */
>>>>
>>>> enum {
>>>> @@ -1300,6 +1397,8 @@ struct nvme_command {
>>>>          struct nvme_format_cmd format;
>>>>          struct nvme_dsm_cmd dsm;
>>>>          struct nvme_write_zeroes_cmd write_zeroes;
>>>> +        struct nvme_zone_mgmt_send_cmd zms;
>>>> +        struct nvme_zone_mgmt_recv_cmd zmr;
>>>>          struct nvme_abort_cmd abort;
>>>>          struct nvme_get_log_page_command get_log_page;
>>>>          struct nvmf_common_command fabrics;
>>>> @@ -1433,6 +1532,18 @@ enum {
>>>>      NVME_SC_DISCOVERY_RESTART    = 0x190,
>>>>      NVME_SC_AUTH_REQUIRED        = 0x191,
>>>>
>>>> +    /*
>>>> +     * I/O Command Set Specific - Zoned commands:
>>>> +     */
>>>> +    NVME_SC_ZONE_BOUNDARY_ERROR    = 0x1b8,
>>>> +    NVME_SC_ZONE_FULL        = 0x1b9,
>>>> +    NVME_SC_ZONE_READ_ONLY        = 0x1ba,
>>>> +    NVME_SC_ZONE_OFFLINE        = 0x1bb,
>>>> +    NVME_SC_ZONE_INVALID_WRITE    = 0x1bc,
>>>> +    NVME_SC_ZONE_TOO_MANY_ACTIVE    = 0x1bd,
>>>> +    NVME_SC_ZONE_TOO_MANY_OPEN    = 0x1be,
>>>> +    NVME_SC_ZONE_INVALID_TRANSITION    = 0x1bf,
>>>> +
>>>>      /*
>>>>       * Media and Data Integrity Errors:
>>>>       */
>>>> -- 
>>>> 2.24.1
>>>>
> _______________________________________________
> linux-nvme mailing list
> linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme
Judy Brock June 16, 2020, 1:18 p.m. UTC | #13
Wait a minute. I think I misunderstood what Damien was referring to with respect to the performance penalty. I took it to mean a penalty incurred by having to do translation (like a SCSI translation layer, etc.), but if he was referring to the fact that Append gives queue depth > 1, and he was assuming non-Zone-Append SSDs would only support QD = 1, then we are all good, since ZRWA is an alternate approach to achieving QD > 1.

If that's what you meant Damien, my bad.

Thanks,
Judy
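For context on the queue-depth point: with regular writes a zone only accepts data at its current write pointer, so the host (or the I/O scheduler's zone write lock) effectively serializes writes within a zone, i.e. QD = 1 per zone. Zone Append removes that constraint because the controller picks the LBA and returns it in the completion, so many appends to one zone can be in flight at once. The proposed ZRWA feature aims at a similar effect by letting the host write out of order within a small window ahead of the write pointer that is later committed to the zone. Neither approach limits queue depth across different zones.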

-----Original Message-----
From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Judy Brock
Sent: Tuesday, June 16, 2020 6:13 AM
To: Matias Bjørling; Javier González
Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
Subject: RE: [PATCH 5/5] nvme: support for zoned namespaces

Sorry again then if I overreacted. But it seemed that Damien essentially said that, at best, non-Zone-Append ZNS SSDs will be at a large performance disadvantage. I assume that is the case even with the patches Javier creates, since the whole stack mandates Append from the top down, and anything that doesn't natively support it can't really be integrated seamlessly no matter what.

If that's true, then how will patches solve it?

-----Original Message-----
From: Matias Bjørling [mailto:mb@lightnvm.io] 
Sent: Tuesday, June 16, 2020 5:38 AM
To: Judy Brock; Javier González
Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces

On 16/06/2020 14.35, Judy Brock wrote:
> 	>>> A namespace that does not support append is not supported by the driver.
>
> I am not that knowledgeable about Linux kernel drivers, so maybe this is a dumb question, but won't the driver in question be the default kernel driver for ZNS devices? If so, why would that driver deliberately reject a device which is 100% compliant with the ZNS spec?
> That would seem to favor specific implementations, which seems like an inappropriate thing for a community driver to do unless there is an actual technical reason the driver is unable to function without append. Is there any such reason, and if so, what is it? Thanks, and sorry if I've misunderstood.

Hi Judy,

This has been solved. Javier has said he will send patches that supports 
the above use-case. It is outside the scope of this patchset.

Best, Matias

> Judy
>
> -----Original Message-----
> From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Javier González
> Sent: Tuesday, June 16, 2020 5:00 AM
> To: Matias Bjørling
> Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
>
> On 16.06.2020 13:18, Matias Bjørling wrote:
>> On 16/06/2020 12.41, Javier González wrote:
>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>> Command Set Identifier reported in the namespaces Namespace
>>>> Identification Descriptor list. A successfully discovered Zoned
>>>> Namespace will be registered with the block layer as a host managed
>>>> zoned block device with Zone Append command support. A namespace that
>>>> does not support append is not supported by the driver.
>>> Why are we enforcing the append command? Append is optional on the
>>> current ZNS specification, so we should not make this mandatory in the
>>> implementation. See specifics below.
>> There is already general support in the kernel for the zone append
>> command. Feel free to submit patches to emulate the support. It is
>> outside the scope of this patchset.
>>
> It is fine that the kernel supports append, but the ZNS specification
> does not impose the implementation for append, so the driver should not
> do that either.
>
> ZNS SSDs that choose to leave append as a non-implemented optional
> command should not rely on emulated SW support, specially when
> traditional writes work very fine for a large part of current ZNS use
> cases.
>
> Please, remove this virtual constraint.
>
>>>> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
>>>> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
>>>> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
>>>> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
>>>> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
>>>> Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
>>>> Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
>>>> Signed-off-by: Keith Busch <keith.busch@wdc.com>
>>>> ---
>>>> drivers/nvme/host/Makefile |   1 +
>>>> drivers/nvme/host/core.c   |  91 ++++++++++++--
>>>> drivers/nvme/host/nvme.h   |  39 ++++++
>>>> drivers/nvme/host/zns.c    | 238 +++++++++++++++++++++++++++++++++++++
>>>> include/linux/nvme.h       | 111 +++++++++++++++++
>>>> 5 files changed, 468 insertions(+), 12 deletions(-)
>>>> create mode 100644 drivers/nvme/host/zns.c
>>>>
>>>> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>>>> index fc7b26be692d..d7f6a87687b8 100644
>>>> --- a/drivers/nvme/host/Makefile
>>>> +++ b/drivers/nvme/host/Makefile
>>>> @@ -13,6 +13,7 @@ nvme-core-y                := core.o
>>>> nvme-core-$(CONFIG_TRACING)        += trace.o
>>>> nvme-core-$(CONFIG_NVME_MULTIPATH)    += multipath.o
>>>> nvme-core-$(CONFIG_NVM)            += lightnvm.o
>>>> +nvme-core-$(CONFIG_BLK_DEV_ZONED)    += zns.o
>>>> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)    += fault_inject.o
>>>> nvme-core-$(CONFIG_NVME_HWMON)        += hwmon.o
>>>>
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index 58f137b9f2c5..e961910da4ac 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
>>>> static struct class *nvme_class;
>>>> static struct class *nvme_subsys_class;
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk);
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk);
>>>> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
>>>> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>>>>                         unsigned nsid);
>>>> @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
>>>>              nvme_retry_req(req);
>>>>              return;
>>>>          }
>>>> +    } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>>>> +           req_op(req) == REQ_OP_ZONE_APPEND) {
>>>> +        req->__sector = nvme_lba_to_sect(req->q->queuedata,
>>>> +            le64_to_cpu(nvme_req(req)->result.u64));
>>>>      }
>>>>
>>>>      nvme_trace_bio_complete(req, status);
>>>> @@ -673,7 +677,8 @@ static inline blk_status_t
>>>> nvme_setup_write_zeroes(struct nvme_ns *ns,
>>>> }
>>>>
>>>> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>>> -        struct request *req, struct nvme_command *cmnd)
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_opcode op)
>>>> {
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>>      u16 control = 0;
>>>> @@ -687,7 +692,7 @@ static inline blk_status_t
>>>> nvme_setup_rw(struct nvme_ns *ns,
>>>>      if (req->cmd_flags & REQ_RAHEAD)
>>>>          dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>>>
>>>> -    cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write :
>>>> nvme_cmd_read);
>>>> +    cmnd->rw.opcode = op;
>>>>      cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
>>>>      cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>>      cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >>
>>>> ns->lba_shift) - 1);
>>>> @@ -716,6 +721,8 @@ static inline blk_status_t
>>>> nvme_setup_rw(struct nvme_ns *ns,
>>>>          case NVME_NS_DPS_PI_TYPE2:
>>>>              control |= NVME_RW_PRINFO_PRCHK_GUARD |
>>>>                      NVME_RW_PRINFO_PRCHK_REF;
>>>> +            if (op == nvme_cmd_zone_append)
>>>> +                control |= NVME_RW_APPEND_PIREMAP;
>>>>              cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
>>>>              break;
>>>>          }
>>>> @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns
>>>> *ns, struct request *req,
>>>>      case REQ_OP_FLUSH:
>>>>          nvme_setup_flush(ns, cmd);
>>>>          break;
>>>> +    case REQ_OP_ZONE_RESET_ALL:
>>>> +    case REQ_OP_ZONE_RESET:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_OPEN:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_CLOSE:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_FINISH:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd,
>>>> NVME_ZONE_FINISH);
>>>> +        break;
>>>>      case REQ_OP_WRITE_ZEROES:
>>>>          ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>          break;
>>>> @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns
>>>> *ns, struct request *req,
>>>>          ret = nvme_setup_discard(ns, req, cmd);
>>>>          break;
>>>>      case REQ_OP_READ:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>>>> +        break;
>>>>      case REQ_OP_WRITE:
>>>> -        ret = nvme_setup_rw(ns, req, cmd);
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_APPEND:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
>>>>          break;
>>>>      default:
>>>>          WARN_ON_ONCE(1);
>>>> @@ -1392,14 +1417,23 @@ static u32 nvme_passthru_start(struct
>>>> nvme_ctrl *ctrl, struct nvme_ns *ns,
>>>>      return effects;
>>>> }
>>>>
>>>> -static void nvme_update_formats(struct nvme_ctrl *ctrl)
>>>> +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
>>>> {
>>>>      struct nvme_ns *ns;
>>>>
>>>>      down_read(&ctrl->namespaces_rwsem);
>>>>      list_for_each_entry(ns, &ctrl->namespaces, list)
>>>> -        if (ns->disk && nvme_revalidate_disk(ns->disk))
>>>> +        if (ns->disk && _nvme_revalidate_disk(ns->disk))
>>>>              nvme_set_queue_dying(ns);
>>>> +        else if (blk_queue_is_zoned(ns->disk->queue)) {
>>>> +            /*
>>>> +             * IO commands are required to fully revalidate a zoned
>>>> +             * device. Force the command effects to trigger rescan
>>>> +             * work so report zones can run in a context with
>>>> +             * unfrozen IO queues.
>>>> +             */
>>>> +            *effects |= NVME_CMD_EFFECTS_NCC;
>>>> +        }
>>>>      up_read(&ctrl->namespaces_rwsem);
>>>> }
>>>>
>>>> @@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct
>>>> nvme_ctrl *ctrl, u32 effects)
>>>>       * this command.
>>>>       */
>>>>      if (effects & NVME_CMD_EFFECTS_LBCC)
>>>> -        nvme_update_formats(ctrl);
>>>> +        nvme_update_formats(ctrl, &effects);
>>>>      if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
>>>>          nvme_unfreeze(ctrl);
>>>>          nvme_mpath_unfreeze(ctrl->subsys);
>>>> @@ -1526,7 +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl
>>>> *ctrl, struct nvme_ns *ns,
>>>>   * Issue ioctl requests on the first available path.  Note that
>>>> unlike normal
>>>>   * block layer requests we will not retry failed request on
>>>> another controller.
>>>>   */
>>>> -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>          struct nvme_ns_head **head, int *srcu_idx)
>>>> {
>>>> #ifdef CONFIG_NVME_MULTIPATH
>>>> @@ -1546,7 +1580,7 @@ static struct nvme_ns
>>>> *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>      return disk->private_data;
>>>> }
>>>>
>>>> -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> {
>>>>      if (head)
>>>>          srcu_read_unlock(&head->srcu, idx);
>>>> @@ -1939,21 +1973,28 @@ static void nvme_update_disk_info(struct
>>>> gendisk *disk,
>>>>
>>>> static int __nvme_revalidate_disk(struct gendisk *disk, struct
>>>> nvme_id_ns *id)
>>>> {
>>>> +    unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
>>>>      struct nvme_ns *ns = disk->private_data;
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +    int ret;
>>>>      u32 iob;
>>>>
>>>>      /*
>>>>       * If identify namespace failed, use default 512 byte block size so
>>>>       * block layer can use before failing read/write for 0 capacity.
>>>>       */
>>>> -    ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>>>> +    ns->lba_shift = id->lbaf[lbaf].ds;
>>>>      if (ns->lba_shift == 0)
>>>>          ns->lba_shift = 9;
>>>>
>>>>      switch (ns->head->ids.csi) {
>>>>      case NVME_CSI_NVM:
>>>>          break;
>>>> +    case NVME_CSI_ZNS:
>>>> +        ret = nvme_update_zone_info(disk, ns, lbaf);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +        break;
>>>>      default:
>>>>          dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
>>>>              ns->head->ids.csi, ns->head->ns_id);
>>>> @@ -1967,7 +2008,7 @@ static int __nvme_revalidate_disk(struct
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>          iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>>>>
>>>>      ns->features = 0;
>>>> -    ns->ms = le16_to_cpu(id->lbaf[id->flbas &
>>>> NVME_NS_FLBAS_LBA_MASK].ms);
>>>> +    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
>>>>      /* the PI implementation requires metadata equal t10 pi tuple
>>>> size */
>>>>      if (ns->ms == sizeof(struct t10_pi_tuple))
>>>>          ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>>>> @@ -2010,7 +2051,7 @@ static int __nvme_revalidate_disk(struct
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>      return 0;
>>>> }
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk)
>>>> {
>>>>      struct nvme_ns *ns = disk->private_data;
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>> @@ -2058,6 +2099,28 @@ static int nvme_revalidate_disk(struct
>>>> gendisk *disk)
>>>>      return ret;
>>>> }
>>>>
>>>> +static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    ret = _nvme_revalidate_disk(disk);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    if (blk_queue_is_zoned(disk->queue)) {
>>>> +        struct nvme_ns *ns = disk->private_data;
>>>> +        struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +
>>>> +        ret = blk_revalidate_disk_zones(disk, NULL);
>>>> +        if (!ret)
>>>> +            blk_queue_max_zone_append_sectors(disk->queue,
>>>> +                              ctrl->max_zone_append);
>>>> +    }
>>>> +#endif
>>>> +    return ret;
>>>> +}
>>>> +
>>>> static char nvme_pr_type(enum pr_type type)
>>>> {
>>>>      switch (type) {
>>>> @@ -2188,6 +2251,7 @@ static const struct block_device_operations
>>>> nvme_fops = {
>>>>      .release    = nvme_release,
>>>>      .getgeo        = nvme_getgeo,
>>>>      .revalidate_disk= nvme_revalidate_disk,
>>>> +    .report_zones    = nvme_report_zones,
>>>>      .pr_ops        = &nvme_pr_ops,
>>>> };
>>>>
>>>> @@ -2213,6 +2277,7 @@ const struct block_device_operations
>>>> nvme_ns_head_ops = {
>>>>      .ioctl        = nvme_ioctl,
>>>>      .compat_ioctl    = nvme_compat_ioctl,
>>>>      .getgeo        = nvme_getgeo,
>>>> +    .report_zones    = nvme_report_zones,
>>>>      .pr_ops        = &nvme_pr_ops,
>>>> };
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>> @@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
>>>>      BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>> index 58428e3a590e..662f95fbd909 100644
>>>> --- a/drivers/nvme/host/nvme.h
>>>> +++ b/drivers/nvme/host/nvme.h
>>>> @@ -239,6 +239,9 @@ struct nvme_ctrl {
>>>>      u32 max_hw_sectors;
>>>>      u32 max_segments;
>>>>      u32 max_integrity_segments;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u32 max_zone_append;
>>>> +#endif
>>>>      u16 crdt[3];
>>>>      u16 oncs;
>>>>      u16 oacs;
>>>> @@ -403,6 +406,9 @@ struct nvme_ns {
>>>>      u16 sgs;
>>>>      u32 sws;
>>>>      u8 pi_type;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u64 zsze;
>>>> +#endif
>>>>      unsigned long features;
>>>>      unsigned long flags;
>>>> #define NVME_NS_REMOVING    0
>>>> @@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>>>>
>>>> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8
>>>> lsp, u8 csi,
>>>>          void *log, size_t size, u64 offset);
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +        struct nvme_ns_head **head, int *srcu_idx);
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>>>>
>>>> extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>>> extern const struct block_device_operations nvme_ns_head_ops;
>>>> @@ -689,6 +698,36 @@ static inline void
>>>> nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>>> }
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>>
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf);
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data);
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct
>>>> request *req,
>>>> +                       struct nvme_command *cmnd,
>>>> +                       enum nvme_zone_mgmt_action action);
>>>> +#else
>>>> +#define nvme_report_zones NULL
>>>> +
>>>> +static inline blk_status_t nvme_setup_zone_mgmt_send(struct
>>>> nvme_ns *ns,
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    return BLK_STS_NOTSUPP;
>>>> +}
>>>> +
>>>> +static inline int nvme_update_zone_info(struct gendisk *disk,
>>>> +                    struct nvme_ns *ns,
>>>> +                    unsigned lbaf)
>>>> +{
>>>> +    dev_warn(ns->ctrl->device,
>>>> +         "Please enable CONFIG_BLK_DEV_ZONED to support ZNS
>>>> devices\n");
>>>> +    return -EPROTONOSUPPORT;
>>>> +}
>>>> +#endif
>>>> +
>>>> #ifdef CONFIG_NVM
>>>> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
>>>> void nvme_nvm_unregister(struct nvme_ns *ns);
>>>> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>>>> new file mode 100644
>>>> index 000000000000..c08f6281b614
>>>> --- /dev/null
>>>> +++ b/drivers/nvme/host/zns.c
>>>> @@ -0,0 +1,238 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>>> + */
>>>> +
>>>> +#include <linux/blkdev.h>
>>>> +#include <linux/vmalloc.h>
>>>> +#include "nvme.h"
>>>> +
>>>> +static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ctrl_zns *id;
>>>> +    int status;
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.cns = NVME_ID_CNS_CS_CTRL;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>>>> +    if (status) {
>>>> +        kfree(id);
>>>> +        return status;
>>>> +    }
>>>> +
>>>> +    ctrl->max_zone_append = 1 << (id->zamds + 3);
>>>> +    kfree(id);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf)
>>>> +{
>>>> +    struct nvme_effects_log *log = ns->head->effects;
>>>> +    struct request_queue *q = disk->queue;
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ns_zns *id;
>>>> +    int status;
>>>> +
>>>> +    /* Driver requires zone append support */
>>>> +    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>>>> +        return -ENODEV;
>>> Following up on the initial comment, this check should go.
>> See first comment.
> See above and please remove.
>
>>>> +
>>>> +    /* Lazily query controller append limit for the first zoned
>>>> namespace */
>>>> +    if (!ns->ctrl->max_zone_append) {
>>>> +        status = nvme_set_max_append(ns->ctrl);
>>>> +        if (status)
>>>> +            return status;
>>>> +    }
>>> This should only be applied if append is supported.
>> See first comment.
>>
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.identify.cns = NVME_ID_CNS_CS_NS;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id,
>>>> sizeof(*id));
>>>> +    if (status)
>>>> +        goto free_data;
>>>> +
>>>> +    /*
>>>> +     * We currently do not handle devices requiring any of the zoned
>>>> +     * operation characteristics.
>>>> +     */
>>>> +    if (id->zoc) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>> I understand that "Variable Zone Capacity" is not supported as it
>>> requires major changes at this moment, but we should support controllers
>>> that enable "Zone Active Excursions", even when the AER event is not
>>> implemented in this patchset.
>>
>> NAK. Similarly to VZC, this allows an unsuspecting user to have major
>> data loss when a zone is suddenly moved to Full.
> I buy that.
>
>>
>>>> +
>>>> +    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>>>> +    if (!ns->zsze) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>>> +
>>>> +    q->limits.zoned = BLK_ZONED_HM;
>>>> +    blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>>>> +free_data:
>>>> +    kfree(id);
>>>> +    return status;
>>>> +}
>>>> +
>>>> +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>>> +                      unsigned int nr_zones, size_t *buflen)
>>>> +{
>>>> +    struct request_queue *q = ns->disk->queue;
>>>> +    size_t bufsize;
>>>> +    void *buf;
>>>> +
>>>> +    const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>>>> +                   sizeof(struct nvme_zone_descriptor);
>>>> +
>>>> +    nr_zones = min_t(unsigned int, nr_zones,
>>>> +             get_capacity(ns->disk) >> ilog2(ns->zsze));
>>>> +
>>>> +    bufsize = sizeof(struct nvme_zone_report) +
>>>> +        nr_zones * sizeof(struct nvme_zone_descriptor);
>>>> +    bufsize = min_t(size_t, bufsize,
>>>> +            queue_max_hw_sectors(q) << SECTOR_SHIFT);
>>>> +    bufsize = min_t(size_t, bufsize, queue_max_segments(q) <<
>>>> PAGE_SHIFT);
>>>> +
>>>> +    while (bufsize >= min_bufsize) {
>>>> +        buf = __vmalloc(bufsize,
>>>> +                GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>>>> +        if (buf) {
>>>> +            *buflen = bufsize;
>>>> +            return buf;
>>>> +        }
>>>> +        bufsize >>= 1;
>>>> +    }
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +                  struct nvme_zone_report *report,
>>>> +                  size_t buflen)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    int ret;
>>>> +
>>>> +    c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>>>> +    c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>>>> +    c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>>>> +    c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>>>> +    c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>>> +    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>>> +
>>>> +    ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    return le64_to_cpu(report->nr_zones);
>>>> +}
>>>> +
>>>> +static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>>> +                 struct nvme_zone_descriptor *entry,
>>>> +                 unsigned int idx, report_zones_cb cb,
>>>> +                 void *data)
>>>> +{
>>>> +    struct blk_zone zone = { };
>>>> +
>>>> +    if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>>> +        dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>>>> +                entry->zt);
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>>>> +    zone.cond = entry->zs >> 4;
>>>> +    zone.len = ns->zsze;
>>>> +    zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>>> +    zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>>>> +    zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>>>> +
>>>> +    return cb(&zone, idx, data);
>>>> +}
>>>> +
>>>> +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +            unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_zone_report *report;
>>>> +    int ret, zone_idx = 0;
>>>> +    unsigned int nz, i;
>>>> +    size_t buflen;
>>>> +
>>>> +    report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>>>> +    if (!report)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    sector &= ~(ns->zsze - 1);
>>>> +    while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>>> +        memset(report, 0, buflen);
>>>> +        ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>>>> +        if (ret < 0)
>>>> +            goto out_free;
>>>> +
>>>> +        nz = min_t(unsigned int, ret, nr_zones);
>>>> +        if (!nz)
>>>> +            break;
>>>> +
>>>> +        for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>>>> +            ret = nvme_zone_parse_entry(ns, &report->entries[i],
>>>> +                            zone_idx, cb, data);
>>>> +            if (ret)
>>>> +                goto out_free;
>>>> +            zone_idx++;
>>>> +        }
>>>> +
>>>> +        sector += ns->zsze * nz;
>>>> +    }
>>>> +
>>>> +    ret = zone_idx;
>>>> +out_free:
>>>> +    kvfree(report);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_ns_head *head = NULL;
>>>> +    struct nvme_ns *ns;
>>>> +    int srcu_idx, ret;
>>>> +
>>>> +    ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>>>> +    if (unlikely(!ns))
>>>> +        return -EWOULDBLOCK;
>>>> +
>>>> +    if (ns->head->ids.csi == NVME_CSI_ZNS)
>>>> +        ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>>>> +    else
>>>> +        ret = -EINVAL;
>>>> +    nvme_put_ns_from_disk(head, srcu_idx);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct
>>>> request *req,
>>>> +        struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    c->zms.opcode = nvme_cmd_zone_mgmt_send;
>>>> +    c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>> +    c->zms.action = action;
>>>> +
>>>> +    if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>>> +        c->zms.select = 1;
>>>> +
>>>> +    return BLK_STS_OK;
>>>> +}
>>>> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>>>> index ea25da572eed..7b3fa7de07bd 100644
>>>> --- a/include/linux/nvme.h
>>>> +++ b/include/linux/nvme.h
>>>> @@ -374,6 +374,30 @@ struct nvme_id_ns {
>>>>      __u8            vs[3712];
>>>> };
>>>>
>>>> +struct nvme_zns_lbafe {
>>>> +    __le64            zsze;
>>>> +    __u8            zdes;
>>>> +    __u8            rsvd9[7];
>>>> +};
>>>> +
>>>> +struct nvme_id_ns_zns {
>>>> +    __le16            zoc;
>>>> +    __le16            ozcs;
>>>> +    __le32            mar;
>>>> +    __le32            mor;
>>>> +    __le32            rrl;
>>>> +    __le32            frl;
>>>> +    __u8            rsvd20[2796];
>>>> +    struct nvme_zns_lbafe    lbafe[16];
>>>> +    __u8            rsvd3072[768];
>>>> +    __u8            vs[256];
>>>> +};
>>>> +
>>>> +struct nvme_id_ctrl_zns {
>>>> +    __u8    zamds;
>>>> +    __u8    rsvd1[4095];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_ID_CNS_NS            = 0x00,
>>>>      NVME_ID_CNS_CTRL        = 0x01,
>>>> @@ -392,6 +416,7 @@ enum {
>>>>
>>>> enum {
>>>>      NVME_CSI_NVM            = 0,
>>>> +    NVME_CSI_ZNS            = 2,
>>>> };
>>>>
>>>> enum {
>>>> @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
>>>>      __le16    rsvd10[3];
>>>> };
>>>>
>>>> +struct nvme_zone_descriptor {
>>>> +    __u8        zt;
>>>> +    __u8        zs;
>>>> +    __u8        za;
>>>> +    __u8        rsvd3[5];
>>>> +    __le64        zcap;
>>>> +    __le64        zslba;
>>>> +    __le64        wp;
>>>> +    __u8        rsvd32[32];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZONE_TYPE_SEQWRITE_REQ    = 0x2,
>>>> +};
>>>> +
>>>> +struct nvme_zone_report {
>>>> +    __le64        nr_zones;
>>>> +    __u8        resv8[56];
>>>> +    struct nvme_zone_descriptor entries[];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_SMART_CRIT_SPARE        = 1 << 0,
>>>>      NVME_SMART_CRIT_TEMPERATURE    = 1 << 1,
>>>> @@ -626,6 +672,9 @@ enum nvme_opcode {
>>>>      nvme_cmd_resv_report    = 0x0e,
>>>>      nvme_cmd_resv_acquire    = 0x11,
>>>>      nvme_cmd_resv_release    = 0x15,
>>>> +    nvme_cmd_zone_mgmt_send    = 0x79,
>>>> +    nvme_cmd_zone_mgmt_recv    = 0x7a,
>>>> +    nvme_cmd_zone_append    = 0x7d,
>>>> };
>>>>
>>>> #define nvme_opcode_name(opcode)    { opcode, #opcode }
>>>> @@ -764,6 +813,7 @@ struct nvme_rw_command {
>>>> enum {
>>>>      NVME_RW_LR            = 1 << 15,
>>>>      NVME_RW_FUA            = 1 << 14,
>>>> +    NVME_RW_APPEND_PIREMAP        = 1 << 9,
>>>>      NVME_RW_DSM_FREQ_UNSPEC        = 0,
>>>>      NVME_RW_DSM_FREQ_TYPICAL    = 1,
>>>>      NVME_RW_DSM_FREQ_RARE        = 2,
>>>> @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
>>>>      __le16            appmask;
>>>> };
>>>>
>>>> +enum nvme_zone_mgmt_action {
>>>> +    NVME_ZONE_CLOSE        = 0x1,
>>>> +    NVME_ZONE_FINISH    = 0x2,
>>>> +    NVME_ZONE_OPEN        = 0x3,
>>>> +    NVME_ZONE_RESET        = 0x4,
>>>> +    NVME_ZONE_OFFLINE    = 0x5,
>>>> +    NVME_ZONE_SET_DESC_EXT    = 0x10,
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_send_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le32            cdw2[2];
>>>> +    __le64            metadata;
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            cdw12;
>>>> +    __u8            action;
>>> Why not zsa to make it easier to match to the spec?
>>>
>>>
>>>> +    __u8            select;
>>> sel_all?
>>>
>>>> +    __u8            rsvd13[2];
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_recv_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le64            rsvd2[2];
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            numd;
>>>> +    __u8            zra;
>>>> +    __u8            zrasf;
>>>> +    __u8            pr;
>>> Partial Report is just one bit in the "Zone Receive Action Specific
>>> Features". What about zrasfe?
>> There are currently no users of pr, and bits 1-7 are reserved in the spec.
>> Users of the pr variable should shift and mask as necessary.
>>
>> zrasf looks good to me. It is defined as a byte in the spec.
> I meant for the pr variable name. Agree with the rest.
>
>>>> +    __u8            rsvd13;
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZRA_ZONE_REPORT        = 0,
>>>> +    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>>>> +    NVME_REPORT_ZONE_PARTIAL    = 1,
>>>> +};
>>>> +
>>>> /* Features */
>>>>
>>>> enum {
>>>> @@ -1300,6 +1397,8 @@ struct nvme_command {
>>>>          struct nvme_format_cmd format;
>>>>          struct nvme_dsm_cmd dsm;
>>>>          struct nvme_write_zeroes_cmd write_zeroes;
>>>> +        struct nvme_zone_mgmt_send_cmd zms;
>>>> +        struct nvme_zone_mgmt_recv_cmd zmr;
>>>>          struct nvme_abort_cmd abort;
>>>>          struct nvme_get_log_page_command get_log_page;
>>>>          struct nvmf_common_command fabrics;
>>>> @@ -1433,6 +1532,18 @@ enum {
>>>>      NVME_SC_DISCOVERY_RESTART    = 0x190,
>>>>      NVME_SC_AUTH_REQUIRED        = 0x191,
>>>>
>>>> +    /*
>>>> +     * I/O Command Set Specific - Zoned commands:
>>>> +     */
>>>> +    NVME_SC_ZONE_BOUNDARY_ERROR    = 0x1b8,
>>>> +    NVME_SC_ZONE_FULL        = 0x1b9,
>>>> +    NVME_SC_ZONE_READ_ONLY        = 0x1ba,
>>>> +    NVME_SC_ZONE_OFFLINE        = 0x1bb,
>>>> +    NVME_SC_ZONE_INVALID_WRITE    = 0x1bc,
>>>> +    NVME_SC_ZONE_TOO_MANY_ACTIVE    = 0x1bd,
>>>> +    NVME_SC_ZONE_TOO_MANY_OPEN    = 0x1be,
>>>> +    NVME_SC_ZONE_INVALID_TRANSITION    = 0x1bf,
>>>> +
>>>>      /*
>>>>       * Media and Data Integrity Errors:
>>>>       */
>>>> -- 
>>>> 2.24.1
>>>>
> _______________________________________________
> linux-nvme mailing list
> linux-nvme@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
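
As a side note on the pr byte discussed in the exchange above: since bits 1-7
are reserved, any future user would mask off everything but bit 0. A minimal,
hypothetical sketch using the field and constant names from the quoted patch
(this helper does not exist in the patch):

/* Hypothetical helper, not part of the patch under review. */
static inline bool nvme_zmr_partial_report(const struct nvme_zone_mgmt_recv_cmd *zmr)
{
	/* Bit 0 is Partial Report; bits 1-7 are reserved, so mask them off. */
	return zmr->pr & NVME_REPORT_ZONE_PARTIAL;
}
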
Judy Brock June 16, 2020, 1:32 p.m. UTC | #14
Ok, last comment. I should have read Damien's explanation more carefully. I'm backing off... I see it has to do with needing to preserve write order / zone write locking - a lot of knowledge I don't have about the Linux storage stack (but I now see that is the performance penalty he's referring to, which makes sense if one really did have to lose the overlapped-write capability with non-Append-capable ZNS drives).

"To allow other ZNS drives, an emulation similar to SCSI can be implemented, with that emulation ideally combined to work for both types of drives if possible. And note that this emulation would require the drive to be operated with mq-deadline to enable zone write locking for preserving write command order"

But of course that is not what we want to do with ZRWA-capable ZNS drives.  So again, I'm outta here.

-----Original Message-----
From: Judy Brock 
Sent: Tuesday, June 16, 2020 6:18 AM
To: Judy Brock; Matias Bjørling; Javier González
Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
Subject: RE: [PATCH 5/5] nvme: support for zoned namespaces


Wait a minute. I think I misunderstood what Damien was referring to with respect to the performance penalty. I took it to mean a penalty incurred by having to do translation (like a SCSI translation layer, etc.), but if he was referring to the fact that Append gives queue depth > 1, and he was assuming non-Zone-Append SSDs would only support QD = 1, then we are all good, since ZRWA is an alternate approach to achieving QD > 1.

If that's what you meant Damien, my bad.

Thanks,
Judy

-----Original Message-----
From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Judy Brock
Sent: Tuesday, June 16, 2020 6:13 AM
To: Matias Bjørling; Javier González
Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
Subject: RE: [PATCH 5/5] nvme: support for zoned namespaces

Sorry again then if I overreacted. But it seemed that Damien essentially said that, at best, non-Zone-Append ZNS SSDs will be at a large performance disadvantage. I assume that is even with the patches Javier creates, due to the fact that the whole stack mandates Append from the top down, and anything that doesn't natively support it can't really be integrated seamlessly no matter what.

If that's true, then how will patches solve it?

-----Original Message-----
From: Matias Bjørling [mailto:mb@lightnvm.io] 
Sent: Tuesday, June 16, 2020 5:38 AM
To: Judy Brock; Javier González
Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces

On 16/06/2020 14.35, Judy Brock wrote:
> 	>>> A namespace that does not support append is not supported by the driver.
>
> I am not that knowledgeable about Linux kernel drivers, so maybe this is a dumb question, but won't the driver in question be the default kernel driver for ZNS devices? If so, why would that driver deliberately reject a device which is 100% compliant with the ZNS spec?
> That would seem to favor specific implementations, which seems like an inappropriate thing for a community driver to do unless there is an actual technical reason the driver is unable to function w/o append. Is there any such reason, and if so, what is it? Thanks, and sorry if I've misunderstood.

Hi Judy,

This has been solved. Javier has said he will send patches that support 
the above use case. It is outside the scope of this patchset.

Best, Matias

> Judy
>
> -----Original Message-----
> From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Javier González
> Sent: Tuesday, June 16, 2020 5:00 AM
> To: Matias Bjørling
> Cc: Jens Axboe; Niklas Cassel; Damien Le Moal; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjørling
> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
>
> On 16.06.2020 13:18, Matias Bjørling wrote:
>> On 16/06/2020 12.41, Javier González wrote:
>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>> Command Set Identifier reported in the namespaces Namespace
>>>> Identification Descriptor list. A successfully discovered Zoned
>>>> Namespace will be registered with the block layer as a host managed
>>>> zoned block device with Zone Append command support. A namespace that
>>>> does not support append is not supported by the driver.
>>> Why are we enforcing the append command? Append is optional on the
>>> current ZNS specification, so we should not make this mandatory in the
>>> implementation. See specifics below.
>> There is already general support in the kernel for the zone append
>> command. Feel free to submit patches to emulate the support. It is
>> outside the scope of this patchset.
>>
> It is fine that the kernel supports append, but the ZNS specification
> does not impose the implementation for append, so the driver should not
> do that either.
>
> ZNS SSDs that choose to leave append as a non-implemented optional
> command should not rely on emulated SW support, specially when
> traditional writes work very fine for a large part of current ZNS use
> cases.
>
> Please, remove this virtual constraint.
>
>>>> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
>>>> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
>>>> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
>>>> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
>>>> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
>>>> Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
>>>> Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
>>>> Signed-off-by: Keith Busch <keith.busch@wdc.com>
>>>> ---
>>>> drivers/nvme/host/Makefile |   1 +
>>>> drivers/nvme/host/core.c   |  91 ++++++++++++--
>>>> drivers/nvme/host/nvme.h   |  39 ++++++
>>>> drivers/nvme/host/zns.c    | 238 +++++++++++++++++++++++++++++++++++++
>>>> include/linux/nvme.h       | 111 +++++++++++++++++
>>>> 5 files changed, 468 insertions(+), 12 deletions(-)
>>>> create mode 100644 drivers/nvme/host/zns.c
>>>>
>>>> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>>>> index fc7b26be692d..d7f6a87687b8 100644
>>>> --- a/drivers/nvme/host/Makefile
>>>> +++ b/drivers/nvme/host/Makefile
>>>> @@ -13,6 +13,7 @@ nvme-core-y                := core.o
>>>> nvme-core-$(CONFIG_TRACING)        += trace.o
>>>> nvme-core-$(CONFIG_NVME_MULTIPATH)    += multipath.o
>>>> nvme-core-$(CONFIG_NVM)            += lightnvm.o
>>>> +nvme-core-$(CONFIG_BLK_DEV_ZONED)    += zns.o
>>>> nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)    += fault_inject.o
>>>> nvme-core-$(CONFIG_NVME_HWMON)        += hwmon.o
>>>>
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index 58f137b9f2c5..e961910da4ac 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
>>>> static struct class *nvme_class;
>>>> static struct class *nvme_subsys_class;
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk);
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk);
>>>> static void nvme_put_subsystem(struct nvme_subsystem *subsys);
>>>> static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>>>>                         unsigned nsid);
>>>> @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
>>>>              nvme_retry_req(req);
>>>>              return;
>>>>          }
>>>> +    } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>>>> +           req_op(req) == REQ_OP_ZONE_APPEND) {
>>>> +        req->__sector = nvme_lba_to_sect(req->q->queuedata,
>>>> +            le64_to_cpu(nvme_req(req)->result.u64));
>>>>      }
>>>>
>>>>      nvme_trace_bio_complete(req, status);
>>>> @@ -673,7 +677,8 @@ static inline blk_status_t
>>>> nvme_setup_write_zeroes(struct nvme_ns *ns,
>>>> }
>>>>
>>>> static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>>>> -        struct request *req, struct nvme_command *cmnd)
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_opcode op)
>>>> {
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>>      u16 control = 0;
>>>> @@ -687,7 +692,7 @@ static inline blk_status_t
>>>> nvme_setup_rw(struct nvme_ns *ns,
>>>>      if (req->cmd_flags & REQ_RAHEAD)
>>>>          dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
>>>>
>>>> -    cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write :
>>>> nvme_cmd_read);
>>>> +    cmnd->rw.opcode = op;
>>>>      cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
>>>>      cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>>      cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >>
>>>> ns->lba_shift) - 1);
>>>> @@ -716,6 +721,8 @@ static inline blk_status_t
>>>> nvme_setup_rw(struct nvme_ns *ns,
>>>>          case NVME_NS_DPS_PI_TYPE2:
>>>>              control |= NVME_RW_PRINFO_PRCHK_GUARD |
>>>>                      NVME_RW_PRINFO_PRCHK_REF;
>>>> +            if (op == nvme_cmd_zone_append)
>>>> +                control |= NVME_RW_APPEND_PIREMAP;
>>>>              cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
>>>>              break;
>>>>          }
>>>> @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns
>>>> *ns, struct request *req,
>>>>      case REQ_OP_FLUSH:
>>>>          nvme_setup_flush(ns, cmd);
>>>>          break;
>>>> +    case REQ_OP_ZONE_RESET_ALL:
>>>> +    case REQ_OP_ZONE_RESET:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_OPEN:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_CLOSE:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_FINISH:
>>>> +        ret = nvme_setup_zone_mgmt_send(ns, req, cmd,
>>>> NVME_ZONE_FINISH);
>>>> +        break;
>>>>      case REQ_OP_WRITE_ZEROES:
>>>>          ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>          break;
>>>> @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns
>>>> *ns, struct request *req,
>>>>          ret = nvme_setup_discard(ns, req, cmd);
>>>>          break;
>>>>      case REQ_OP_READ:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
>>>> +        break;
>>>>      case REQ_OP_WRITE:
>>>> -        ret = nvme_setup_rw(ns, req, cmd);
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
>>>> +        break;
>>>> +    case REQ_OP_ZONE_APPEND:
>>>> +        ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
>>>>          break;
>>>>      default:
>>>>          WARN_ON_ONCE(1);
>>>> @@ -1392,14 +1417,23 @@ static u32 nvme_passthru_start(struct
>>>> nvme_ctrl *ctrl, struct nvme_ns *ns,
>>>>      return effects;
>>>> }
>>>>
>>>> -static void nvme_update_formats(struct nvme_ctrl *ctrl)
>>>> +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
>>>> {
>>>>      struct nvme_ns *ns;
>>>>
>>>>      down_read(&ctrl->namespaces_rwsem);
>>>>      list_for_each_entry(ns, &ctrl->namespaces, list)
>>>> -        if (ns->disk && nvme_revalidate_disk(ns->disk))
>>>> +        if (ns->disk && _nvme_revalidate_disk(ns->disk))
>>>>              nvme_set_queue_dying(ns);
>>>> +        else if (blk_queue_is_zoned(ns->disk->queue)) {
>>>> +            /*
>>>> +             * IO commands are required to fully revalidate a zoned
>>>> +             * device. Force the command effects to trigger rescan
>>>> +             * work so report zones can run in a context with
>>>> +             * unfrozen IO queues.
>>>> +             */
>>>> +            *effects |= NVME_CMD_EFFECTS_NCC;
>>>> +        }
>>>>      up_read(&ctrl->namespaces_rwsem);
>>>> }
>>>>
>>>> @@ -1411,7 +1445,7 @@ static void nvme_passthru_end(struct
>>>> nvme_ctrl *ctrl, u32 effects)
>>>>       * this command.
>>>>       */
>>>>      if (effects & NVME_CMD_EFFECTS_LBCC)
>>>> -        nvme_update_formats(ctrl);
>>>> +        nvme_update_formats(ctrl, &effects);
>>>>      if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
>>>>          nvme_unfreeze(ctrl);
>>>>          nvme_mpath_unfreeze(ctrl->subsys);
>>>> @@ -1526,7 +1560,7 @@ static int nvme_user_cmd64(struct nvme_ctrl
>>>> *ctrl, struct nvme_ns *ns,
>>>>   * Issue ioctl requests on the first available path.  Note that
>>>> unlike normal
>>>>   * block layer requests we will not retry failed request on
>>>> another controller.
>>>>   */
>>>> -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>          struct nvme_ns_head **head, int *srcu_idx)
>>>> {
>>>> #ifdef CONFIG_NVME_MULTIPATH
>>>> @@ -1546,7 +1580,7 @@ static struct nvme_ns
>>>> *nvme_get_ns_from_disk(struct gendisk *disk,
>>>>      return disk->private_data;
>>>> }
>>>>
>>>> -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
>>>> {
>>>>      if (head)
>>>>          srcu_read_unlock(&head->srcu, idx);
>>>> @@ -1939,21 +1973,28 @@ static void nvme_update_disk_info(struct
>>>> gendisk *disk,
>>>>
>>>> static int __nvme_revalidate_disk(struct gendisk *disk, struct
>>>> nvme_id_ns *id)
>>>> {
>>>> +    unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
>>>>      struct nvme_ns *ns = disk->private_data;
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +    int ret;
>>>>      u32 iob;
>>>>
>>>>      /*
>>>>       * If identify namespace failed, use default 512 byte block size so
>>>>       * block layer can use before failing read/write for 0 capacity.
>>>>       */
>>>> -    ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
>>>> +    ns->lba_shift = id->lbaf[lbaf].ds;
>>>>      if (ns->lba_shift == 0)
>>>>          ns->lba_shift = 9;
>>>>
>>>>      switch (ns->head->ids.csi) {
>>>>      case NVME_CSI_NVM:
>>>>          break;
>>>> +    case NVME_CSI_ZNS:
>>>> +        ret = nvme_update_zone_info(disk, ns, lbaf);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +        break;
>>>>      default:
>>>>          dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
>>>>              ns->head->ids.csi, ns->head->ns_id);
>>>> @@ -1967,7 +2008,7 @@ static int __nvme_revalidate_disk(struct
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>          iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
>>>>
>>>>      ns->features = 0;
>>>> -    ns->ms = le16_to_cpu(id->lbaf[id->flbas &
>>>> NVME_NS_FLBAS_LBA_MASK].ms);
>>>> +    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
>>>>      /* the PI implementation requires metadata equal t10 pi tuple
>>>> size */
>>>>      if (ns->ms == sizeof(struct t10_pi_tuple))
>>>>          ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
>>>> @@ -2010,7 +2051,7 @@ static int __nvme_revalidate_disk(struct
>>>> gendisk *disk, struct nvme_id_ns *id)
>>>>      return 0;
>>>> }
>>>>
>>>> -static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +static int _nvme_revalidate_disk(struct gendisk *disk)
>>>> {
>>>>      struct nvme_ns *ns = disk->private_data;
>>>>      struct nvme_ctrl *ctrl = ns->ctrl;
>>>> @@ -2058,6 +2099,28 @@ static int nvme_revalidate_disk(struct
>>>> gendisk *disk)
>>>>      return ret;
>>>> }
>>>>
>>>> +static int nvme_revalidate_disk(struct gendisk *disk)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    ret = _nvme_revalidate_disk(disk);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    if (blk_queue_is_zoned(disk->queue)) {
>>>> +        struct nvme_ns *ns = disk->private_data;
>>>> +        struct nvme_ctrl *ctrl = ns->ctrl;
>>>> +
>>>> +        ret = blk_revalidate_disk_zones(disk, NULL);
>>>> +        if (!ret)
>>>> +            blk_queue_max_zone_append_sectors(disk->queue,
>>>> +                              ctrl->max_zone_append);
>>>> +    }
>>>> +#endif
>>>> +    return ret;
>>>> +}
>>>> +
>>>> static char nvme_pr_type(enum pr_type type)
>>>> {
>>>>      switch (type) {
>>>> @@ -2188,6 +2251,7 @@ static const struct block_device_operations
>>>> nvme_fops = {
>>>>      .release    = nvme_release,
>>>>      .getgeo        = nvme_getgeo,
>>>>      .revalidate_disk= nvme_revalidate_disk,
>>>> +    .report_zones    = nvme_report_zones,
>>>>      .pr_ops        = &nvme_pr_ops,
>>>> };
>>>>
>>>> @@ -2213,6 +2277,7 @@ const struct block_device_operations
>>>> nvme_ns_head_ops = {
>>>>      .ioctl        = nvme_ioctl,
>>>>      .compat_ioctl    = nvme_compat_ioctl,
>>>>      .getgeo        = nvme_getgeo,
>>>> +    .report_zones    = nvme_report_zones,
>>>>      .pr_ops        = &nvme_pr_ops,
>>>> };
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>> @@ -4439,6 +4504,8 @@ static inline void _nvme_check_size(void)
>>>>      BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>> +    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) !=
>>>> NVME_IDENTIFY_DATA_SIZE);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
>>>>      BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>> index 58428e3a590e..662f95fbd909 100644
>>>> --- a/drivers/nvme/host/nvme.h
>>>> +++ b/drivers/nvme/host/nvme.h
>>>> @@ -239,6 +239,9 @@ struct nvme_ctrl {
>>>>      u32 max_hw_sectors;
>>>>      u32 max_segments;
>>>>      u32 max_integrity_segments;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u32 max_zone_append;
>>>> +#endif
>>>>      u16 crdt[3];
>>>>      u16 oncs;
>>>>      u16 oacs;
>>>> @@ -403,6 +406,9 @@ struct nvme_ns {
>>>>      u16 sgs;
>>>>      u32 sws;
>>>>      u8 pi_type;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +    u64 zsze;
>>>> +#endif
>>>>      unsigned long features;
>>>>      unsigned long flags;
>>>> #define NVME_NS_REMOVING    0
>>>> @@ -568,6 +574,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
>>>>
>>>> int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8
>>>> lsp, u8 csi,
>>>>          void *log, size_t size, u64 offset);
>>>> +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
>>>> +        struct nvme_ns_head **head, int *srcu_idx);
>>>> +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
>>>>
>>>> extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>>> extern const struct block_device_operations nvme_ns_head_ops;
>>>> @@ -689,6 +698,36 @@ static inline void
>>>> nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>>> }
>>>> #endif /* CONFIG_NVME_MULTIPATH */
>>>>
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf);
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data);
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct
>>>> request *req,
>>>> +                       struct nvme_command *cmnd,
>>>> +                       enum nvme_zone_mgmt_action action);
>>>> +#else
>>>> +#define nvme_report_zones NULL
>>>> +
>>>> +static inline blk_status_t nvme_setup_zone_mgmt_send(struct
>>>> nvme_ns *ns,
>>>> +        struct request *req, struct nvme_command *cmnd,
>>>> +        enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    return BLK_STS_NOTSUPP;
>>>> +}
>>>> +
>>>> +static inline int nvme_update_zone_info(struct gendisk *disk,
>>>> +                    struct nvme_ns *ns,
>>>> +                    unsigned lbaf)
>>>> +{
>>>> +    dev_warn(ns->ctrl->device,
>>>> +         "Please enable CONFIG_BLK_DEV_ZONED to support ZNS
>>>> devices\n");
>>>> +    return -EPROTONOSUPPORT;
>>>> +}
>>>> +#endif
>>>> +
>>>> #ifdef CONFIG_NVM
>>>> int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
>>>> void nvme_nvm_unregister(struct nvme_ns *ns);
>>>> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>>>> new file mode 100644
>>>> index 000000000000..c08f6281b614
>>>> --- /dev/null
>>>> +++ b/drivers/nvme/host/zns.c
>>>> @@ -0,0 +1,238 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>>> + */
>>>> +
>>>> +#include <linux/blkdev.h>
>>>> +#include <linux/vmalloc.h>
>>>> +#include "nvme.h"
>>>> +
>>>> +static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ctrl_zns *id;
>>>> +    int status;
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.cns = NVME_ID_CNS_CS_CTRL;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
>>>> +    if (status) {
>>>> +        kfree(id);
>>>> +        return status;
>>>> +    }
>>>> +
>>>> +    ctrl->max_zone_append = 1 << (id->zamds + 3);
>>>> +    kfree(id);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
>>>> +              unsigned lbaf)
>>>> +{
>>>> +    struct nvme_effects_log *log = ns->head->effects;
>>>> +    struct request_queue *q = disk->queue;
>>>> +    struct nvme_command c = { };
>>>> +    struct nvme_id_ns_zns *id;
>>>> +    int status;
>>>> +
>>>> +    /* Driver requires zone append support */
>>>> +    if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
>>>> +        return -ENODEV;
>>> Following up on the initial comment, this check should go.
>> See first comment.
> See above and please remove.
>
>>>> +
>>>> +    /* Lazily query controller append limit for the first zoned
>>>> namespace */
>>>> +    if (!ns->ctrl->max_zone_append) {
>>>> +        status = nvme_set_max_append(ns->ctrl);
>>>> +        if (status)
>>>> +            return status;
>>>> +    }
>>> This should only be applied if append is supported.
>> See first comment.
>>
>>>> +
>>>> +    id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +    if (!id)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    c.identify.opcode = nvme_admin_identify;
>>>> +    c.identify.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.identify.cns = NVME_ID_CNS_CS_NS;
>>>> +    c.identify.csi = NVME_CSI_ZNS;
>>>> +
>>>> +    status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id,
>>>> sizeof(*id));
>>>> +    if (status)
>>>> +        goto free_data;
>>>> +
>>>> +    /*
>>>> +     * We currently do not handle devices requiring any of the zoned
>>>> +     * operation characteristics.
>>>> +     */
>>>> +    if (id->zoc) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>> I understand that "Variable Zone Capacity" is not supported as it
>>> requires major changes at this moment, but we should support controllers
>>> that enable "Zone Active Excursions", even when the AER event is not
>>> implemented in this patchset.
>>
>> NAK. Similarly to VZC, this allows an unsuspecting user to have major
>> data loss when a zone is suddenly moved to Full.
> I buy that.
>
>>
>>>> +
>>>> +    ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
>>>> +    if (!ns->zsze) {
>>>> +        status = -EINVAL;
>>>> +        goto free_data;
>>>> +    }
>>>> +
>>>> +    q->limits.zoned = BLK_ZONED_HM;
>>>> +    blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
>>>> +free_data:
>>>> +    kfree(id);
>>>> +    return status;
>>>> +}
>>>> +
>>>> +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>>> +                      unsigned int nr_zones, size_t *buflen)
>>>> +{
>>>> +    struct request_queue *q = ns->disk->queue;
>>>> +    size_t bufsize;
>>>> +    void *buf;
>>>> +
>>>> +    const size_t min_bufsize = sizeof(struct nvme_zone_report) +
>>>> +                   sizeof(struct nvme_zone_descriptor);
>>>> +
>>>> +    nr_zones = min_t(unsigned int, nr_zones,
>>>> +             get_capacity(ns->disk) >> ilog2(ns->zsze));
>>>> +
>>>> +    bufsize = sizeof(struct nvme_zone_report) +
>>>> +        nr_zones * sizeof(struct nvme_zone_descriptor);
>>>> +    bufsize = min_t(size_t, bufsize,
>>>> +            queue_max_hw_sectors(q) << SECTOR_SHIFT);
>>>> +    bufsize = min_t(size_t, bufsize, queue_max_segments(q) <<
>>>> PAGE_SHIFT);
>>>> +
>>>> +    while (bufsize >= min_bufsize) {
>>>> +        buf = __vmalloc(bufsize,
>>>> +                GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
>>>> +        if (buf) {
>>>> +            *buflen = bufsize;
>>>> +            return buf;
>>>> +        }
>>>> +        bufsize >>= 1;
>>>> +    }
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +                  struct nvme_zone_report *report,
>>>> +                  size_t buflen)
>>>> +{
>>>> +    struct nvme_command c = { };
>>>> +    int ret;
>>>> +
>>>> +    c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
>>>> +    c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
>>>> +    c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
>>>> +    c.zmr.zra = NVME_ZRA_ZONE_REPORT;
>>>> +    c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>>> +    c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>>> +
>>>> +    ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    return le64_to_cpu(report->nr_zones);
>>>> +}
>>>> +
>>>> +static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>>> +                 struct nvme_zone_descriptor *entry,
>>>> +                 unsigned int idx, report_zones_cb cb,
>>>> +                 void *data)
>>>> +{
>>>> +    struct blk_zone zone = { };
>>>> +
>>>> +    if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>>> +        dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>>>> +                entry->zt);
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
>>>> +    zone.cond = entry->zs >> 4;
>>>> +    zone.len = ns->zsze;
>>>> +    zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>>> +    zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>>>> +    zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>>>> +
>>>> +    return cb(&zone, idx, data);
>>>> +}
>>>> +
>>>> +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>>> +            unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_zone_report *report;
>>>> +    int ret, zone_idx = 0;
>>>> +    unsigned int nz, i;
>>>> +    size_t buflen;
>>>> +
>>>> +    report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
>>>> +    if (!report)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    sector &= ~(ns->zsze - 1);
>>>> +    while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>>> +        memset(report, 0, buflen);
>>>> +        ret = __nvme_ns_report_zones(ns, sector, report, buflen);
>>>> +        if (ret < 0)
>>>> +            goto out_free;
>>>> +
>>>> +        nz = min_t(unsigned int, ret, nr_zones);
>>>> +        if (!nz)
>>>> +            break;
>>>> +
>>>> +        for (i = 0; i < nz && zone_idx < nr_zones; i++) {
>>>> +            ret = nvme_zone_parse_entry(ns, &report->entries[i],
>>>> +                            zone_idx, cb, data);
>>>> +            if (ret)
>>>> +                goto out_free;
>>>> +            zone_idx++;
>>>> +        }
>>>> +
>>>> +        sector += ns->zsze * nz;
>>>> +    }
>>>> +
>>>> +    ret = zone_idx;
>>>> +out_free:
>>>> +    kvfree(report);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +int nvme_report_zones(struct gendisk *disk, sector_t sector,
>>>> +              unsigned int nr_zones, report_zones_cb cb, void *data)
>>>> +{
>>>> +    struct nvme_ns_head *head = NULL;
>>>> +    struct nvme_ns *ns;
>>>> +    int srcu_idx, ret;
>>>> +
>>>> +    ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
>>>> +    if (unlikely(!ns))
>>>> +        return -EWOULDBLOCK;
>>>> +
>>>> +    if (ns->head->ids.csi == NVME_CSI_ZNS)
>>>> +        ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
>>>> +    else
>>>> +        ret = -EINVAL;
>>>> +    nvme_put_ns_from_disk(head, srcu_idx);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct
>>>> request *req,
>>>> +        struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>>> +{
>>>> +    c->zms.opcode = nvme_cmd_zone_mgmt_send;
>>>> +    c->zms.nsid = cpu_to_le32(ns->head->ns_id);
>>>> +    c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
>>>> +    c->zms.action = action;
>>>> +
>>>> +    if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>>> +        c->zms.select = 1;
>>>> +
>>>> +    return BLK_STS_OK;
>>>> +}
>>>> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>>>> index ea25da572eed..7b3fa7de07bd 100644
>>>> --- a/include/linux/nvme.h
>>>> +++ b/include/linux/nvme.h
>>>> @@ -374,6 +374,30 @@ struct nvme_id_ns {
>>>>      __u8            vs[3712];
>>>> };
>>>>
>>>> +struct nvme_zns_lbafe {
>>>> +    __le64            zsze;
>>>> +    __u8            zdes;
>>>> +    __u8            rsvd9[7];
>>>> +};
>>>> +
>>>> +struct nvme_id_ns_zns {
>>>> +    __le16            zoc;
>>>> +    __le16            ozcs;
>>>> +    __le32            mar;
>>>> +    __le32            mor;
>>>> +    __le32            rrl;
>>>> +    __le32            frl;
>>>> +    __u8            rsvd20[2796];
>>>> +    struct nvme_zns_lbafe    lbafe[16];
>>>> +    __u8            rsvd3072[768];
>>>> +    __u8            vs[256];
>>>> +};
>>>> +
>>>> +struct nvme_id_ctrl_zns {
>>>> +    __u8    zamds;
>>>> +    __u8    rsvd1[4095];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_ID_CNS_NS            = 0x00,
>>>>      NVME_ID_CNS_CTRL        = 0x01,
>>>> @@ -392,6 +416,7 @@ enum {
>>>>
>>>> enum {
>>>>      NVME_CSI_NVM            = 0,
>>>> +    NVME_CSI_ZNS            = 2,
>>>> };
>>>>
>>>> enum {
>>>> @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr {
>>>>      __le16    rsvd10[3];
>>>> };
>>>>
>>>> +struct nvme_zone_descriptor {
>>>> +    __u8        zt;
>>>> +    __u8        zs;
>>>> +    __u8        za;
>>>> +    __u8        rsvd3[5];
>>>> +    __le64        zcap;
>>>> +    __le64        zslba;
>>>> +    __le64        wp;
>>>> +    __u8        rsvd32[32];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZONE_TYPE_SEQWRITE_REQ    = 0x2,
>>>> +};
>>>> +
>>>> +struct nvme_zone_report {
>>>> +    __le64        nr_zones;
>>>> +    __u8        resv8[56];
>>>> +    struct nvme_zone_descriptor entries[];
>>>> +};
>>>> +
>>>> enum {
>>>>      NVME_SMART_CRIT_SPARE        = 1 << 0,
>>>>      NVME_SMART_CRIT_TEMPERATURE    = 1 << 1,
>>>> @@ -626,6 +672,9 @@ enum nvme_opcode {
>>>>      nvme_cmd_resv_report    = 0x0e,
>>>>      nvme_cmd_resv_acquire    = 0x11,
>>>>      nvme_cmd_resv_release    = 0x15,
>>>> +    nvme_cmd_zone_mgmt_send    = 0x79,
>>>> +    nvme_cmd_zone_mgmt_recv    = 0x7a,
>>>> +    nvme_cmd_zone_append    = 0x7d,
>>>> };
>>>>
>>>> #define nvme_opcode_name(opcode)    { opcode, #opcode }
>>>> @@ -764,6 +813,7 @@ struct nvme_rw_command {
>>>> enum {
>>>>      NVME_RW_LR            = 1 << 15,
>>>>      NVME_RW_FUA            = 1 << 14,
>>>> +    NVME_RW_APPEND_PIREMAP        = 1 << 9,
>>>>      NVME_RW_DSM_FREQ_UNSPEC        = 0,
>>>>      NVME_RW_DSM_FREQ_TYPICAL    = 1,
>>>>      NVME_RW_DSM_FREQ_RARE        = 2,
>>>> @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd {
>>>>      __le16            appmask;
>>>> };
>>>>
>>>> +enum nvme_zone_mgmt_action {
>>>> +    NVME_ZONE_CLOSE        = 0x1,
>>>> +    NVME_ZONE_FINISH    = 0x2,
>>>> +    NVME_ZONE_OPEN        = 0x3,
>>>> +    NVME_ZONE_RESET        = 0x4,
>>>> +    NVME_ZONE_OFFLINE    = 0x5,
>>>> +    NVME_ZONE_SET_DESC_EXT    = 0x10,
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_send_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le32            cdw2[2];
>>>> +    __le64            metadata;
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            cdw12;
>>>> +    __u8            action;
>>> Why not zsa to make it easier to match to the spec?
>>>
>>>
>>>> +    __u8            select;
>>> sel_all?
>>>
>>>> +    __u8            rsvd13[2];
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +struct nvme_zone_mgmt_recv_cmd {
>>>> +    __u8            opcode;
>>>> +    __u8            flags;
>>>> +    __u16            command_id;
>>>> +    __le32            nsid;
>>>> +    __le64            rsvd2[2];
>>>> +    union nvme_data_ptr    dptr;
>>>> +    __le64            slba;
>>>> +    __le32            numd;
>>>> +    __u8            zra;
>>>> +    __u8            zrasf;
>>>> +    __u8            pr;
>>> Partial Report is just one bit in the "Zone Receive Action Specific
>>> Features". What about zrasfe?
>> There are currently no users of pr, and bits 1-7 are reserved in the spec.
>> Users of the pr variable should shift and mask as necessary.
>>
>> zrasf looks good to me. It is defined as a byte in the spec.
> I meant for the pr variable name. Agree with the rest.
>
>>>> +    __u8            rsvd13;
>>>> +    __le32            cdw14[2];
>>>> +};
>>>> +
>>>> +enum {
>>>> +    NVME_ZRA_ZONE_REPORT        = 0,
>>>> +    NVME_ZRASF_ZONE_REPORT_ALL    = 0,
>>>> +    NVME_REPORT_ZONE_PARTIAL    = 1,
>>>> +};
>>>> +
>>>> /* Features */
>>>>
>>>> enum {
>>>> @@ -1300,6 +1397,8 @@ struct nvme_command {
>>>>          struct nvme_format_cmd format;
>>>>          struct nvme_dsm_cmd dsm;
>>>>          struct nvme_write_zeroes_cmd write_zeroes;
>>>> +        struct nvme_zone_mgmt_send_cmd zms;
>>>> +        struct nvme_zone_mgmt_recv_cmd zmr;
>>>>          struct nvme_abort_cmd abort;
>>>>          struct nvme_get_log_page_command get_log_page;
>>>>          struct nvmf_common_command fabrics;
>>>> @@ -1433,6 +1532,18 @@ enum {
>>>>      NVME_SC_DISCOVERY_RESTART    = 0x190,
>>>>      NVME_SC_AUTH_REQUIRED        = 0x191,
>>>>
>>>> +    /*
>>>> +     * I/O Command Set Specific - Zoned commands:
>>>> +     */
>>>> +    NVME_SC_ZONE_BOUNDARY_ERROR    = 0x1b8,
>>>> +    NVME_SC_ZONE_FULL        = 0x1b9,
>>>> +    NVME_SC_ZONE_READ_ONLY        = 0x1ba,
>>>> +    NVME_SC_ZONE_OFFLINE        = 0x1bb,
>>>> +    NVME_SC_ZONE_INVALID_WRITE    = 0x1bc,
>>>> +    NVME_SC_ZONE_TOO_MANY_ACTIVE    = 0x1bd,
>>>> +    NVME_SC_ZONE_TOO_MANY_OPEN    = 0x1be,
>>>> +    NVME_SC_ZONE_INVALID_TRANSITION    = 0x1bf,
>>>> +
>>>>      /*
>>>>       * Media and Data Integrity Errors:
>>>>       */
>>>> -- 
>>>> 2.24.1
>>>>
> _______________________________________________
> linux-nvme mailing list
> linux-nvme@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
Matias Bjorling June 16, 2020, 1:32 p.m. UTC | #15
On 16/06/2020 15.08, Judy Brock wrote:
>      "The on-going re-work of btrfs zone support for instance now relies 100% on zone append being supported.... So the approach is: mandate zone append support for ZNS devices.... To allow other ZNS drives, an emulation similar to SCSI can be implemented, ...  While on a HDD the  performance penalty is minimal, it will likely be *significant* on a SSD."
>
> Wow. Well as I said, I don't know much about Linux but it sounds like the ongoing re-work of btrfs zone support mandating zone append should be revisited.
Feel free to go ahead and suggest an alternative solution that shows the 
same performance benefits. It is open-source, and if you can show and 
_implement_ a better solution, we will review it as any other 
contribution to the open-source ecosystem.
> The reality is there will be flavors of ZNS drives in the market that do not support Append.  As many of you know, the ZRWA technical proposal is well under-way in NVMe ZNS WG.
>
> Ensuring that the entire Linux zone support ecosystem deliberately locks these devices out / or at best consigns them to a severely performance-penalized path, especially given the MULTIPLE statements that have been made in the NVMe ZNS WG by multiple companies regarding the use cases for which Zone Append is an absolute disaster (not my words), seems pretty darn inappropriate.

First a note: I appreciate you bringing up discussions that were held 
within the NVMe ZNS TG, but please note that those discussions happened 
in a forum that is under NDA. This is an open-source mailing list, 
and the content will be available online for many, many years. Please 
refrain from discussing things that are not deemed public by the 
NVMe board of directors.

On your statement: there is no deliberate locking out of devices, any 
more than when a specific feature has not been implemented or a device 
driver is proprietary to a company. Everyone is free to contribute 
to open-source. As Javier has previously pointed out, he intends to 
submit a patchset to add the necessary support for the zone append 
command API.

>
>
>
>
>
> -----Original Message-----
> From: linux-nvme [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Damien Le Moal
> Sent: Tuesday, June 16, 2020 5:36 AM
> To: Javier González; Matias Bjørling
> Cc: Jens Axboe; Niklas Cassel; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias Bjorling
> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
>
> On 2020/06/16 21:24, Javier González wrote:
>> On 16.06.2020 14:06, Matias Bjørling wrote:
>>> On 16/06/2020 14.00, Javier González wrote:
>>>> On 16.06.2020 13:18, Matias Bjørling wrote:
>>>>> On 16/06/2020 12.41, Javier González wrote:
>>>>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>>>>> Command Set Identifier reported in the namespaces Namespace
>>>>>>> Identification Descriptor list. A successfully discovered Zoned
>>>>>>> Namespace will be registered with the block layer as a host managed
>>>>>>> zoned block device with Zone Append command support. A namespace that
>>>>>>> does not support append is not supported by the driver.
>>>>>> Why are we enforcing the append command? Append is optional on the
>>>>>> current ZNS specification, so we should not make this mandatory in the
>>>>>> implementation. See specifics below.
>>>>> There is already general support in the kernel for the zone append
>>>>> command. Feel free to submit patches to emulate the support. It is
>>>>> outside the scope of this patchset.
>>>>>
>>>> It is fine that the kernel supports append, but the ZNS specification
>>>> does not impose the implementation for append, so the driver should not
>>>> do that either.
>>>>
>>>> ZNS SSDs that choose to leave append as a non-implemented optional
>>>> command should not rely on emulated SW support, specially when
>>>> traditional writes work very fine for a large part of current ZNS use
>>>> cases.
>>>>
>>>> Please, remove this virtual constraint.
>>> The Zone Append command is mandatory for zoned block devices. Please
>>> see https://lwn.net/Articles/818709/ for the background.
>> I do not see anywhere in the block layer that append is mandatory for
>> zoned devices. Append is emulated on ZBC, but beyond that there is no
>> mandatory bits. Please explain.
> This is to allow a single write IO path for all types of zoned block device for
> higher layers, e.g file systems. The on-going re-work of btrfs zone support for
> instance now relies 100% on zone append being supported. That significantly
> simplifies the file system support and more importantly remove the need for
> locking around block allocation and BIO issuing, allowing to preserve a fully
> asynchronous write path that can include workqueues for efficient CPU usage of
> things like encryption and compression. Without zone append, file system would
> either (1) have to reject these drives that do not support zone append, or (2)
> implement 2 different write IO path (slower regular write and zone append). None
> of these options are ideal, to say the least.
>
> So the approach is: mandate zone append support for ZNS devices. To allow other
> ZNS drives, an emulation similar to SCSI can be implemented, with that emulation
> ideally combined to work for both types of drives if possible. And note that
> this emulation would require the drive to be operated with mq-deadline to enable
> zone write locking for preserving write command order. While on a HDD the
> performance penalty is minimal, it will likely be significant on a SSD.
>
>>> Please submit patches if you want to have support for ZNS devices that
>>> does not implement the Zone Append command. It is outside the scope
>>> of this patchset.
>> That we will.
>>
>>
>> _______________________________________________
>> linux-nvme mailing list
>> linux-nvme@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-nvme
>>
>
Damien Le Moal June 16, 2020, 1:34 p.m. UTC | #16
On 2020/06/16 22:08, Judy Brock wrote:
> 
> "The on-going re-work of btrfs zone support for instance now relies 100% on
> zone append being supported.... So the approach is: mandate zone append
> support for ZNS devices.... To allow other ZNS drives, an emulation similar
> to SCSI can be implemented, ...  While on a HDD the  performance penalty is
> minimal, it will likely be *significant* on a SSD."
> 
> Wow. Well as I said, I don't know much about Linux but it sounds like the
> ongoing re-work of btrfs zone support mandating zone append should be
> revisited.
> 
> The reality is there will be flavors of ZNS drives in the market that do not
> support Append.  As many of you know, the ZRWA technical proposal is well
> under-way in NVMe ZNS WG.
> 
> Ensuring that the entire Linux zone support ecosystem deliberately locks
> these devices out / or at best consigns them to a severely
> performance-penalized path, especially given the MULTIPLE statements that
> have been made in the NVMe ZNS WG by multiple companies regarding the use
> cases for which Zone Append is an absolute disaster (not my words), seems
> pretty darn inappropriate.

The software design decision is not about locking out one class of devices; it
is about how to deliver high-performance implementations of file systems for
drives that can actually provide that performance, e.g. SSDs. As I said,
mandating that zone append is always supported by the storage device, either
natively or through emulation, allows an efficient and simple implementation
of zone support at higher levels, in the device mapper and file system layers.

Without this, the file system has to do the serialization of write commands
*and* protect itself against write command reordering by the block IO stack as
that layer of the kernel is totally asynchronous and does not give any guarantee
of a particular command execution order. This complicates the file system
implementation significantly and so is not acceptable.

For zoned devices, the block layer can provide *write* command execution order
guarantees, similarly to what the file system would need to do. That is the
mq-deadline and zone write locking I was referring to. That is acceptable for
SMR HDDs, but will likely have an impact on SSD write performance (that needs to be
checked).
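
To make that concrete, here is a deliberately simplified sketch of what zone
write locking boils down to (this is an illustration, not the actual
mq-deadline or blk-zoned.c code; the example_* names are made up): a per-zone
bit is taken atomically before a write is dispatched, so at most one write per
zone is in flight at any time.

#include <linux/bitops.h>

/*
 * Illustration only: dispatch a write for zone 'zno' only if no other
 * write to that zone is in flight. This is what keeps regular writes
 * ordered per zone, and what native zone append makes unnecessary.
 */
static bool example_zone_write_trylock(unsigned long *zones_wlock, unsigned int zno)
{
	return !test_and_set_bit_lock(zno, zones_wlock);
}

static void example_zone_write_unlock(unsigned long *zones_wlock, unsigned int zno)
{
	clear_bit_unlock(zno, zones_wlock);
}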

Summary: what needs to be done for correctly processing sequential write
commands in Linux is the same no matter which layer implements it: writes must
be throttled to at most one write per zone. This can be done by a file system or
the block layer. Native zone append support by a drive removes all this,
simplifies the code and enables high performance. Zone append emulation in the
driver gives the same code simplification overall, but *may* suffer from the
zone write locking penalty.
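
For comparison, a minimal sketch (not taken from this patchset; the helper name
is made up) of what the zone append path looks like to an in-kernel user. The
bio targets the zone start sector and, on completion, bi_iter.bi_sector is
expected to hold the sector the data actually landed on, so the caller never
has to serialize on the write pointer:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Illustrative helper; error handling trimmed for brevity. */
static int example_zone_append(struct block_device *bdev, sector_t zone_start,
			       struct page *page, unsigned int len,
			       sector_t *written)
{
	struct bio_vec bvec;
	struct bio bio;
	int ret;

	bio_init(&bio, &bvec, 1);
	bio_set_dev(&bio, bdev);
	bio.bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC;
	bio.bi_iter.bi_sector = zone_start;	/* zone start, not the write pointer */
	bio_add_page(&bio, page, len, 0);	/* single page, len <= PAGE_SIZE */

	ret = submit_bio_wait(&bio);
	if (!ret)
		*written = bio.bi_iter.bi_sector; /* where the data was appended */
	return ret;
}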

Overall, we get code simplification at the file system layer, with only a single
area where performance may not be optimal. Any other design choice would result
in much worse situations:
1) complex code everywhere, as the file systems would have to support both
the regular write and zone append write paths to cover all classes of devices.
2) file systems implementing only the zone append write path end up rejecting
drives that do not have native zone append support.
3) the file system layer supports only regular writes, resulting in complex code
and potentially degraded write performance for *all* devices.



> 
> 
> 
> 
> 
> -----Original Message----- From: linux-nvme
> [mailto:linux-nvme-bounces@lists.infradead.org] On Behalf Of Damien Le Moal 
> Sent: Tuesday, June 16, 2020 5:36 AM To: Javier González; Matias Bjørling Cc:
> Jens Axboe; Niklas Cassel; Ajay Joshi; Sagi Grimberg; Keith Busch; Dmitry
> Fomichev; Aravind Ramesh; linux-nvme@lists.infradead.org;
> linux-block@vger.kernel.org; Hans Holmberg; Christoph Hellwig; Matias
> Bjorling Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
> 
> On 2020/06/16 21:24, Javier González wrote:
>> On 16.06.2020 14:06, Matias Bjørling wrote:
>>> On 16/06/2020 14.00, Javier González wrote:
>>>> On 16.06.2020 13:18, Matias Bjørling wrote:
>>>>> On 16/06/2020 12.41, Javier González wrote:
>>>>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set
>>>>>>> defined in NVM Express TP4053. Zoned namespaces are discovered
>>>>>>> based on their Command Set Identifier reported in the namespaces
>>>>>>> Namespace Identification Descriptor list. A successfully
>>>>>>> discovered Zoned Namespace will be registered with the block
>>>>>>> layer as a host managed zoned block device with Zone Append
>>>>>>> command support. A namespace that does not support append is not
>>>>>>> supported by the driver.
>>>>>> 
>>>>>> Why are we enforcing the append command? Append is optional on the 
>>>>>> current ZNS specification, so we should not make this mandatory in
>>>>>> the implementation. See specifics below.
>>>> 
>>>>> 
>>>>> There is already general support in the kernel for the zone append 
>>>>> command. Feel free to submit patches to emulate the support. It is 
>>>>> outside the scope of this patchset.
>>>>> 
>>>> 
>>>> It is fine that the kernel supports append, but the ZNS specification 
>>>> does not impose the implementation for append, so the driver should
>>>> not do that either.
>>>> 
>>>> ZNS SSDs that choose to leave append as a non-implemented optional 
>>>> command should not rely on emulated SW support, specially when 
>>>> traditional writes work very fine for a large part of current ZNS use 
>>>> cases.
>>>> 
>>>> Please, remove this virtual constraint.
>>> 
>>> The Zone Append command is mandatory for zoned block devices. Please see
>>> https://lwn.net/Articles/818709/
>>> for the background.
>> 
>> I do not see anywhere in the block layer that append is mandatory for zoned
>> devices. Append is emulated on ZBC, but beyond that there is no mandatory
>> bits. Please explain.
> 
> This is to allow a single write IO path for all types of zoned block device
> for higher layers, e.g file systems. The on-going re-work of btrfs zone
> support for instance now relies 100% on zone append being supported. That
> significantly simplifies the file system support and more importantly remove
> the need for locking around block allocation and BIO issuing, allowing to
> preserve a fully asynchronous write path that can include workqueues for
> efficient CPU usage of things like encryption and compression. Without zone
> append, file system would either (1) have to reject these drives that do not
> support zone append, or (2) implement 2 different write IO path (slower
> regular write and zone append). None of these options are ideal, to say the
> least.
> 
> So the approach is: mandate zone append support for ZNS devices. To allow
> other ZNS drives, an emulation similar to SCSI can be implemented, with that
> emulation ideally combined to work for both types of drives if possible. And
> note that this emulation would require the drive to be operated with
> mq-deadline to enable zone write locking for preserving write command order.
> While on a HDD the performance penalty is minimal, it will likely be
> significant on a SSD.
> 
>> 
>>> Please submitpatches if you want to have support for ZNS devices that 
>>> does not implement the Zone Append command. It is outside the scope of
>>> this patchset.
>> 
>> That we will.
>> 
>> 
>> _______________________________________________ linux-nvme mailing list 
>> linux-nvme@lists.infradead.org 
>> https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_linux-2Dnvme&d=DwIFAw&c=JfeWlBa6VbDyTXraMENjy_b_0yKWuqQ4qY-FPhxK4x8w-TfgRBDyeV4hVQQBEgL2&r=YJM_QPk2w1CRIo5NNBXnCXGzNnmIIfG_iTRs6chBf6s&m=-fIHWuFYU2GHiTJ2FuhTBgrypPIJW0FjLUWTaK4cH9c&s=HeBnGkcBM5OqESkW8yYYi2KtvVwbdamrbd_X5PgGKBk&e=
>> 
>> 
> 
>
Damien Le Moal June 16, 2020, 1:39 p.m. UTC | #17
On 2020/06/16 22:32, Judy Brock wrote:
> Ok last comment. I should have read Damien's explanation more carefully. I'm
> backing off... I see it's having to do with needing to preserve write order/
> zone write locking - a lot of knowledge I don't have about the Linux storage
> stack (but I see that is the performance penalty he's referring to now which
> makes sense if one really did have to lose the overlapped write capability
> with non-Append-capable ZNS drives).

Yes, that is what I meant. Regular writes need special treatment to preserve
ordering and that has potential performance implications.

> 
> "To allow other ZNS drives, an emulation similar to SCSI can be implemented,
> with that emulation ideally combined to work for both types of drives if
> possible. And note that this emulation would require the drive to be operated
> with mq-deadline to enable zone write locking for preserving write command
> order"
> 
> But of course that is not what we want to do with ZRWA-capable ZNS drives.

This is an on-going NVMe technical proposal under NDA, so I will refrain from
any comment about this on this public mailing list.
Javier González June 16, 2020, 2:16 p.m. UTC | #18
On 16.06.2020 12:35, Damien Le Moal wrote:
>On 2020/06/16 21:24, Javier González wrote:
>> I do not see anywhere in the block layer that append is mandatory for
>> zoned devices. Append is emulated on ZBC, but beyond that there are no
>> mandatory bits. Please explain.
>
>This is to allow a single write IO path for all types of zoned block device for
>higher layers, e.g. file systems. The on-going re-work of btrfs zone support,
>for instance, now relies 100% on zone append being supported. That significantly
>simplifies the file system support and, more importantly, removes the need for
>locking around block allocation and BIO issuing, preserving a fully
>asynchronous write path that can include workqueues for efficient CPU usage of
>things like encryption and compression. Without zone append, file systems would
>either (1) have to reject drives that do not support zone append, or (2)
>implement two different write IO paths (slower regular write and zone append).
>Neither option is ideal, to say the least.
>
>So the approach is: mandate zone append support for ZNS devices. To allow other
>ZNS drives, an emulation similar to SCSI can be implemented, with that emulation
>ideally combined to work for both types of drives if possible.

Enforcing QD=1 becomes a problem on devices with large zones. In
a ZNS device that has smaller zones this should not be a problem.

Would you agree that it is possible to have a write path that relies on
QD=1, where the FS / application has the responsibility for enforcing
this? Down the road this QD can be increased if the device is able to
buffer the writes.

I would be OK with some FS implementations relying on append and imposing
the constraint that append has to be supported (and it would be our job
to change that), but I would like to avoid the driver refusing to
initialize the device just because current FS implementations have
implemented this logic.

We can agree that a number of initial customers will use these devices
raw, using the in-kernel I/O path, but without a FS on top.

Thoughts?

> and note that
>this emulation would require the drive to be operated with mq-deadline to enable
>zone write locking for preserving write command order. While on a HDD the
>performance penalty is minimal, it will likely be significant on a SSD.

Exactly my concern. I do not want ZNS SSDs to be impacted by this type
of design decision at the driver level.

Thanks,
Javier
Damien Le Moal June 16, 2020, 2:42 p.m. UTC | #19
On 2020/06/16 23:16, Javier González wrote:
> 
> Enforcing QD=1 becomes a problem on devices with large zones. In
> a ZNS device that has smaller zones this should not be a problem.

Let's be precise: this is not running the drive at QD=1, it is "at most one
write *request* per zone". If the FS is simultaneously using multiple block
groups mapped to different zones, you will get a total write QD > 1, and as many
reads as you want.

> Would you agree that it is possible to have a write path that relies on
> QD=1, where the FS / application has the responsibility for enforcing
> this? Down the road this QD can be increased if the device is able to
> buffer the writes.

Doing QD=1 per zone for writes at the FS layer, that is, at the BIO layer, does
not work. This is because BIOs can be as large as the FS wants them to be. Such
a large BIO will be split into multiple requests in the block layer, resulting
in more than one write per zone. That is why the zone write locking is at the
scheduler level, between BIO split and request dispatch. That prevents the
multiple request fragments of a large BIO from being reordered and failing. That
is mandatory, as the block layer itself can occasionally reorder requests, and
lower levels such as AHCI HW are also notoriously good at reversing sequential
requests. For NVMe with multi-queue, the IO issuing process getting rescheduled
on a different CPU can result in sequential IOs landing in different queues,
with the likely result of an out-of-order execution. All of these cases are
avoided with zone write locking and at most one write request dispatch per zone,
as recommended by the ZNS specification (the ZBC and ZAC standards for SMR HDDs
are silent on this).
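To make that concrete, here is a minimal sketch of the dispatch-time check
(the blk_req_* helpers are the actual block layer API; the function around
them is illustrative only, the real mq-deadline code also handles requeueing
and takes a lock around the check):

/*
 * Only dispatch a write to a sequential zone if no other write to that
 * zone is in flight, and lock the zone for the winner. Reads and writes
 * to conventional zones pass through freely.
 */
static struct request *try_dispatch(struct request *rq)
{
	if (!blk_req_can_dispatch_to_zone(rq))
		return NULL;		/* zone is write-locked, retry later */

	blk_req_zone_write_lock(rq);	/* no-op for requests needing no lock */
	return rq;
	/* blk_req_zone_write_unlock() runs when the request completes */
}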

> I would be OK with some FS implementations relying on append and imposing
> the constraint that append has to be supported (and it would be our job
> to change that), but I would like to avoid the driver refusing to
> initialize the device just because current FS implementations have
> implemented this logic.

What is the difference between the driver rejecting drives and the FS rejecting
the same drives? That has the same end result to me: an entire class of devices
cannot be used as desired by the user. Implementing zone append emulation avoids
the rejection entirely while still allowing the FS to have a single write IO
path, thus simplifying the code.
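Roughly, the core of such an emulation would look like the sketch below. All
of the emu_* and issue_write_at() names are made up; only blk_rq_sectors()
and the blk_status_t values are real kernel API. The driver caches a write
pointer per zone, turns the append into a regular write at that pointer, and
completes the command with the start LBA, which is also why at most one write
per zone may be in flight:

static blk_status_t emulate_zone_append(struct request *rq)
{
	struct emu_zone *z = rq_to_emu_zone(rq);	/* cached per-zone state */
	sector_t wp = z->wp;				/* cached write pointer */

	if (wp + blk_rq_sectors(rq) > z->start + z->len)
		return BLK_STS_IOERR;		/* append would cross the zone */

	issue_write_at(rq, wp);			/* regular write at the wp */
	z->wp = wp + blk_rq_sectors(rq);

	/* on completion, wp is reported back as the append location */
	return BLK_STS_OK;
}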

> We can agree that a number of initial customers will use these devices
> raw, using the in-kernel I/O path, but without a FS on top.
> 
> Thoughts?
> 
>> and note that
>> this emulation would require the drive to be operated with mq-deadline to enable
>> zone write locking for preserving write command order. While on a HDD the
>> performance penalty is minimal, it will likely be significant on a SSD.
> 
> Exactly my concern. I do not want ZNS SSDs to be impacted by this type
> of design decision at the driver level.

But your proposed FS level approach would end up doing the exact same thing with
the same limitation and so the same potential performance impact. The block
layer generic approach has the advantage that we do not bother the higher levels
with the implementation of in-order request dispatch guarantees. File systems
are complex enough. The less complexity zone support requires, the better.
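For reference, the FS side of that single write path is small. A minimal
sketch, assuming a zone append capable device (REQ_OP_ZONE_APPEND and the bio
calls are the real block layer interface, the fs_* names are placeholders):

static void fs_append_end_io(struct bio *bio)
{
	/*
	 * For REQ_OP_ZONE_APPEND, the block layer returns the sector
	 * that was actually written in bi_sector at completion time.
	 */
	fs_record_extent(bio->bi_private, bio->bi_iter.bi_sector);
	bio_put(bio);
}

static void fs_write_extent(struct block_device *bdev, struct page *page,
			    sector_t zone_start, void *extent)
{
	struct bio *bio = bio_alloc(GFP_NOFS, 1);

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = zone_start;	/* target zone, not an exact LBA */
	bio->bi_opf = REQ_OP_ZONE_APPEND;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_private = extent;
	bio->bi_end_io = fs_append_end_io;
	submit_bio(bio);	/* no allocation lock, no ordering to enforce */
}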

Javier González June 16, 2020, 3:02 p.m. UTC | #20
On 16.06.2020 14:42, Damien Le Moal wrote:
>On 2020/06/16 23:16, Javier González wrote:
>>
>> Enforcing QD=1 becomes a problem on devices with large zones. In
>> a ZNS device that has smaller zones this should not be a problem.
>
>Let's be precise: this is not running the drive at QD=1, it is "at most one
>write *request* per zone". If the FS is simultaneously using multiple block
>groups mapped to different zones, you will get a total write QD > 1, and as many
>reads as you want.

I understand. I agree that the current FSs supporting ZNS follow this
approach and it makes sense that there is a common interface that
simplifies the FS implementation. See the comment below on the part where I
believe we see things differently.


>> I would be OK with some FS implementations relying on append and imposing
>> the constraint that append has to be supported (and it would be our job
>> to change that), but I would like to avoid the driver refusing to
>> initialize the device just because current FS implementations have
>> implemented this logic.
>
>What is the difference between the driver rejecting drives and the FS rejecting
>the same drives? That has the same end result to me: an entire class of devices
>cannot be used as desired by the user. Implementing zone append emulation avoids
>the rejection entirely while still allowing the FS to have a single write IO
>path, thus simplifying the code.

The difference is that users who use a raw ZNS device, submitting I/O
through the kernel, would still be able to use these devices. The result
would be that the ZNS SSD is recognized and initialized, but the FS
format fails.

>
>> We can agree that a number of initial customers will use these devices
>> raw, using the in-kernel I/O path, but without a FS on top.
>>
>> Thoughts?
>>
>>> and note that
>>> this emulation would require the drive to be operated with mq-deadline to enable
>>> zone write locking for preserving write command order. While on a HDD the
>>> performance penalty is minimal, it will likely be significant on a SSD.
>>
>> Exactly my concern. I do not want ZNS SSDs to be impacted by this type
>> of design decision at the driver level.
>
>But your proposed FS level approach would end up doing the exact same thing with
>the same limitation and so the same potential performance impact. The block
>layer generic approach has the advantage that we do not bother the higher levels
>with the implementation of in-order request dispatch guarantees. File systems
>are complex enough. The less complexity zone support requires, the better.

This depends very much on how the FS / application is managing
striping. At the moment our main use case is enabling user-space
applications submitting I/Os to raw ZNS devices through the kernel.

Can we enable this use case to start with?

Thanks,
Javier
Matias Bjorling June 16, 2020, 3:20 p.m. UTC | #21
On 16/06/2020 17.02, Javier González wrote:
>
> This depends very much on how the FS / application is managing
> striping. At the moment our main use case is enabling user-space
> applications submitting I/Os to raw ZNS devices through the kernel.
>
> Can we enable this use case to start with?

It is free for everyone to load kernel modules into the kernel. Those 
modules may not have the appropriate checks or may rely on the zone 
append functionality. Having a per-use-case limit is a no-go and at best a 
game of whack-a-mole.

You already agreed to create a set of patches to add the appropriate 
support for emulating zone append. As these would fix your specific 
issue, please go ahead and submit those.

There is another way that you may want to consider: using SPDK, which
bypasses the stack and allows you to issue I/Os as you prefer.

Best, Matias
Keith Busch June 16, 2020, 3:48 p.m. UTC | #22
On Tue, Jun 16, 2020 at 05:02:17PM +0200, Javier González wrote:
> This depends very much on how the FS / application is managing
> striping. At the moment our main use case is enabling user-space
> applications submitting I/Os to raw ZNS devices through the kernel.
> 
> Can we enable this use case to start with?

I think this already provides that. You can set the nsid value to
whatever you want in the passthrough interface, so a namespace block
device is not required to issue I/O to a ZNS namespace from user space.
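For example, something like the sketch below issues a Zone Append (opcode
0x7d in TP 4053) to an arbitrary attached namespace through the controller
character device. It uses the 64-bit passthrough ioctl so the written LBA
comes back in full; the 4096 B LBA size is an assumption of the example:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

static int zone_append(int fd, __u32 nsid, __u64 zslba,
		       void *buf, __u32 nlb0)		/* nlb0 is 0's based */
{
	struct nvme_passthru_cmd64 cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x7d;			/* Zone Append */
	cmd.nsid     = nsid;			/* any attached ZNS namespace */
	cmd.addr     = (__u64)(uintptr_t)buf;
	cmd.data_len = (nlb0 + 1) * 4096;	/* assumes 4K LBAs */
	cmd.cdw10    = (__u32)zslba;		/* ZSLBA, lower 32 bits */
	cmd.cdw11    = (__u32)(zslba >> 32);	/* ZSLBA, upper 32 bits */
	cmd.cdw12    = nlb0;			/* number of logical blocks */

	if (ioctl(fd, NVME_IOCTL_IO64_CMD, &cmd))
		return -1;			/* errno or NVMe status */
	/* cmd.result now holds the LBA the data was appended at */
	return 0;
}

/* e.g. int fd = open("/dev/nvme0", O_RDWR); zone_append(fd, 1, ...); */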
Javier González June 16, 2020, 3:55 p.m. UTC | #23
On 16.06.2020 08:48, Keith Busch wrote:
>On Tue, Jun 16, 2020 at 05:02:17PM +0200, Javier González wrote:
>> This depends very much on how the FS / application is managing
>> striping. At the moment our main use case is enabling user-space
>> applications submitting I/Os to raw ZNS devices through the kernel.
>>
>> Can we enable this use case to start with?
>
>I think this already provides that. You can set the nsid value to
>whatever you want in the passthrough interface, so a namespace block
>device is not required to issue I/O to a ZNS namespace from user space.

Mmmmm. Problem now is that the check on the nvme driver prevents the ZNS
namespace from being initialized. Am I missing something?

Thanks,
Javier
Javier González June 16, 2020, 4:03 p.m. UTC | #24
On 16.06.2020 17:20, Matias Bjørling wrote:
>
>It is free for everyone to load kernel modules into the kernel. Those 
>modules may not have the appropriate checks or may rely on the zone 
>append functionality. Having a per-use-case limit is a no-go and at best
>a game of whack-a-mole.

Let's focus on mainline support. We are leaving append as not enabled
based on customer requests for some ZNS products and would like these
devices to be supported. This is not at all a corner use-case but a very
general one.

>
>You already agreed to create a set of patches to add the appropriate 
>support for emulating zone append. As these would fix your specific 
>issue, please go ahead and submit those.

I agreed to solve the use case that some of our customers are enabling
and this is what I am doing.

Again, to start with I would like to have a path where ZNS namespaces are
identified independently of append support. Then specific users can
require append if they choose to do so. We will of course take care of
sending patches for this.

Thanks,
Javier
Matias Bjørling June 16, 2020, 4:04 p.m. UTC | #25
> -----Original Message-----
> From: Javier González <javier@javigon.com>
> Sent: Tuesday, 16 June 2020 17.55
> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
> 
> Mmmmm. Problem now is that the check on the nvme driver prevents the ZNS
> namespace from being initialized. Am I missing something?

You can issue the I/O to the char device (e.g., /dev/nvme0) and specify the
namespace. That will be initialized. If you like, you can use Keith's libnvme,
which exposes all of the functionality needed to issue passthru I/Os.

Best, Matias
Matias Bjørling June 16, 2020, 4:07 p.m. UTC | #26
> -----Original Message-----
> From: Javier González <javier@javigon.com>
> Sent: Tuesday, 16 June 2020 18.03
> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
> 
> On 16.06.2020 17:20, Matias Bjørling wrote:
> >On 16/06/2020 17.02, Javier González wrote:
> >>On 16.06.2020 14:42, Damien Le Moal wrote:
> >>>On 2020/06/16 23:16, Javier González wrote:
> >>>>On 16.06.2020 12:35, Damien Le Moal wrote:
> >>>>>On 2020/06/16 21:24, Javier González wrote:
> >>>>>>On 16.06.2020 14:06, Matias Bjørling wrote:
> >>>>>>>On 16/06/2020 14.00, Javier González wrote:
> >>>>>>>>On 16.06.2020 13:18, Matias Bjørling wrote:
> >>>>>>>>>On 16/06/2020 12.41, Javier González wrote:
> >>>>>>>>>>On 16.06.2020 08:34, Keith Busch wrote:
> >>>>>>>>>>>Add support for NVM Express Zoned Namespaces (ZNS)
> Command
> >>>>>>>>>>>Set defined in NVM Express TP4053. Zoned namespaces are
> >>>>>>>>>>>discovered based on their Command Set Identifier reported in
> >>>>>>>>>>>the namespaces Namespace Identification Descriptor list. A
> >>>>>>>>>>>successfully discovered Zoned Namespace will be registered
> >>>>>>>>>>>with the block layer as a host managed zoned block device
> >>>>>>>>>>>with Zone Append command support. A namespace that does not
> >>>>>>>>>>>support append is not supported by the driver.
> >>>>>>>>>>
> >>>>>>>>>>Why are we enforcing the append command? Append is optional
> on
> >>>>>>>>>>the current ZNS specification, so we should not make this
> >>>>>>>>>>mandatory in the implementation. See specifics below.
> >>>>>>>>
> >>>>>>>>>
> >>>>>>>>>There is already general support in the kernel for the zone
> >>>>>>>>>append command. Feel free to submit patches to emulate the
> >>>>>>>>>support. It is outside the scope of this patchset.
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>>It is fine that the kernel supports append, but the ZNS
> >>>>>>>>specification does not impose the implementation for append, so
> >>>>>>>>the driver should not do that either.
> >>>>>>>>
> >>>>>>>>ZNS SSDs that choose to leave append as a non-implemented
> >>>>>>>>optional command should not rely on emulated SW support,
> >>>>>>>>specially when traditional writes work very fine for a large
> >>>>>>>>part of current ZNS use cases.
> >>>>>>>>
> >>>>>>>>Please, remove this virtual constraint.
> >>>>>>>
> >>>>>>>The Zone Append command is mandatory for zoned block devices.
> >>>>>>>Please see https://lwn.net/Articles/818709/ for the background.
> >>>>>>
> >>>>>>I do not see anywhere in the block layer that append is mandatory
> >>>>>>for zoned devices. Append is emulated on ZBC, but beyond that
> >>>>>>there is no mandatory bits. Please explain.
> >>>>>
> >>>>>This is to allow a single write IO path for all types of zoned
> >>>>>block device for higher layers, e.g file systems. The on-going
> >>>>>re-work of btrfs zone support for instance now relies 100% on zone
> >>>>>append being supported. That significantly simplifies the file
> >>>>>system support and more importantly remove the need for locking
> >>>>>around block allocation and BIO issuing, allowing to preserve a
> >>>>>fully asynchronous write path that can include workqueues for
> >>>>>efficient CPU usage of things like encryption and compression.
> >>>>>Without zone append, file system would either (1) have to reject
> >>>>>these drives that do not support zone append, or (2) implement 2
> >>>>>different write IO path (slower regular write and zone append).
> >>>>>None of these options are ideal, to say the least.
> >>>>>
> >>>>>So the approach is: mandate zone append support for ZNS devices. To
> >>>>>allow other ZNS drives, an emulation similar to SCSI can be
> >>>>>implemented, with that emulation ideally combined to work for both
> >>>>>types of drives if possible.
> >>>>
> >>>>Enforcing QD=1 becomes a problem on devices with large zones. In a
> >>>>ZNS device that has smaller zones this should not be a problem.
> >>>
> >>>Let's be precise: this is not running the drive at QD=1, it is "at
> >>>most one write *request* per zone". If the FS is simultaneously using
> >>>multiple block groups mapped to different zones, you will get a total
> >>>write QD > 1, and as many reads as you want.
> >>>
> >>>>Would you agree that it is possible to have a write path that relies
> >>>>on QD=1, where the FS / application has the responsibility for
> >>>>enforcing this? Down the road this QD can be increased if the device
> >>>>is able to buffer the writes.
> >>>
> >>>Doing QD=1 per zone for writes at the FS layer, that is, at the BIO
> >>>layer does not work. This is because BIOs can be as large as the FS
> >>>wants them to be. Such large BIO will be split into multiple requests
> >>>in the block layer, resulting in more than one write per zone. That
> >>>is why the zone write locking is at the scheduler level, between BIO
> >>>split and request dispatch. That avoids the multiple requests
> >>>fragments of a large BIO to be reordered and fail. That is mandatory
> >>>as the block layer itself can occasionally reorder requests and lower
> >>>levels such as AHCI HW is also notoriously good at reversing
> >>>sequential requests. For NVMe with multi-queue, the IO issuing
> >>>process getting rescheduled on a different CPU can result in
> >>>sequential IOs being in different queues, with the likely result of
> >>>an out-of-order execution. All cases are avoided with zone write
> >>>locking and at most one write request dispatch per zone as
> >>>recommended by the ZNS specifications (ZBC and ZAC standards for SMR
> >>>HDDs are silent on this).
> >>>
> >>
> >>I understand. I agree that the current FSs supporting ZNS follow this
> >>approach and it makes sense that there is a common interface that
> >>simplifies the FS implementation. See the comment below on the part I
> >>believe we see things differently.
> >>
> >>
> >>>>I would be OK with some FS implementations to rely on append and
> >>>>impose the constraint that append has to be supported (and it would
> >>>>be our job to change that), but I would like to avoid the driver
> >>>>rejecting initializing the device because current FS implementations
> >>>>have implemented this logic.
> >>>
> >>>What is the difference between the driver rejecting drives and the FS
> >>>rejecting the same drives ? That has the same end result to me: an
> >>>entire class of devices cannot be used as desired by the user.
> >>>Implementing zone append emulation avoids the rejection entirely
> >>>while still allowing the FS to have a single write IO path, thus
> >>>simplifying the code.
> >>
> >>The difference is that users that use a raw ZNS device submitting I/O
> >>through the kernel would still be able to use these devices. The
> >>result would be that the ZNS SSD is recognized and initialized, but
> >>the FS format fails.
> >>
> >>>
> >>>>We can agree that a number of initial customers will use these
> >>>>devices raw, using the in-kernel I/O path, but without a FS on top.
> >>>>
> >>>>Thoughts?
> >>>>
> >>>>>and note that
> >>>>>this emulation would require the drive to be operated with
> >>>>>mq-deadline to enable zone write locking for preserving write
> >>>>>command order. While on a HDD the performance penalty is minimal,
> >>>>>it will likely be significant on a SSD.
> >>>>
> >>>>Exactly my concern. I do not want ZNS SSDs to be impacted by this
> >>>>type of design decision at the driver level.
> >>>
> >>>But your proposed FS level approach would end up doing the exact same
> >>>thing with the same limitation and so the same potential performance
> >>>impact.
> >>>The block
> >>>layer generic approach has the advantage that we do not bother the
> >>>higher levels with the implementation of in-order request dispatch
> >>>guarantees.
> >>>File systems
> >>>are complex enough. The less complexity is required for zone support,
> >>>the better.
> >>
> >>This depends very much on how the FS / application is managing
> >>stripping. At the moment our main use case is enabling user-space
> >>applications submitting I/Os to raw ZNS devices through the kernel.
> >>
> >>Can we enable this use case to start with?
> >
> >It is free for everyone to load kernel modules into the kernel. Those
> >modules may not have the appropriate checks or may rely on the zone
> >append functionality. Having per use-case limit is a no-go and at best
> >a game of whack-a-mole.
> 
> Let's focus on mainline support. We are leaving append as not enabled based
> on customer requests for some ZNS products and would like these devices to be
> supported. This is not at all a corner use-case but a very general one.
> 
> >
> >You already agreed to create a set of patches to add the appropriate
> >support for emulating zone append. As these would fix your specific
> >issue, please go ahead and submit those.
> 
> I agreed to solve the use case that some of our customers are enabling and this
> is what I am doing.
> 
> Again, to start with I would like to have a path where ZNS namespaces are
> identified independently of append support. Then specific users can require
> append if they choose to do so. We will of course take care of sending patches
> for this.

As was previously said, there are users in the kernel that depend on zone
append. As a result, it is not an option not to have this. Please go ahead
and send the patches and you'll have the behavior you are seeking.

Best, Matias
Keith Busch June 16, 2020, 4:07 p.m. UTC | #27
On Tue, Jun 16, 2020 at 05:55:26PM +0200, Javier González wrote:
> On 16.06.2020 08:48, Keith Busch wrote:
> > On Tue, Jun 16, 2020 at 05:02:17PM +0200, Javier González wrote:
> > > This depends very much on how the FS / application is managing
> > striping. At the moment our main use case is enabling user-space
> > > applications submitting I/Os to raw ZNS devices through the kernel.
> > > 
> > > Can we enable this use case to start with?
> > 
> > I think this already provides that. You can set the nsid value to
> > whatever you want in the passthrough interface, so a namespace block
> > device is not required to issue I/O to a ZNS namespace from user space.
> 
> Mmmmm. Problem now is that the check on the nvme driver prevents the ZNS
> namespace from being initialized. Am I missing something?

Hm, okay, it may not work for you. We need the driver to create at least
one namespace so that we have tags and request_queue. If you have that,
you can issue IO to any other attached namespace through the passthrough
interface, but we can't assume there is an available namespace.
Javier González June 16, 2020, 4:13 p.m. UTC | #28
On 16.06.2020 09:07, Keith Busch wrote:
>On Tue, Jun 16, 2020 at 05:55:26PM +0200, Javier González wrote:
>> On 16.06.2020 08:48, Keith Busch wrote:
>> > On Tue, Jun 16, 2020 at 05:02:17PM +0200, Javier González wrote:
>> > > This depends very much on how the FS / application is managing
>> > > striping. At the moment our main use case is enabling user-space
>> > > applications submitting I/Os to raw ZNS devices through the kernel.
>> > >
>> > > Can we enable this use case to start with?
>> >
>> > I think this already provides that. You can set the nsid value to
>> > whatever you want in the passthrough interface, so a namespace block
>> > device is not required to issue I/O to a ZNS namespace from user space.
>>
>> Mmmmm. Problem now is that the check on the nvme driver prevents the ZNS
>> namespace from being initialized. Am I missing something?
>
>Hm, okay, it may not work for you. We need the driver to create at least
>one namespace so that we have tags and request_queue. If you have that,
>you can issue IO to any other attached namespace through the passthrough
>interface, but we can't assume there is an available namespace.

That makes sense for now.

The next step for us is to enable a passthrough path on io_uring, making sure
that I/Os do not get split.

Does this make sense to you?

Thanks,
Javier
Javier González June 16, 2020, 4:21 p.m. UTC | #29
On 16.06.2020 16:07, Matias Bjorling wrote:
>
>> [...]
>>
>> >>This depends very much on how the FS / application is managing
>> >>striping. At the moment our main use case is enabling user-space
>> >>applications submitting I/Os to raw ZNS devices through the kernel.
>> >>
>> >>Can we enable this use case to start with?
>> >
>> >It is free for everyone to load kernel modules into the kernel. Those
>> >modules may not have the appropriate checks or may rely on the zone
>> >append functionality. Having per use-case limit is a no-go and at best
>> >a game of whack-a-mole.
>>
>> Let's focus on mainline support. We are leaving append as not enabled based
>> on customer requests for some ZNS products and would like these devices to be
>> supported. This is not at all a corner use-case but a very general one.
>>
>> >
>> >You already agreed to create a set of patches to add the appropriate
>> >support for emulating zone append. As these would fix your specific
>> >issue, please go ahead and submit those.
>>
>> I agreed to solve the use case that some of our customers are enabling and this
>> is what I am doing.
>>
>> Again, to start with I would like to have a path where ZNS namespaces are
>> identified independently of append support. Then specific users can require
>> append if they wish to do so. We will of course take care of sending patches
>> for this.
>
>As was previously said, there are users in the kernel that depend on
>zone append. As a result, not supporting it is not an option. Please
>go ahead and send the patches and you'll have the behavior you are
>seeking.
>

I never put in doubt that we are the ones implementing support for this,
but since you keep asking, I want to make it clear that not using the
append command is a very general use case for ZNS adopters.

Thanks,
Javier
Matias Bjorling June 16, 2020, 4:25 p.m. UTC | #30
On 16/06/2020 18.21, Javier González wrote:
> On 16.06.2020 16:07, Matias Bjorling wrote:
>>
>>> [...]
>>
>> As was previously said, there are users in the kernel that depend on
>> zone append. As a result, not supporting it is not an option. Please
>> go ahead and send the patches and you'll have the behavior you are
>> seeking.
>>
>
> I never put in doubt that we are the ones implementing support for this,
> but since you keep asking, I want to make it clear that not using the
> append command is a very general use case for ZNS adopters.
>
I am not asking. I am confirming that this is orthogonal to _this_ 
specific patchset. This discussion can continue in the patches that 
you plan to send.
Damien Le Moal June 17, 2020, 12:14 a.m. UTC | #31
On 2020/06/17 0:02, Javier González wrote:
> On 16.06.2020 14:42, Damien Le Moal wrote:
>> On 2020/06/16 23:16, Javier González wrote:
>>> On 16.06.2020 12:35, Damien Le Moal wrote:
>>>> [...]
>>
> 
> I understand. I agree that the current FSs supporting ZNS follow this
> approach and it makes sense that there is a common interface that
> simplifies the FS implementation. See the comment below on the part I
> believe we see things differently.
> 
> 
>>> I would be OK with some FS implementations to rely on append and impose
>>> the constraint that append has to be supported (and it would be our job
>>> to change that), but I would like to avoid the driver rejecting
>>> initializing the device because current FS implementations have
>>> implemented this logic.
>>
>> What is the difference between the driver rejecting drives and the FS rejecting
>> the same drives ? That has the same end result to me: an entire class of devices
>> cannot be used as desired by the user. Implementing zone append emulation avoids
>> the rejection entirely while still allowing the FS to have a single write IO
>> path, thus simplifying the code.
> 
> The difference is that users that use a raw ZNS device submitting I/O
> through the kernel would still be able to use these devices. The result
> would be that the ZNS SSD is recognized and initialized, but the FS
> format fails.

I understand your point of view. Raw ZNS block device access by an application
is of course a fine use case. SMR also has plenty of these.

My point is that enabling this regular write/raw device use case should not
prevent using btrfs or other kernel components that require zone append.
Implementing zone append emulation in the NVMe/ZNS driver for devices without
native support for the command enables *all* use cases without impacting the use
case you are interested in.

This approach is, in my opinion, far better. No one is left out and the user
gains a flexible system with different setup capabilities. The user wins here.

> 
>> [...]
> 
> This depends very much on how the FS / application is managing
> striping. At the moment our main use case is enabling user-space
> applications submitting I/Os to raw ZNS devices through the kernel.
> 
> Can we enable this use case to start with?

Yes, see above. Again, we should not have to choose one *or* the other. The user
should be able to use both raw accesses *and* file systems that require zone
append. The initial patch set enables the latter. For the former, additional
patches are needed. And the work done in SCSI already simplifies that task. The
block layer is already wired to handle zone append emulation.
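
To illustrate what "wired" means here: once a driver advertises its zone append
limit, an in-kernel user can already do something like the sketch below. This is
not code from this patchset; bdev, page and zone_start_sector are assumed to
exist in the caller, and the IO size must fit within the queue's
max_zone_append_sectors limit.

	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
	sector_t written;

	bio_set_dev(bio, bdev);
	/* Zone append targets the zone start sector, not a write pointer. */
	bio->bi_iter.bi_sector = zone_start_sector;
	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio_wait(bio);
	/* On completion, bi_sector holds the sector the data was written at. */
	written = bio->bi_iter.bi_sector;
	bio_put(bio);

The emulation only has to make this same interface work on drives lacking the
native command, the same way sd does it for SMR disks.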

> 
> Thanks,
> Javier
>
Damien Le Moal June 17, 2020, 12:38 a.m. UTC | #32
On 2020/06/17 1:13, Javier González wrote:
> On 16.06.2020 09:07, Keith Busch wrote:
>> On Tue, Jun 16, 2020 at 05:55:26PM +0200, Javier González wrote:
>>> On 16.06.2020 08:48, Keith Busch wrote:
>>>> On Tue, Jun 16, 2020 at 05:02:17PM +0200, Javier González wrote:
>>>>> This depends very much on how the FS / application is managing
>>>>> striping. At the moment our main use case is enabling user-space
>>>>> applications submitting I/Os to raw ZNS devices through the kernel.
>>>>>
>>>>> Can we enable this use case to start with?
>>>>
>>>> I think this already provides that. You can set the nsid value to
>>>> whatever you want in the passthrough interface, so a namespace block
>>>> device is not required to issue I/O to a ZNS namespace from user space.
>>>
>>> Mmmmm. Problem now is that the check on the nvme driver prevents the ZNS
>>> namespace from being initialized. Am I missing something?
>>
>> Hm, okay, it may not work for you. We need the driver to create at least
>> one namespace so that we have tags and request_queue. If you have that,
>> you can issue IO to any other attached namespace through the passthrough
>> interface, but we can't assume there is an available namespace.
> 
> That makes sense for now.
> 
> The next step for us is to enable a passthrough on uring, making sure
> that I/Os do not split.

Passthrough as in "the application issues NVMe commands directly", like SG_IO
for SCSI ? Or do you mean raw block device file accesses by the application,
meaning that the IO goes through the block IO stack as opposed to going
directly to the driver ?

For the latter case, I do not think it is possible to guarantee that an IO will
not get split unless we are talking about single page IOs (e.g. 4K on X86). See
a somewhat similar request here and comments about it.

https://www.spinics.net/lists/linux-block/msg55079.html
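
To illustrate the single page case, this is about the only pattern that is safe
on the raw block device; a minimal sketch where the device path is a placeholder
and the application tracks the zone write pointer itself:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		void *buf;
		off_t wp = 0;	/* zone write pointer, tracked by the application */
		int fd = open("/dev/nvme0n2", O_WRONLY | O_DIRECT);

		if (fd < 0)
			return 1;
		if (posix_memalign(&buf, 4096, 4096))
			return 1;
		memset(buf, 0, 4096);
		/* A single 4K direct write maps to one request: no split. */
		if (pwrite(fd, buf, 4096, wp) != 4096)
			return 1;
		close(fd);
		return 0;
	}

Anything larger than one page offers no such guarantee through the block layer.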

> 
> Does this make sense to you?
> 
> Thanks,
> Javier
>
Martin K. Petersen June 17, 2020, 2:08 a.m. UTC | #33
Keith,

> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
> in NVM Express TP4053. Zoned namespaces are discovered based on their
> Command Set Identifier reported in the namespaces Namespace
> Identification Descriptor list. A successfully discovered Zoned
> Namespace will be registered with the block layer as a host managed
> zoned block device with Zone Append command support. A namespace that
> does not support append is not supported by the driver.

Looks really nice!

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Javier González June 17, 2020, 6:09 a.m. UTC | #34
On 17.06.2020 00:14, Damien Le Moal wrote:
>On 2020/06/17 0:02, Javier González wrote:
>> [...]
>>
>> The difference is that users that use a raw ZNS device submitting I/O
>> through the kernel would still be able to use these devices. The result
>> would be that the ZNS SSD is recognized and initialized, but the FS
>> format fails.
>
>I understand your point of view. Raw ZNS block device access by an application
>is of course a fine use case. SMR also has plenty of these.
>
>My point is that enabling this regular write/raw device use case should not
>prevent using btrfs or other kernel components that require zone append.
>Implementing zone append emulation in the NVMe/ZNS driver for devices without
>native support for the command enables *all* use cases without impacting the use
>case you are interested in.
>
>This approach is, in my opinion, far better. No one is left out and the user
>gains a flexible system with different setup capabilities. The user wins here.

So, do you see a path where we enable the following:

    1. We add the emulation layer to the NVMe driver for enabling FSs
       that currently support zoned devices
    2. We add a path from user-space (e.g., uring) to enable passthru
       commands to the NVMe driver to enable a raw ZNS path from the
       application. This path does not require the device to support
       append. An initial limitation is that I/Os must be < 127 bio
       segments (same as append) to avoid bio splits.
    3. As per above, the NVMe driver allows ZNS drives without append
       support to be initialized and the check moves to the FS
       formatting.

2 and 3 are something we have on our end. We need to rebase on top of
the patches you guys submitted. 1 is something we can help with after
that.

Does the above make sense to you?

>
>> [...]
>>
>> This depends very much on how the FS / application is managing
>> striping. At the moment our main use case is enabling user-space
>> applications submitting I/Os to raw ZNS devices through the kernel.
>>
>> Can we enable this use case to start with?
>
>Yes, see above. Again, we should not have to choose one *or* the other. The user
>should be able to use both raw accesses *and* file systems that require zone
>append. The initial patch set enables the latter. For the former, additional
>patches are needed. And the work done in SCSI already simplifies that task. The
>block layer is already wired to handle zone append emulation.

Agree.

Javier
Javier González June 17, 2020, 6:18 a.m. UTC | #35
On 17.06.2020 00:38, Damien Le Moal wrote:
>On 2020/06/17 1:13, Javier González wrote:
>> [...]
>>>
>>> Hm, okay, it may not work for you. We need the driver to create at least
>>> one namespace so that we have tags and request_queue. If you have that,
>>> you can issue IO to any other attached namespace through the passthrough
>>> interface, but we can't assume there is an available namespace.
>>
>> That makes sense for now.
>>
>> The next step for us is to enable a passthrough on uring, making sure
>> that I/Os do not split.
>
>Passthrough as in "the application issues NVMe commands directly", like SG_IO
>for SCSI ? Or do you mean raw block device file accesses by the application,
>meaning that the IO goes through the block IO stack as opposed to going
>directly to the driver ?
>
>For the latter case, I do not think it is possible to guarantee that an IO will
>not get split unless we are talking about single page IOs (e.g. 4K on X86). See
>a somewhat similar request here and comments about it.
>
>https://www.spinics.net/lists/linux-block/msg55079.html

At the moment we are doing the former, but it looks like a hack to me to
go directly to the NVMe driver.

I was thinking that we could enable the second path by making use of
chunk_sectors and limiting the I/O size just as append_max_io_size
does. Is this the completely wrong way of looking at it?

Thanks,
Javier
Damien Le Moal June 17, 2020, 6:47 a.m. UTC | #36
On 2020/06/17 15:10, Javier González wrote:
> On 17.06.2020 00:14, Damien Le Moal wrote:
>> On 2020/06/17 0:02, Javier González wrote:
>>> [...]
>>>
>>> The difference is that users that use a raw ZNS device submitting I/O
>>> through the kernel would still be able to use these devices. The result
>>> would be that the ZNS SSD is recognized and initialized, but the FS
>>> format fails.
>>
>> I understand your point of view. Raw ZNS block device access by an application
>> is of course a fine use case. SMR also has plenty of these.
>>
>> My point is that enabling this regular write/raw device use case should not
>> prevent using btrfs or other kernel components that require zone append.
>> Implementing zone append emulation in the NVMe/ZNS driver for devices without
>> native support for the command enables *all* use cases without impacting the use
>> case you are interested in.
>>
>> This approach is, in my opinion, far better. No one is left out and the user
>> gains a flexible system with different setup capabilities. The user wins here.
> 
> So, do you see a path where we enable the following:
> 
>     1. We add the emulation layer to the NVMe driver for enabling FSs
>        that currently support zoned devices
>     2. We add a path from user-space (e.g., uring) to enable passthru
>        commands to the NVMe driver to enable a raw ZNS path from the
>        application. This path does not require the device to support
>        append. An initial limitation is that I/Os must be < 127 bio
>        segments (same as append) to avoid bio splits.
>     3. As per above, the NVMe driver allows ZNS drives without append
>        support to be initialized and the check moves to the FS
>        formatting.
> 
> 2 and 3 are something we have on our end. We need to rebase on top of
> the patches you guys submitted. 1 is something we can help with after
> that.
> 
> Does the above make sense to you?

Doing (1) first will give you a regular nvme namespace block device that you can
use to send passthrough commands with ioctl(). So (1) gives you (2).

However, I do not understand what io-uring has to do with passthrough. io-uring
being a block layer functionality, I do not think you can use it to send
passthrough commands to the driver. I may be wrong though, but my understanding
is that for NVMe, passthrough is either an ioctl() to the device file or the
entire driver in user space with SPDK.

As for (3), I do not understand your point. If you have (1), then an FS
requiring zone append will work.
Damien Le Moal June 17, 2020, 6:54 a.m. UTC | #37
On 2020/06/17 15:18, Javier González wrote:
> On 17.06.2020 00:38, Damien Le Moal wrote:
>> On 2020/06/17 1:13, Javier González wrote:
>>>> [...]
>>>>
>>>> Hm, okay, it may not work for you. We need the driver to create at least
>>>> one namespace so that we have tags and request_queue. If you have that,
>>>> you can issue IO to any other attached namespace through the passthrough
>>>> interface, but we can't assume there is an available namespace.
>>>
>>> That makes sense for now.
>>>
>>> The next step for us is to enable a passthrough on uring, making sure
>>> that I/Os do not split.
>>
>> Passthrough as in "the application issues NVMe commands directly", like SG_IO
>> for SCSI ? Or do you mean raw block device file accesses by the application,
>> meaning that the IO goes through the block IO stack as opposed to going
>> directly to the driver ?
>>
>> For the latter case, I do not think it is possible to guarantee that an IO will
>> not get split unless we are talking about single page IOs (e.g. 4K on X86). See
>> a somewhat similar request here and comments about it.
>>
>> https://www.spinics.net/lists/linux-block/msg55079.html
> 
> At the moment we are doing the former, but it looks like a hack to me to
> go directly to the NVMe driver.

That is what the nvme driver ioctl() is for, no ? An application can send an NVMe
command directly to the driver with it. That is not a hack, but the regular way
of doing passthrough for NVMe, isn't it ?
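
Something like this, to be concrete; a minimal sketch of a regular write sent
through the passthrough interface (the helper name is made up, opcode 0x01 is
the NVMe Write command, and the caller picks the nsid, slba and buffer):

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/nvme_ioctl.h>

	/* Write nlb logical blocks at slba, bypassing the block layer. */
	static int zns_passthru_write(int fd, uint32_t nsid, uint64_t slba,
				      void *buf, uint32_t nlb, int lba_shift)
	{
		struct nvme_passthru_cmd cmd;

		memset(&cmd, 0, sizeof(cmd));
		cmd.opcode = 0x01;		/* NVMe Write */
		cmd.nsid = nsid;		/* any attached namespace */
		cmd.addr = (uint64_t)(uintptr_t)buf;
		cmd.data_len = nlb << lba_shift;
		cmd.cdw10 = slba & 0xffffffff;	/* starting LBA, low 32 bits */
		cmd.cdw11 = slba >> 32;		/* starting LBA, high 32 bits */
		cmd.cdw12 = nlb - 1;		/* 0's based block count */

		return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
	}

Since the whole command reaches the driver as a single unit, there is nothing
the block layer could split.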

> I was thinking that we could enable the second path by making use of
> chunk_sectors and limiting the I/O size just as append_max_io_size
> does. Is this the completely wrong way of looking at it?

The block layer cannot limit the size of a passthrough command since the command
is protocol specific and the block layer is a protocol independent interface.
SCSI SG does not split passthrough requests, it cannot. For passthrough
commands, the command buffer can be dma-mapped or it cannot. If mapping
succeeds, the command is issued. If it cannot, the command is failed. At least,
that is my understanding of how the stack is working.

> 
> Thanks,
> Javier
> 
Javier González June 17, 2020, 7:02 a.m. UTC | #38
On 17.06.2020 06:47, Damien Le Moal wrote:
>On 2020/06/17 15:10, Javier González wrote:
>> On 17.06.2020 00:14, Damien Le Moal wrote:
>>> On 2020/06/17 0:02, Javier González wrote:
>>>> On 16.06.2020 14:42, Damien Le Moal wrote:
>>>>> On 2020/06/16 23:16, Javier González wrote:
>>>>>> On 16.06.2020 12:35, Damien Le Moal wrote:
>>>>>>> On 2020/06/16 21:24, Javier González wrote:
>>>>>>>> On 16.06.2020 14:06, Matias Bjørling wrote:
>>>>>>>>> On 16/06/2020 14.00, Javier González wrote:
>>>>>>>>>> On 16.06.2020 13:18, Matias Bjørling wrote:
>>>>>>>>>>> On 16/06/2020 12.41, Javier González wrote:
>>>>>>>>>>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>>>>>>>>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>>>>>>>>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>>>>>>>>>>> Command Set Identifier reported in the namespaces Namespace
>>>>>>>>>>>>> Identification Descriptor list. A successfully discovered Zoned
>>>>>>>>>>>>> Namespace will be registered with the block layer as a host managed
>>>>>>>>>>>>> zoned block device with Zone Append command support. A namespace that
>>>>>>>>>>>>> does not support append is not supported by the driver.
>>>>>>>>>>>>
>>>>>>>>>>>> Why are we enforcing the append command? Append is optional on the
>>>>>>>>>>>> current ZNS specification, so we should not make this mandatory in the
>>>>>>>>>>>> implementation. See specifics below.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> There is already general support in the kernel for the zone append
>>>>>>>>>>> command. Feel free to submit patches to emulate the support. It is
>>>>>>>>>>> outside the scope of this patchset.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> It is fine that the kernel supports append, but the ZNS specification
>>>>>>>>>> does not impose the implementation for append, so the driver should not
>>>>>>>>>> do that either.
>>>>>>>>>>
>>>>>>>>>> ZNS SSDs that choose to leave append as a non-implemented optional
>>>>>>>>>> command should not rely on emulated SW support, specially when
>>>>>>>>>> traditional writes work very fine for a large part of current ZNS use
>>>>>>>>>> cases.
>>>>>>>>>>
>>>>>>>>>> Please, remove this virtual constraint.
>>>>>>>>>
>>>>>>>>> The Zone Append command is mandatory for zoned block devices. Please
>>>>>>>>> see https://lwn.net/Articles/818709/ for the background.
>>>>>>>>
>>>>>>>> I do not see anywhere in the block layer that append is mandatory for
>>>>>>>> zoned devices. Append is emulated on ZBC, but beyond that there is no
>>>>>>>> mandatory bits. Please explain.
>>>>>>>
>>>>>>> This is to allow a single write IO path for all types of zoned block device for
>>>>>>> higher layers, e.g file systems. The on-going re-work of btrfs zone support for
>>>>>>> instance now relies 100% on zone append being supported. That significantly
>>>>>>> simplifies the file system support and more importantly remove the need for
>>>>>>> locking around block allocation and BIO issuing, allowing to preserve a fully
>>>>>>> asynchronous write path that can include workqueues for efficient CPU usage of
>>>>>>> things like encryption and compression. Without zone append, file system would
>>>>>>> either (1) have to reject these drives that do not support zone append, or (2)
>>>>>>> implement 2 different write IO path (slower regular write and zone append). None
>>>>>>> of these options are ideal, to say the least.
>>>>>>>
>>>>>>> So the approach is: mandate zone append support for ZNS devices. To allow other
>>>>>>> ZNS drives, an emulation similar to SCSI can be implemented, with that emulation
>>>>>>> ideally combined to work for both types of drives if possible.
>>>>>>
>>>>>> Enforcing QD=1 becomes a problem on devices with large zones. In
>>>>>> a ZNS device that has smaller zones this should not be a problem.
>>>>>
>>>>> Let's be precise: this is not running the drive at QD=1, it is "at most one
>>>>> write *request* per zone". If the FS is simultaneously using multiple block
>>>>> groups mapped to different zones, you will get a total write QD > 1, and as many
>>>>> reads as you want.
>>>>>
>>>>>> Would you agree that it is possible to have a write path that relies on
>>>>>> QD=1, where the FS / application has the responsibility for enforcing
>>>>>> this? Down the road this QD can be increased if the device is able to
>>>>>> buffer the writes.
>>>>>
>>>>> Doing QD=1 per zone for writes at the FS layer, that is, at the BIO layer does
>>>>> not work. This is because BIOs can be as large as the FS wants them to be. Such a
>>>>> large BIO will be split into multiple requests in the block layer, resulting in
>>>>> more than one write per zone. That is why the zone write locking is at the
>>>>> scheduler level, between BIO split and request dispatch. That prevents the
>>>>> multiple request fragments of a large BIO from being reordered and failing. That is
>>>>> mandatory as the block layer itself can occasionally reorder requests, and lower
>>>>> levels such as AHCI HW are also notoriously good at reversing sequential
>>>>> requests. For NVMe with multi-queue, the IO issuing process getting rescheduled
>>>>> on a different CPU can result in sequential IOs being in different queues, with
>>>>> the likely result of an out-of-order execution. All cases are avoided with zone
>>>>> write locking and at most one write request dispatch per zone as recommended by
>>>>> the ZNS specifications (ZBC and ZAC standards for SMR HDDs are silent on this).
>>>>>
>>>>
>>>> I understand. I agree that the current FSs supporting ZNS follow this
>>>> approach and it makes sense that there is a common interface that
>>>> simplifies the FS implementation. See the comment below on the part I
>>>> believe we see things differently.
>>>>
>>>>
>>>>>> I would be OK with some FS implementations to rely on append and impose
>>>>>> the constraint that append has to be supported (and it would be our job
>>>>>> to change that), but I would like to avoid the driver rejecting
>>>>>> initializing the device because current FS implementations have
>>>>>> implemented this logic.
>>>>>
>>>>> What is the difference between the driver rejecting drives and the FS rejecting
>>>>> the same drives ? That has the same end result to me: an entire class of devices
>>>>> cannot be used as desired by the user. Implementing zone append emulation avoids
>>>>> the rejection entirely while still allowing the FS to have a single write IO
>>>>> path, thus simplifying the code.
>>>>
>>>> The difference is that users that use a raw ZNS device submitting I/O
>>>> through the kernel would still be able to use these devices. The result
>>>> would be that the ZNS SSD is recognized and initialized, but the FS
>>>> format fails.
>>>
>>> I understand your point of view. Raw ZNS block device access by an application
>>> is of course a fine use case. SMR also has plenty of these.
>>>
>>> My point is that enabling this regular write/raw device use case should not
>>> prevent using btrfs or other kernel components that require zone append.
>>> Implementing zone append emulation in the NVMe/ZNS driver for devices without
>>> native support for the command enables *all* use cases without impacting the use
>>> case you are interested in.
>>>
>>> This approach is, in my opinion, far better. No one is left out and the user
>>> gains a flexible system with different setup capabilities. The user wins here.
>>
>> So, do you see a path where we enable the following:
>>
>>     1. We add the emulation layer to the NVMe driver for enabling FSs
>>        that currently support zoned devices
>>     2. We add a path from user-space (e.g., uring) to enable passthru
>>        commands to the NVMe driver to enable a raw ZNS path from the
>>        application. This path does not require the device to support
>>        append. An initial limitation is that I/Os must be of < 127 bio
>>        segments (same as append) to avoid bio splits
>>     3. As per above, the NVMe driver allows ZNS drives without append
>>        support to be initialized and the check moves to the FS
>>        formatting.
>>
>> 2 and 3 are something we have on our end. We need to rebase on top of
>> the patches you guys submitted. 1. is something we can help with after
>> that.
>>
>> Does the above make sense to you?
>
>Doing (1) first will give you a regular nvme namespace block device that you can
>use to send passthrough commands with ioctl(). So (1) gives you (2).
>
>However, I do not understand what io-uring has to do with passthrough. io-uring
>being a block layer functionality, I do not think you can use it to send
>passthrough commands to the driver. I may be wrong though, but my understanding
>is that for NVMe, passthrough is either ioctl() to the device file or the entire
>driver in user space with SPDK.

We would like to have an async() passthru I/O path, and it seems possible
to do through uring. As mentioned in the other email, the goal is to
have the I/O go through the block layer for better integration, but this
work is still ongoing. See the other thread.

>
>As for (3), I do not understand your point. If you have (1), then an FS
>requiring zone append will work.

In order to enable (2), we need the device to come up first. At the
moment the NVMe driver rejects ZNS devices without append support, so
neither ioctl() nor the uring path will work.

Javier
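
For reference, the synchronous passthrough path being discussed looks roughly as follows from user space. This is a minimal sketch, not anything from the patch series: fd is assumed to be an open namespace block device (e.g. /dev/nvme0n1) used purely as the submission vehicle, cmd.nsid selects the target namespace as Keith describes, a 4K LBA format is assumed, and error handling is omitted.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

/* Issue a regular NVMe Write to an arbitrary attached namespace
 * through the passthrough interface. Blocks until the command
 * completes; the block layer zone write locking is not involved. */
static int zns_write_sync(int fd, uint32_t nsid, void *buf,
			  uint32_t nlb, uint64_t slba)
{
	struct nvme_passthru_cmd cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x01;			/* NVMe Write */
	cmd.nsid     = nsid;			/* any attached namespace */
	cmd.addr     = (uint64_t)(uintptr_t)buf;
	cmd.data_len = nlb * 4096;		/* assumes a 4K LBA format */
	cmd.cdw10    = slba & 0xffffffff;	/* SLBA, low 32 bits */
	cmd.cdw11    = slba >> 32;		/* SLBA, high 32 bits */
	cmd.cdw12    = nlb - 1;			/* 0's based block count */

	return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
}

Since this bypasses the block layer entirely, the "at most one write per zone" ordering discussed earlier in the thread becomes the application's responsibility.
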
Javier González June 17, 2020, 7:11 a.m. UTC | #39
On 17.06.2020 06:54, Damien Le Moal wrote:
>On 2020/06/17 15:18, Javier González wrote:
>> On 17.06.2020 00:38, Damien Le Moal wrote:
>>> On 2020/06/17 1:13, Javier González wrote:
>>>> On 16.06.2020 09:07, Keith Busch wrote:
>>>>> On Tue, Jun 16, 2020 at 05:55:26PM +0200, Javier González wrote:
>>>>>> On 16.06.2020 08:48, Keith Busch wrote:
>>>>>>> On Tue, Jun 16, 2020 at 05:02:17PM +0200, Javier González wrote:
>>>>>>>> This depends very much on how the FS / application is managing
>>>>>>>> striping. At the moment our main use case is enabling user-space
>>>>>>>> applications submitting I/Os to raw ZNS devices through the kernel.
>>>>>>>>
>>>>>>>> Can we enable this use case to start with?
>>>>>>>
>>>>>>> I think this already provides that. You can set the nsid value to
>>>>>>> whatever you want in the passthrough interface, so a namespace block
>>>>>>> device is not required to issue I/O to a ZNS namespace from user space.
>>>>>>
>>>>>> Mmmmm. Problem now is that the check on the nvme driver prevents the ZNS
>>>>>> namespace from being initialized. Am I missing something?
>>>>>
>>>>> Hm, okay, it may not work for you. We need the driver to create at least
>>>>> one namespace so that we have tags and request_queue. If you have that,
>>>>> you can issue IO to any other attached namespace through the passthrough
>>>>> interface, but we can't assume there is an available namespace.
>>>>
>>>> That makes sense for now.
>>>>
>>>> The next step for us is to enable a passthrough on uring, making sure
>>>> that I/Os do not split.
>>>
>>> Passthrough as in "application issues directly NVMe commands" like for SG_IO
>>> with SCSI ? Or do you mean raw block device file accesses by the application,
>>> meaning that the IO goes through the block IO stack as opposed to directly going
>>> to the driver ?
>>>
>>> For the latter case, I do not think it is possible to guarantee that an IO will
>>> not get split unless we are talking about single page IOs (e.g. 4K on X86). See
>>> a somewhat similar request here and comments about it.
>>>
>>> https://www.spinics.net/lists/linux-block/msg55079.html
>>
>> At the moment we are doing the former, but it looks like a hack to me to
>> go directly to the NVMe driver.
>
>That is what the nvme driver ioctl() is for, no? An application can send an NVMe
>command directly to the driver with it. That is not a hack, but the regular way
>of doing passthrough for NVMe, isn't it?

We have enabled it through uring to get async() passthru submission.
Looks like a hack at the moment, but we might just send an RFC to have
something concrete to base the discussion on.

>
>> I was thinking that we could enable the second path by making use of
>> chunk_sectors and limiting the I/O size just as the append_max_io_size
>> does. Is this the completely wrong way of looking at it?
>
>The block layer cannot limit the size of a passthrough command since the command
>is protocol specific and the block layer is a protocol independent interface.

Agree. This work depends on the application being aware of a max I/O size
at the moment. Down the road, we will remove (or at least limit a lot)
this constraint for ZNS devices that can eventually cache out-of-order
I/Os.

>SCSI SG does not split passthrough requests, it cannot. For passthrough
>commands, the command buffer can be dma-mapped or it cannot. If mapping
>succeeds, the command is issued. If it cannot, the command is failed. At least,
>that is my understanding of how the stack is working.

I am not familiar with SCSI SG. This looks like how the ioctl() passthru
works in NVMe, but as mentioned above, we would like to enable an
async() passthru path.

Thanks,
Javier
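
The async() passthru path mentioned here did not exist at the time of this exchange. For context, the io_uring command passthrough that eventually landed upstream (IORING_OP_URING_CMD with NVME_URING_CMD_IO, Linux 5.19) gives an idea of the shape being discussed; the sketch below is based on that later interface, not on the RFC Javier refers to, and omits all error handling.

#include <stdint.h>
#include <string.h>
#include <liburing.h>
#include <linux/nvme_ioctl.h>

/* The ring must be created with big SQEs/CQEs:
 *   io_uring_queue_init(depth, ring,
 *                       IORING_SETUP_SQE128 | IORING_SETUP_CQE32);
 * ng_fd is an NVMe generic char device, e.g. /dev/ng0n1. */
static int zns_write_async(struct io_uring *ring, int ng_fd,
			   uint32_t nsid, void *buf, uint32_t nlb,
			   uint64_t slba)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct nvme_uring_cmd *cmd = (struct nvme_uring_cmd *)sqe->cmd;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd     = ng_fd;
	sqe->cmd_op = NVME_URING_CMD_IO;

	memset(cmd, 0, sizeof(*cmd));	/* payload extends past 64 bytes */
	cmd->opcode   = 0x01;		/* NVMe Write */
	cmd->nsid     = nsid;
	cmd->addr     = (uint64_t)(uintptr_t)buf;
	cmd->data_len = nlb * 4096;	/* assumes a 4K LBA format */
	cmd->cdw10    = slba & 0xffffffff;
	cmd->cdw11    = slba >> 32;
	cmd->cdw12    = nlb - 1;

	return io_uring_submit(ring);	/* completion arrives as a CQE */
}
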
Damien Le Moal June 17, 2020, 7:24 a.m. UTC | #40
On 2020/06/17 16:02, Javier González wrote:
> On 17.06.2020 06:47, Damien Le Moal wrote:
>> On 2020/06/17 15:10, Javier González wrote:
>>> On 17.06.2020 00:14, Damien Le Moal wrote:
>>>> On 2020/06/17 0:02, Javier González wrote:
>>>>> On 16.06.2020 14:42, Damien Le Moal wrote:
>>>>>> On 2020/06/16 23:16, Javier González wrote:
>>>>>>> On 16.06.2020 12:35, Damien Le Moal wrote:
>>>>>>>> On 2020/06/16 21:24, Javier González wrote:
>>>>>>>>> On 16.06.2020 14:06, Matias Bjørling wrote:
>>>>>>>>>> On 16/06/2020 14.00, Javier González wrote:
>>>>>>>>>>> On 16.06.2020 13:18, Matias Bjørling wrote:
>>>>>>>>>>>> On 16/06/2020 12.41, Javier González wrote:
>>>>>>>>>>>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>>>>>>>>>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>>>>>>>>>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>>>>>>>>>>>> Command Set Identifier reported in the namespaces Namespace
>>>>>>>>>>>>>> Identification Descriptor list. A successfully discovered Zoned
>>>>>>>>>>>>>> Namespace will be registered with the block layer as a host managed
>>>>>>>>>>>>>> zoned block device with Zone Append command support. A namespace that
>>>>>>>>>>>>>> does not support append is not supported by the driver.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Why are we enforcing the append command? Append is optional on the
>>>>>>>>>>>>> current ZNS specification, so we should not make this mandatory in the
>>>>>>>>>>>>> implementation. See specifics below.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> There is already general support in the kernel for the zone append
>>>>>>>>>>>> command. Feel free to submit patches to emulate the support. It is
>>>>>>>>>>>> outside the scope of this patchset.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> It is fine that the kernel supports append, but the ZNS specification
>>>>>>>>>>> does not impose the implementation for append, so the driver should not
>>>>>>>>>>> do that either.
>>>>>>>>>>>
>>>>>>>>>>> ZNS SSDs that choose to leave append as a non-implemented optional
>>>>>>>>>>> command should not rely on emulated SW support, especially when
>>>>>>>>>>> traditional writes work just fine for a large part of current ZNS use
>>>>>>>>>>> cases.
>>>>>>>>>>>
>>>>>>>>>>> Please, remove this virtual constraint.
>>>>>>>>>>
>>>>>>>>>> The Zone Append command is mandatory for zoned block devices. Please
>>>>>>>>>> see https://lwn.net/Articles/818709/ for the background.
>>>>>>>>>
>>>>>>>>> I do not see anywhere in the block layer that append is mandatory for
>>>>>>>>> zoned devices. Append is emulated on ZBC, but beyond that there are no
>>>>>>>>> mandatory bits. Please explain.
>>>>>>>>
>>>>>>>> This is to allow a single write IO path for all types of zoned block devices for
>>>>>>>> higher layers, e.g. file systems. The on-going re-work of btrfs zone support for
>>>>>>>> instance now relies 100% on zone append being supported. That significantly
>>>>>>>> simplifies the file system support and, more importantly, removes the need for
>>>>>>>> locking around block allocation and BIO issuing, allowing us to preserve a fully
>>>>>>>> asynchronous write path that can include workqueues for efficient CPU usage of
>>>>>>>> things like encryption and compression. Without zone append, file systems would
>>>>>>>> either (1) have to reject these drives that do not support zone append, or (2)
>>>>>>>> implement 2 different write IO paths (slower regular write and zone append).
>>>>>>>> Neither of these options is ideal, to say the least.
>>>>>>>>
>>>>>>>> So the approach is: mandate zone append support for ZNS devices. To allow other
>>>>>>>> ZNS drives, an emulation similar to SCSI can be implemented, with that emulation
>>>>>>>> ideally combined to work for both types of drives if possible.
>>>>>>>
>>>>>>> Enforcing QD=1 becomes a problem on devices with large zones. In
>>>>>>> a ZNS device that has smaller zones this should not be a problem.
>>>>>>
>>>>>> Let's be precise: this is not running the drive at QD=1, it is "at most one
>>>>>> write *request* per zone". If the FS is simultaneously using multiple block
>>>>>> groups mapped to different zones, you will get a total write QD > 1, and as many
>>>>>> reads as you want.
>>>>>>
>>>>>>> Would you agree that it is possible to have a write path that relies on
>>>>>>> QD=1, where the FS / application has the responsibility for enforcing
>>>>>>> this? Down the road this QD can be increased if the device is able to
>>>>>>> buffer the writes.
>>>>>>
>>>>>> Doing QD=1 per zone for writes at the FS layer, that is, at the BIO layer does
>>>>>> not work. This is because BIOs can be as large as the FS wants them to be. Such a
>>>>>> large BIO will be split into multiple requests in the block layer, resulting in
>>>>>> more than one write per zone. That is why the zone write locking is at the
>>>>>> scheduler level, between BIO split and request dispatch. That prevents the
>>>>>> multiple request fragments of a large BIO from being reordered and failing. That is
>>>>>> mandatory as the block layer itself can occasionally reorder requests, and lower
>>>>>> levels such as AHCI HW are also notoriously good at reversing sequential
>>>>>> requests. For NVMe with multi-queue, the IO issuing process getting rescheduled
>>>>>> on a different CPU can result in sequential IOs being in different queues, with
>>>>>> the likely result of an out-of-order execution. All cases are avoided with zone
>>>>>> write locking and at most one write request dispatch per zone as recommended by
>>>>>> the ZNS specifications (ZBC and ZAC standards for SMR HDDs are silent on this).
>>>>>>
>>>>>
>>>>> I understand. I agree that the current FSs supporting ZNS follow this
>>>>> approach and it makes sense that there is a common interface that
>>>>> simplifies the FS implementation. See the comment below on the part I
>>>>> believe we see things differently.
>>>>>
>>>>>
>>>>>>> I would be OK with some FS implementations to rely on append and impose
>>>>>>> the constraint that append has to be supported (and it would be our job
>>>>>>> to change that), but I would like to avoid the driver rejecting
>>>>>>> initializing the device because current FS implementations have
>>>>>>> implemented this logic.
>>>>>>
>>>>>> What is the difference between the driver rejecting drives and the FS rejecting
>>>>>> the same drives ? That has the same end result to me: an entire class of devices
>>>>>> cannot be used as desired by the user. Implementing zone append emulation avoids
>>>>>> the rejection entirely while still allowing the FS to have a single write IO
>>>>>> path, thus simplifying the code.
>>>>>
>>>>> The difference is that users that use a raw ZNS device submitting I/O
>>>>> through the kernel would still be able to use these devices. The result
>>>>> would be that the ZNS SSD is recognized and initialized, but the FS
>>>>> format fails.
>>>>
>>>> I understand your point of view. Raw ZNS block device access by an application
>>>> is of course a fine use case. SMR also has plenty of these.
>>>>
>>>> My point is that enabling this regular write/raw device use case should not
>>>> prevent using btrfs or other kernel components that require zone append.
>>>> Implementing zone append emulation in the NVMe/ZNS driver for devices without
>>>> native support for the command enables *all* use cases without impacting the use
>>>> case you are interested in.
>>>>
>>>> This approach is, in my opinion, far better. No one is left out and the user
>>>> gains a flexible system with different setup capabilities. The user wins here.
>>>
>>> So, do you see a path where we enable the following:
>>>
>>>     1. We add the emulation layer to the NVMe driver for enabling FSs
>>>        that currently support zoned devices
>>>     2. We add a path from user-space (e.g., uring) to enable passthru
>>>        commands to the NVMe driver to enable a raw ZNS path from the
>>>        application. This path does not require the device to support
>>>        append. An initial limitation is that I/Os must be of < 127 bio
>>>        segments (same as append) to avoid bio splits
>>>     3. As per above, the NVMe driver allows ZNS drives without append
>>>        support to be initialized and the check moves to the FS
>>>        formatting.
>>>
>>> 2 and 3 are something we have on our end. We need to rebase on top of
>>> the patches you guys submitted. 1. is something we can help with after
>>> that.
>>>
>>> Does the above make sense to you?
>>
>> Doing (1) first will give you a regular nvme namespace block device that you can
>> use to send passthrough commands with ioctl(). So (1) gives you (2).
>>
>> However, I do not understand what io-uring has to do with passthrough. io-uring
>> being a block layer functionality, I do not think you can use it to send
>> passthrough commands to the driver. I may be wrong though, but my understanding
>> is that for NVMe, passthrough is either ioctl() to the device file or the entire
>> driver in user space with SPDK.
> 
> We would like to have an async() passthru I/O path, and it seems possible
> to do through uring. As mentioned in the other email, the goal is to
> have the I/O go through the block layer for better integration, but this
> work is still ongoing. See the other thread.

Indeed. I do not think that is special to ZNS at all.

>> As for (3), I do not understand your point. If you have (1), then an FS
>> requiring zone append will work.
> 
> In order to enable (2), we need the device to come up first. At the
> moment the NVMe driver rejects ZNS devices without append support, so
> neither ioctl() nor the uring path will work.

I repeat again here: if you implement zone append emulation, there is no reason
to reject devices that do not have native zone append support. Zone append
emulation gives you the block device: you can do ioctl(), implement the new
async passthrough, and file systems requiring zone append work too. All problems
solved.
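
For readers unfamiliar with what such an emulation entails, it boils down to the driver caching a write pointer per zone and translating each append into a regular write at that pointer. A conceptual kernel-style sketch, with a hypothetical issue_regular_write() helper and none of the error recovery or write pointer revalidation the real SCSI emulation has to do:

#include <linux/types.h>
#include <linux/mutex.h>

struct zone_state {
	struct mutex lock;	/* at most one append in flight per zone */
	u64 wp;			/* cached write pointer (LBA) */
};

static int emulate_zone_append(struct zone_state *z, void *buf,
			       u32 nr_blocks, u64 *assigned_lba)
{
	int ret;

	mutex_lock(&z->lock);
	*assigned_lba = z->wp;	/* the "device" picks the LBA... */
	ret = issue_regular_write(z->wp, buf, nr_blocks); /* hypothetical */
	if (!ret)
		z->wp += nr_blocks;	/* ...and advances the pointer */
	mutex_unlock(&z->lock);
	return ret;	/* the completion reports *assigned_lba */
}

The essential contract is that the caller gets back the LBA where the data landed, which is exactly what a zone-append-based FS write path consumes.
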
Damien Le Moal June 17, 2020, 7:29 a.m. UTC | #41
On 2020/06/17 16:11, Javier González wrote:
> On 17.06.2020 06:54, Damien Le Moal wrote:
>> On 2020/06/17 15:18, Javier González wrote:
>>> On 17.06.2020 00:38, Damien Le Moal wrote:
>>>> On 2020/06/17 1:13, Javier González wrote:
>>>>> On 16.06.2020 09:07, Keith Busch wrote:
>>>>>> On Tue, Jun 16, 2020 at 05:55:26PM +0200, Javier González wrote:
>>>>>>> On 16.06.2020 08:48, Keith Busch wrote:
>>>>>>>> On Tue, Jun 16, 2020 at 05:02:17PM +0200, Javier González wrote:
>>>>>>>>> This depends very much on how the FS / application is managing
>>>>>>>>> striping. At the moment our main use case is enabling user-space
>>>>>>>>> applications submitting I/Os to raw ZNS devices through the kernel.
>>>>>>>>>
>>>>>>>>> Can we enable this use case to start with?
>>>>>>>>
>>>>>>>> I think this already provides that. You can set the nsid value to
>>>>>>>> whatever you want in the passthrough interface, so a namespace block
>>>>>>>> device is not required to issue I/O to a ZNS namespace from user space.
>>>>>>>
>>>>>>> Mmmmm. Problem now is that the check on the nvme driver prevents the ZNS
>>>>>>> namespace from being initialized. Am I missing something?
>>>>>>
>>>>>> Hm, okay, it may not work for you. We need the driver to create at least
>>>>>> one namespace so that we have tags and request_queue. If you have that,
>>>>>> you can issue IO to any other attached namespace through the passthrough
>>>>>> interface, but we can't assume there is an available namespace.
>>>>>
>>>>> That makes sense for now.
>>>>>
>>>>> The next step for us is to enable a passthrough on uring, making sure
>>>>> that I/Os do not split.
>>>>
>>>> Passthrough as in "application issues directly NVMe commands" like for SG_IO
>>>> with SCSI ? Or do you mean raw block device file accesses by the application,
>>>> meaning that the IO goes through the block IO stack as opposed to directly going
>>>> to the driver ?
>>>>
>>>> For the latter case, I do not think it is possible to guarantee that an IO will
>>>> not get split unless we are talking about single page IOs (e.g. 4K on X86). See
>>>> a somewhat similar request here and comments about it.
>>>>
>>>> https://www.spinics.net/lists/linux-block/msg55079.html
>>>
>>> At the moment we are doing the former, but it looks like a hack to me to
>>> go directly to the NVMe driver.
>>
>> That is what the nvme driver ioctl() is for, no? An application can send an NVMe
>> command directly to the driver with it. That is not a hack, but the regular way
>> of doing passthrough for NVMe, isn't it?
> 
> We have enabled it through uring to get async() passthru submission.
> Looks like a hack at the moment, but we might just send an RFC to have
> something concrete to base the discussion on.

Yes, that would clarify things.

>>> I was thinking that we could enable the second path by making use of
>>> chunk_sectors and limiting the I/O size just as the append_max_io_size
>>> does. Is this the completely wrong way of looking at it?
>>
>> The block layer cannot limit the size of a passthrough command since the command
>> is protocol specific and the block layer is a protocol independent interface.
> 
> Agree. This work depends on the application being aware of a max I/O size
> at the moment. Down the road, we will remove (or at least limit a lot)
> this constraint for ZNS devices that can eventually cache out-of-order
> I/Os.

I/Os with a data buffer all need mapping for DMA, no matter the device
functionalities or the command being executed. With passthrough, I do not think
it is possible to have the block layer limit anything. It will likely always be
pass-or-fail. With passthrough, the application needs to understand what it is
doing.

> 
>> SCSI SG does not split passthrough requests, it cannot. For passthrough
>> commands, the command buffer can be dma-mapped or it cannot. If mapping
>> succeeds, the command is issued. If it cannot, the command is failed. At least,
>> that is my understanding of how the stack is working.
> 
> I am not familiar with SCSI SG. This looks like how the ioctl() passthru
> works in NVMe, but as mentioned above, we would like to enable an
> async() passthru path.

That is done with bsg for SCSI I believe. You may want to have a look around
there. The SG driver used to have the write() system call mapped to "issuing a
command" and read() for "getting a command result". That was removed however.
But I think bsg has a replacement for that defunct async passthrough interface.
Not sure. I have not looked at that for a while.



> 
> Thanks,
> Javier
>
Javier González June 17, 2020, 7:29 a.m. UTC | #42
On 17.06.2020 07:24, Damien Le Moal wrote:
>On 2020/06/17 16:02, Javier González wrote:
>> On 17.06.2020 06:47, Damien Le Moal wrote:
>>> On 2020/06/17 15:10, Javier González wrote:
>>>> On 17.06.2020 00:14, Damien Le Moal wrote:
>>>>> On 2020/06/17 0:02, Javier González wrote:
>>>>>> On 16.06.2020 14:42, Damien Le Moal wrote:
>>>>>>> On 2020/06/16 23:16, Javier González wrote:
>>>>>>>> On 16.06.2020 12:35, Damien Le Moal wrote:
>>>>>>>>> On 2020/06/16 21:24, Javier González wrote:
>>>>>>>>>> On 16.06.2020 14:06, Matias Bjørling wrote:
>>>>>>>>>>> On 16/06/2020 14.00, Javier González wrote:
>>>>>>>>>>>> On 16.06.2020 13:18, Matias Bjørling wrote:
>>>>>>>>>>>>> On 16/06/2020 12.41, Javier González wrote:
>>>>>>>>>>>>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>>>>>>>>>>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>>>>>>>>>>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>>>>>>>>>>>>> Command Set Identifier reported in the namespaces Namespace
>>>>>>>>>>>>>>> Identification Descriptor list. A successfully discovered Zoned
>>>>>>>>>>>>>>> Namespace will be registered with the block layer as a host managed
>>>>>>>>>>>>>>> zoned block device with Zone Append command support. A namespace that
>>>>>>>>>>>>>>> does not support append is not supported by the driver.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Why are we enforcing the append command? Append is optional on the
>>>>>>>>>>>>>> current ZNS specification, so we should not make this mandatory in the
>>>>>>>>>>>>>> implementation. See specifics below.
>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> There is already general support in the kernel for the zone append
>>>>>>>>>>>>> command. Feel free to submit patches to emulate the support. It is
>>>>>>>>>>>>> outside the scope of this patchset.
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> It is fine that the kernel supports append, but the ZNS specification
>>>>>>>>>>>> does not impose the implementation for append, so the driver should not
>>>>>>>>>>>> do that either.
>>>>>>>>>>>>
>>>>>>>>>>>> ZNS SSDs that choose to leave append as a non-implemented optional
>>>>>>>>>>>> command should not rely on emulated SW support, especially when
>>>>>>>>>>>> traditional writes work just fine for a large part of current ZNS use
>>>>>>>>>>>> cases.
>>>>>>>>>>>>
>>>>>>>>>>>> Please, remove this virtual constraint.
>>>>>>>>>>>
>>>>>>>>>>> The Zone Append command is mandatory for zoned block devices. Please
>>>>>>>>>>> see https://lwn.net/Articles/818709/ for the background.
>>>>>>>>>>
>>>>>>>>>> I do not see anywhere in the block layer that append is mandatory for
>>>>>>>>>> zoned devices. Append is emulated on ZBC, but beyond that there are no
>>>>>>>>>> mandatory bits. Please explain.
>>>>>>>>>
>>>>>>>>> This is to allow a single write IO path for all types of zoned block devices for
>>>>>>>>> higher layers, e.g. file systems. The on-going re-work of btrfs zone support for
>>>>>>>>> instance now relies 100% on zone append being supported. That significantly
>>>>>>>>> simplifies the file system support and, more importantly, removes the need for
>>>>>>>>> locking around block allocation and BIO issuing, allowing us to preserve a fully
>>>>>>>>> asynchronous write path that can include workqueues for efficient CPU usage of
>>>>>>>>> things like encryption and compression. Without zone append, file systems would
>>>>>>>>> either (1) have to reject these drives that do not support zone append, or (2)
>>>>>>>>> implement 2 different write IO paths (slower regular write and zone append).
>>>>>>>>> Neither of these options is ideal, to say the least.
>>>>>>>>>
>>>>>>>>> So the approach is: mandate zone append support for ZNS devices. To allow other
>>>>>>>>> ZNS drives, an emulation similar to SCSI can be implemented, with that emulation
>>>>>>>>> ideally combined to work for both types of drives if possible.
>>>>>>>>
>>>>>>>> Enforcing QD=1 becomes a problem on devices with large zones. In
>>>>>>>> a ZNS device that has smaller zones this should not be a problem.
>>>>>>>
>>>>>>> Let's be precise: this is not running the drive at QD=1, it is "at most one
>>>>>>> write *request* per zone". If the FS is simultaneously using multiple block
>>>>>>> groups mapped to different zones, you will get a total write QD > 1, and as many
>>>>>>> reads as you want.
>>>>>>>
>>>>>>>> Would you agree that it is possible to have a write path that relies on
>>>>>>>> QD=1, where the FS / application has the responsibility for enforcing
>>>>>>>> this? Down the road this QD can be increased if the device is able to
>>>>>>>> buffer the writes.
>>>>>>>
>>>>>>> Doing QD=1 per zone for writes at the FS layer, that is, at the BIO layer does
>>>>>>> not work. This is because BIOs can be as large as the FS wants them to be. Such a
>>>>>>> large BIO will be split into multiple requests in the block layer, resulting in
>>>>>>> more than one write per zone. That is why the zone write locking is at the
>>>>>>> scheduler level, between BIO split and request dispatch. That prevents the
>>>>>>> multiple request fragments of a large BIO from being reordered and failing. That is
>>>>>>> mandatory as the block layer itself can occasionally reorder requests, and lower
>>>>>>> levels such as AHCI HW are also notoriously good at reversing sequential
>>>>>>> requests. For NVMe with multi-queue, the IO issuing process getting rescheduled
>>>>>>> on a different CPU can result in sequential IOs being in different queues, with
>>>>>>> the likely result of an out-of-order execution. All cases are avoided with zone
>>>>>>> write locking and at most one write request dispatch per zone as recommended by
>>>>>>> the ZNS specifications (ZBC and ZAC standards for SMR HDDs are silent on this).
>>>>>>>
>>>>>>
>>>>>> I understand. I agree that the current FSs supporting ZNS follow this
>>>>>> approach and it makes sense that there is a common interface that
>>>>>> simplifies the FS implementation. See the comment below on the part I
>>>>>> believe we see things differently.
>>>>>>
>>>>>>
>>>>>>>> I would be OK with some FS implementations to rely on append and impose
>>>>>>>> the constraint that append has to be supported (and it would be our job
>>>>>>>> to change that), but I would like to avoid the driver rejecting
>>>>>>>> initializing the device because current FS implementations have
>>>>>>>> implemented this logic.
>>>>>>>
>>>>>>> What is the difference between the driver rejecting drives and the FS rejecting
>>>>>>> the same drives ? That has the same end result to me: an entire class of devices
>>>>>>> cannot be used as desired by the user. Implementing zone append emulation avoids
>>>>>>> the rejection entirely while still allowing the FS to have a single write IO
>>>>>>> path, thus simplifying the code.
>>>>>>
>>>>>> The difference is that users that use a raw ZNS device submitting I/O
>>>>>> through the kernel would still be able to use these devices. The result
>>>>>> would be that the ZNS SSD is recognized and initialized, but the FS
>>>>>> format fails.
>>>>>
>>>>> I understand your point of view. Raw ZNS block device access by an application
>>>>> is of course a fine use case. SMR also has plenty of these.
>>>>>
>>>>> My point is that enabling this regular write/raw device use case should not
>>>>> prevent using btrfs or other kernel components that require zone append.
>>>>> Implementing zone append emulation in the NVMe/ZNS driver for devices without
>>>>> native support for the command enables *all* use cases without impacting the use
>>>>> case you are interested in.
>>>>>
>>>>> This approach is, in my opinion, far better. No one is left out and the user
>>>>> gains a flexible system with different setup capabilities. The user wins here.
>>>>
>>>> So, do you see a path where we enable the following:
>>>>
>>>>     1. We add the emulation layer to the NVMe driver for enabling FSs
>>>>        that currently support zoned devices
>>>>     2. We add a path from user-space (e.g., uring) to enable passthru
>>>>        commands to the NVMe driver to enable a raw ZNS path from the
>>>>        application. This path does not require the device to support
>>>>        append. An initial limitation is that I/Os must be of < 127 bio
>>>>        segments (same as append) to avoid bio splits
>>>>     3. As per above, the NVMe driver allows ZNS drives without append
>>>>        support to be initialized and the check moves to the FS
>>>>        formatting.
>>>>
>>>> 2 and 3 are something we have on our end. We need to rebase on top of
>>>> the patches you guys submitted. 1. is something we can help with after
>>>> that.
>>>>
>>>> Does the above make sense to you?
>>>
>>> Doing (1) first will give you a regular nvme namespace block device that you can
>>> use to send passthrough commands with ioctl(). So (1) gives you (2).
>>>
>>> However, I do not understand what io-uring has to do with passthrough. io-uring
>>> being a block layer functionality, I do not think you can use it to send
>>> passthrough commands to the driver. I may be wrong though, but my understanding
>>> is that for NVMe, passthrough is either ioctl() to the device file or the entire
>>> driver in user space with SPDK.
>>
>> We would like to have an async() passthru I/O path, and it seems possible
>> to do through uring. As mentioned in the other email, the goal is to
>> have the I/O go through the block layer for better integration, but this
>> work is still ongoing. See the other thread.
>
>Indeed. I do not think that is special to ZNS at all.

Agree.

>
>>> As for (3), I do not understand your point. If you have (1), then an FS
>>> requiring zone append will work.
>>
>> In order to enable (2), we need the device to come up first. At the
>> moment the NVMe driver rejects ZNS devices without append support, so
>> neither ioctl() nor the uring path will work.
>
>I repeat again here: if you implement zone append emulation, there is no reason
>to reject devices that do not have native zone append support. Zone append
>emulation gives you the block device: you can do ioctl(), implement the new
>async passthrough, and file systems requiring zone append work too. All problems
>solved.
>

Ok. We will get started with this to recognize the device. I believe we
have enough for a first round of patches.

Thanks for the help Damien!

Javier
Javier González June 17, 2020, 7:34 a.m. UTC | #43
On 17.06.2020 07:29, Damien Le Moal wrote:
>On 2020/06/17 16:11, Javier González wrote:
>> On 17.06.2020 06:54, Damien Le Moal wrote:
>>> On 2020/06/17 15:18, Javier González wrote:
>>>> On 17.06.2020 00:38, Damien Le Moal wrote:
>>>>> On 2020/06/17 1:13, Javier González wrote:
>>>>>> On 16.06.2020 09:07, Keith Busch wrote:
>>>>>>> On Tue, Jun 16, 2020 at 05:55:26PM +0200, Javier González wrote:
>>>>>>>> On 16.06.2020 08:48, Keith Busch wrote:
>>>>>>>>> On Tue, Jun 16, 2020 at 05:02:17PM +0200, Javier González wrote:
>>>>>>>>>> This depends very much on how the FS / application is managing
>>>>>>>>>> striping. At the moment our main use case is enabling user-space
>>>>>>>>>> applications submitting I/Os to raw ZNS devices through the kernel.
>>>>>>>>>>
>>>>>>>>>> Can we enable this use case to start with?
>>>>>>>>>
>>>>>>>>> I think this already provides that. You can set the nsid value to
>>>>>>>>> whatever you want in the passthrough interface, so a namespace block
>>>>>>>>> device is not required to issue I/O to a ZNS namespace from user space.
>>>>>>>>
>>>>>>>> Mmmmm. Problem now is that the check on the nvme driver prevents the ZNS
>>>>>>>> namespace from being initialized. Am I missing something?
>>>>>>>
>>>>>>> Hm, okay, it may not work for you. We need the driver to create at least
>>>>>>> one namespace so that we have tags and request_queue. If you have that,
>>>>>>> you can issue IO to any other attached namespace through the passthrough
>>>>>>> interface, but we can't assume there is an available namespace.
>>>>>>
>>>>>> That makes sense for now.
>>>>>>
>>>>>> The next step for us is to enable a passthrough on uring, making sure
>>>>>> that I/Os do not split.
>>>>>
>>>>> Passthrough as in "application issues directly NVMe commands" like for SG_IO
>>>>> with SCSI ? Or do you mean raw block device file accesses by the application,
>>>>> meaning that the IO goes through the block IO stack as opposed to directly going
>>>>> to the driver ?
>>>>>
>>>>> For the latter case, I do not think it is possible to guarantee that an IO will
>>>>> not get split unless we are talking about single page IOs (e.g. 4K on X86). See
>>>>> a somewhat similar request here and comments about it.
>>>>>
>>>>> https://www.spinics.net/lists/linux-block/msg55079.html
>>>>
>>>> At the moment we are doing the former, but it looks like a hack to me to
>>>> go directly to the NVMe driver.
>>>
>>> That is what the nvme driver ioctl() is for, no? An application can send an NVMe
>>> command directly to the driver with it. That is not a hack, but the regular way
>>> of doing passthrough for NVMe, isn't it?
>>
>> We have enabled it through uring to get async() passthru submission.
>> Looks like a hack at the moment, but we might just send an RFC to have
>> something concrete to base the discussion on.
>
>Yes, that would clarify things.
>
>>>> I was thinking that we could enable the second path by making use of
>>>> chunk_sectors and limiting the I/O size just as the append_max_io_size
>>>> does. Is this the completely wrong way of looking at it?
>>>
>>> The block layer cannot limit the size of a passthrough command since the command
>>> is protocol specific and the block layer is a protocol independent interface.
>>
>> Agree. This work depends on the application being aware of a max I/O size
>> at the moment. Down the road, we will remove (or at least limit a lot)
>> this constraint for ZNS devices that can eventually cache out-of-order
>> I/Os.
>
>I/Os with a data buffer all need mapping for DMA, no matter the device
>functionalities or the command being executed. With passthrough, I do not think
>it is possible to have the block layer limit anything. It will likely always be
>pass-or-fail. With passthrough, the application needs to understand what it is
>doing.

Yes. It is definitely for applications that are implementing directly
zone-aware logic.

>
>>
>>> SCSI SG does not split passthrough requests, it cannot. For passthrough
>>> commands, the command buffer can be dma-mapped or it cannot. If mapping
>>> succeeds, the command is issued. If it cannot, the command is failed. At least,
>>> that is my understanding of how the stack is working.
>>
>> I am not familiar with SCSI SG. This looks like how the ioctl() passthru
>> works in NVMe, but as mentioned above, we would like to enable an
>> async() passthru path.
>
>That is done with bsg for SCSI I believe. You may want to have a look around
>there. The SG driver used to have the write() system call mapped to "issuing a
>command" and read() for "getting a command result". That was removed however.
>But I think bsg has a replacement for that defunct async passthrough interface.
>Not sure. I have not looked at that for a while.
>

Thanks for the pointer; I was not aware of this. We will look into it.

Thanks again for the help Damien!
Javier
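
To make the zone-aware application logic mentioned above concrete: with regular writes over passthrough, the application itself has to track the write pointer and keep at most one write in flight per zone, since nothing underneath protects the ordering. A minimal user-space sketch, reusing the hypothetical zns_write_sync() helper shown earlier in this thread:

#include <stdint.h>
#include <pthread.h>

/* Hypothetical passthrough write helper sketched earlier. */
extern int zns_write_sync(int fd, uint32_t nsid, void *buf,
			  uint32_t nlb, uint64_t slba);

struct app_zone {
	pthread_mutex_t lock;	/* enforces QD=1 per zone */
	uint64_t wp;		/* application-tracked write pointer */
};

static int app_zone_write(struct app_zone *z, int fd, uint32_t nsid,
			  void *buf, uint32_t nlb)
{
	int ret;

	pthread_mutex_lock(&z->lock);
	/* Writes must land exactly at the write pointer, so issue them
	 * one at a time and only advance the cached wp on success. */
	ret = zns_write_sync(fd, nsid, buf, nlb, z->wp);
	if (!ret)
		z->wp += nlb;
	pthread_mutex_unlock(&z->lock);
	return ret;
}

On error, the cached pointer would have to be refreshed from a zone report before the zone is written again.
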
Christoph Hellwig June 17, 2020, 7:43 a.m. UTC | #44
On Tue, Jun 16, 2020 at 12:41:42PM +0200, Javier González wrote:
> On 16.06.2020 08:34, Keith Busch wrote:
>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>> Command Set Identifier reported in the namespaces Namespace
>> Identification Descriptor list. A successfully discovered Zoned
>> Namespace will be registered with the block layer as a host managed
>> zoned block device with Zone Append command support. A namespace that
>> does not support append is not supported by the driver.
>
> Why are we enforcing the append command? Append is optional on the
> current ZNS specification, so we should not make this mandatory in the
> implementation. See specifics below.

Because Append is the way to go and we've moved the Linux zoned block
I/O stack to require it, as should have been obvious to anyone
following linux-block in the last few months.  I also have to say I'm
really tired of the stupid politics that your company started in the
NVMe working group, and will say that these do not matter for Linux
development at all.  If you think it is worthwhile to support devices
without Zone Append you can contribute support for them on top of this
series by porting the SCSI Zone Append Emulation code to NVMe.

And I'm not even going to read the rest of this thread as I'm on a
vacation that I badly needed because of the Samsung TWG bullshit.
Martin K. Petersen June 17, 2020, 12:01 p.m. UTC | #45
> Because Append is the way to go and we've moved the Linux zoned block
> I/O stack to require it,

Just to add some historical context: The first discussions about how to
support block devices with a non-random write model in Linux happened
maybe a decade ago.

Drive vendors came to LSF/MM to solicit feedback on how Linux could
support impending SMR devices. We spent a long time going over various
approaches, including some that are similar to what is now being
entertained as alternative to Append. The conclusion back then was that
an Append-like model (tell-us-where-you-put-it) was the only reasonable
way to accommodate these devices in Linux given how our filesystems and
I/O stack worked.

Consequently, I don't think it is at all unreasonable for us to focus on
devices that implement that mode of operation in the kernel. This is
exactly what we as a community asked the storage industry to
provide!
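
To make the tell-us-where-you-put-it model concrete: a Zone Append addresses the start of a zone rather than a precise LBA, and the device returns the LBA it actually used in the completion. A minimal sketch using the 64-bit-result passthrough ioctl; the Zone Append opcode is 0x7d per the ZNS TP, the 4K LBA format is an assumption, and error handling is omitted:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

/* Append nlb blocks to the zone starting at zslba; returns the LBA
 * the device assigned, or (uint64_t)-1 on failure. */
static uint64_t zns_append(int fd, uint32_t nsid, void *buf,
			   uint32_t nlb, uint64_t zslba)
{
	struct nvme_passthru_cmd64 cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x7d;			/* Zone Append */
	cmd.nsid     = nsid;
	cmd.addr     = (uint64_t)(uintptr_t)buf;
	cmd.data_len = nlb * 4096;		/* assumes a 4K LBA format */
	cmd.cdw10    = zslba & 0xffffffff;	/* ZSLBA, low 32 bits */
	cmd.cdw11    = zslba >> 32;		/* ZSLBA, high 32 bits */
	cmd.cdw12    = nlb - 1;			/* 0's based block count */

	if (ioctl(fd, NVME_IOCTL_IO64_CMD, &cmd) != 0)
		return (uint64_t)-1;
	return cmd.result;			/* assigned starting LBA */
}

Because the device resolves the final LBA, several appends to the same zone can be in flight concurrently; the completions tell the host where each write landed, which is what removes the per-zone write lock from the FS write path.
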
Javier González June 17, 2020, 2:42 p.m. UTC | #46
On 17.06.2020 09:43, Christoph Hellwig wrote:
>On Tue, Jun 16, 2020 at 12:41:42PM +0200, Javier González wrote:
>> On 16.06.2020 08:34, Keith Busch wrote:
>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>> Command Set Identifier reported in the namespaces Namespace
>>> Identification Descriptor list. A successfully discovered Zoned
>>> Namespace will be registered with the block layer as a host managed
>>> zoned block device with Zone Append command support. A namespace that
>>> does not support append is not supported by the driver.
>>
>> Why are we enforcing the append command? Append is optional on the
>> current ZNS specification, so we should not make this mandatory in the
>> implementation. See specifics below.
>
>Because Append is the way to go and we've moved the Linux zoned block
>I/O stack to require it, as should have been obvious to anyone
>following linux-block in the last few months.  I also have to say I'm
>really tired of the stupid politics that your company started in the
>NVMe working group, and will say that these do not matter for Linux
>development at all.  If you think it is worthwhile to support devices
>without Zone Append you can contribute support for them on top of this
>series by porting the SCSI Zone Append Emulation code to NVMe.
>
>And I'm not even going to read the rest of this thread as I'm on a
>vacation that I badly needed because of the Samsung TWG bullshit.

My intention is to support some Samsung ZNS devices that will not enable
append. I do not think this is an unreasonable thing to do. How / why
append ended up being an optional feature in the ZNS TP is orthogonal to
this conversation. Bullshit or not, it ends up on devices that we would
like to support one way or another.

After the discussion with Damien and Keith I have a clear idea of how we
can do this and we will go ahead with the work.

I apologize that this conversation got mixed with leftovers from NVMe
TWG internals. This is a public mailing list, so I guess anyone can
comment on it.

Thanks,
Javier
Javier González June 17, 2020, 3 p.m. UTC | #47
On 17.06.2020 08:01, Martin K. Petersen wrote:
>
>> Because Append is the way to go and we've moved the Linux zoned block
>> I/O stack to required it,
>
>Just to add some historical context: The first discussions about how to
>support block devices with a non-random write model in Linux happened
>maybe a decade ago.
>
>Drive vendors came to LSF/MM to solicit feedback on how Linux could
>support impending SMR devices. We spent a long time going over various
>approaches, including some that are similar to what is now being
>entertained as alternative to Append. The conclusion back then was that
>an Append-like model (tell-us-where-you-put-it) was the only reasonable
>way to accommodate these devices in Linux given how our filesystems and
>I/O stack worked.
>
>Consequently, I don't think it is at all unreasonable for us to focus on
>devices that implement that mode of operation in the kernel. This is
>exactly what we as a community asked the storage industry to
>provide!
>

Martin,

Thanks for sharing the historical context. I agree that append solves a
number of problems in Linux - we have had internal implementations of
append for a long time (and are sending patches extending support for it
later today).

This said, there are users that do not see append as a good fit for
their needs and we would like to support them too.

We will go back to our code and re-iterate based on the feedback we have
gotten out of this thread.

Thanks,
Javier
Matias Bjorling June 17, 2020, 5:57 p.m. UTC | #48
On 17/06/2020 16.42, Javier González wrote:
> On 17.06.2020 09:43, Christoph Hellwig wrote:
>> On Tue, Jun 16, 2020 at 12:41:42PM +0200, Javier González wrote:
>>> On 16.06.2020 08:34, Keith Busch wrote:
>>>> Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>> in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>> Command Set Identifier reported in the namespaces Namespace
>>>> Identification Descriptor list. A successfully discovered Zoned
>>>> Namespace will be registered with the block layer as a host managed
>>>> zoned block device with Zone Append command support. A namespace that
>>>> does not support append is not supported by the driver.
>>>
>>> Why are we enforcing the append command? Append is optional on the
>>> current ZNS specification, so we should not make this mandatory in the
>>> implementation. See specifics below.
>>
>> Because Append is the way to go and we've moved the Linux zoned block
>> I/O stack to require it, as should have been obvious to anyone
>> following linux-block in the last few months.  I also have to say I'm
>> really tired of the stupid politics that your company started in the
>> NVMe working group, and will say that these do not matter for Linux
>> development at all.  If you think it is worthwhile to support devices
>> without Zone Append you can contribute support for them on top of this
>> series by porting the SCSI Zone Append Emulation code to NVMe.
>>
>> And I'm not even going to read the rest of this thread as I'm on a
>> vacation that I badly needed because of the Samsung TWG bullshit.
>
> My intention is to support some Samsung ZNS devices that will not enable
> append. I do not think this is an unreasonable thing to do. How / why
> append ended up being an optional feature in the ZNS TP is orthogonal to
> this conversation. Bullshit or not, it ends up on devices that we would
> like to support one way or another.

I do not believe any of us have said that it is unreasonable to support. 
We've only asked that you make the patches for it.

All of us have communicated why Zone Append is a great addition to the 
Linux kernel. Also, as Christoph points out, this has not been a secret 
for the past couple of months, and, as Martin pointed out, has been a 
wanted feature in the Linux community for the past decade.

I do want to politely point out that you've got a very clear signal 
from the key storage maintainers. Each of them is among the planet's 
best and most well-respected software developers, who have literally 
built the storage stack that most of the world depends on. The storage 
stack that recently sent manned rockets into space. They each 
unanimously said that the Zone Append command is the right approach 
for the Linux kernel to reduce the overhead of I/O tracking for zoned 
block devices. It may be worth bringing this information to your 
engineering organization, and also potentially considering Zone Append 
support for devices that you intend to use with the Linux kernel 
storage stack.

Another approach is to use SPDK and bypass the Linux kernel. This 
might even be an advantage: your customers do not have to wait on a 
Linux distribution shipping a long-term release before they can even 
get started and deploy in volume. I.e., they will actually get to 
market faster, and your company will be able to sell more drives.
Javier González June 17, 2020, 6:28 p.m. UTC | #49
On 17.06.2020 19:57, Matias Bjørling wrote:
>On 17/06/2020 16.42, Javier González wrote:
>>On 17.06.2020 09:43, Christoph Hellwig wrote:
>>>On Tue, Jun 16, 2020 at 12:41:42PM +0200, Javier González wrote:
>>>>On 16.06.2020 08:34, Keith Busch wrote:
>>>>>Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined
>>>>>in NVM Express TP4053. Zoned namespaces are discovered based on their
>>>>>Command Set Identifier reported in the namespaces Namespace
>>>>>Identification Descriptor list. A successfully discovered Zoned
>>>>>Namespace will be registered with the block layer as a host managed
>>>>>zoned block device with Zone Append command support. A namespace that
>>>>>does not support append is not supported by the driver.
>>>>
>>>>Why are we enforcing the append command? Append is optional on the
>>>>current ZNS specification, so we should not make this mandatory in the
>>>>implementation. See specifics below.
>>>
>>>Because Append is the way to go and we've moved the Linux zoned block
>>>I/O stack to require it, as should have been obvious to anyone
>>>following linux-block in the last few months.  I also have to say I'm
>>>really tired of the stupid politics that your company started in the
>>>NVMe working group, and will say that these do not matter for Linux
>>>development at all.  If you think it is worthwhile to support devices
>>>without Zone Append you can contribute support for them on top of this
>>>series by porting the SCSI Zone Append Emulation code to NVMe.
>>>
>>>And I'm not even going to read the rest of this thread as I'm on a
>>>vacation that I badly needed because of the Samsung TWG bullshit.
>>
>>My intention is to support some Samsung ZNS devices that will not enable
>>append. I do not think this is an unreasonable thing to do. How / why
>>append ended up being an optional feature in the ZNS TP is orthogonal to
>>this conversation. Bullshit or not, it ends up on devices that we would
>>like to support one way or another.
>
>I do not believe any of us have said that it is unreasonable to 
>support. We've only asked that you make the patches for it.
>
>All of us have communicated why Zone Append is a great addition to the 
>Linux kernel. Also, as Christoph points out, this has not been a 
>secret for the past couple of months, and, as Martin pointed out, has 
>been a wanted feature in the Linux community for the past decade.

>
>I do want to politely point out that you've got a very clear signal 
>from the key storage maintainers. Each of them is among the planet's 
>best and most well-respected software developers, who have literally 
>built the storage stack that most of the world depends on. The storage 
>stack that recently sent manned rockets into space. 
>They each unanimously said that the Zone Append command is the right 
>approach for the Linux kernel to reduce the overhead of I/O tracking 
>for zoned block devices. It may be worth bringing this information to 
>your engineering organization, and also potentially considering Zone 
>Append support for devices that you intend to use with the Linux 
>kernel storage stack.

I understand and I have never said the opposite. Append is a great
addition that we also have been working on for several months (see
patch additions from today). We just have a couple of use cases where
append is not required and I would like to make sure that they are
supported.

At the end of the day, the only thing I have disagreed on is that the
NVMe driver rejects ZNS SSDs that do not support append, as opposed to
doing this instead when an in-kernel user wants to utilize the drive
(e.g., formatting a FS with zoned support). This would allow _today_
ioctl() passthru to work for normal writes.

I still believe the above would be a more inclusive solution with the
current ZNS specification, but I can see that the general consensus is
different.

So we will go back, apply the feedback that we got and return with an
approach that better fits the ecosystem.

>
>Another approach is to use SPDK and bypass the Linux kernel. This 
>might even be an advantage: your customers do not have to wait on 
>a Linux distribution shipping a long-term release before 
>they can even get started and deploy in volume. I.e., they will 
>actually get to market faster, and your company will be able to sell 
>more drives.

I think I will refrain from discussing our business strategy on an open
mailing list. Appreciate the feedback though. Very insightful.

Thanks,
Javier
Matias Bjørling June 17, 2020, 6:55 p.m. UTC | #50
> -----Original Message-----
> From: Javier González <javier@javigon.com>
> Sent: Wednesday, 17 June 2020 20.29
> To: Matias Bjørling <mb@lightnvm.io>
> Cc: Christoph Hellwig <hch@lst.de>; Keith Busch <Keith.Busch@wdc.com>;
> linux-nvme@lists.infradead.org; linux-block@vger.kernel.org; Damien Le Moal
> <Damien.LeMoal@wdc.com>; Matias Bjorling <Matias.Bjorling@wdc.com>;
> Sagi Grimberg <sagi@grimberg.me>; Jens Axboe <axboe@kernel.dk>; Hans
> Holmberg <Hans.Holmberg@wdc.com>; Dmitry Fomichev
> <Dmitry.Fomichev@wdc.com>; Ajay Joshi <Ajay.Joshi@wdc.com>; Aravind
> Ramesh <Aravind.Ramesh@wdc.com>; Niklas Cassel
> <Niklas.Cassel@wdc.com>; Judy Brock <judy.brock@samsung.com>
> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
> 
> On 17.06.2020 19:57, Matias Bjørling wrote:
> >On 17/06/2020 16.42, Javier González wrote:
> >>On 17.06.2020 09:43, Christoph Hellwig wrote:
> >>>On Tue, Jun 16, 2020 at 12:41:42PM +0200, Javier González wrote:
> >>>>On 16.06.2020 08:34, Keith Busch wrote:
> >>>>>Add support for NVM Express Zoned Namespaces (ZNS) Command Set
> >>>>>defined in NVM Express TP4053. Zoned namespaces are discovered
> >>>>>based on their Command Set Identifier reported in the namespaces
> >>>>>Namespace Identification Descriptor list. A successfully discovered
> >>>>>Zoned Namespace will be registered with the block layer as a host
> >>>>>managed zoned block device with Zone Append command support. A
> >>>>>namespace that does not support append is not supported by the driver.
> >>>>
> >>>>Why are we enforcing the append command? Append is optional on the
> >>>>current ZNS specification, so we should not make this mandatory in
> >>>>the implementation. See specifics below.
> >>>
> >>>Because Append is the way to go and we've moved the Linux zoned block
> >>>I/O stack to require it, as should have been obvious to anyone
> >>>following linux-block in the last few months.  I also have to say I'm
> >>>really tired of the stupid politics that your company started in the
> >>>NVMe working group, and will say that these do not matter for Linux
> >>>development at all.  If you think it is worthwhile to support devices
> >>>without Zone Append you can contribute support for them on top of
> >>>this series by porting the SCSI Zone Append Emulation code to NVMe.
> >>>
> >>>And I'm not even going to read the rest of this thread as I'm on a
> >>>vacation that I badly needed because of the Samsung TWG bullshit.
> >>
> >>My intention is to support some Samsung ZNS devices that will not
> >>enable append. I do not think this is an unreasonable thing to do. How
> >>/ why append ended up being an optional feature in the ZNS TP is
> >>orthogonal to this conversation. Bullshit or not, it ends up on
> >>devices that we would like to support one way or another.
> >
> >I do not believe any of us have said that it is unreasonable to
> >support. We've only asked that you make the patches for it.
> >
> >All of us have communicated why Zone Append is a great addition to the
> >Linux kernel. Also, as Christoph points out, this has not been a secret
> >for the past couple of months, and, as Martin pointed out, has been a
> >wanted feature in the Linux community for the past decade.
> 
> >
> >I do want to politely point out that you've got a very clear signal
> >from the key storage maintainers. Each of them is among the planet's
> >best and most well-respected software developers, who have literally
> >built the storage stack that most of the world depends on. The storage
> >stack that recently sent manned rockets into space.
> >They each unanimously said that the Zone Append command is the right
> >approach for the Linux kernel to reduce the overhead of I/O tracking
> >for zoned block devices. It may be worth bringing this information to
> >your engineering organization, and also potentially considering Zone
> >Append support for devices that you intend to use with the Linux
> >kernel storage stack.
> 
> I understand and I have never said the opposite.
>
> Append is a great addition that

One may have interpreted your SDC EMEA talk the opposite way. It was not very neutral towards Zone Append, but that is of course the least of its problems. But I am happy to hear that you've changed your opinion.

> we also have been working on for several months (see patches additions from
> today). We just have a couple of use cases where append is not required and I
> would like to make sure that they are supported.
> 
> At the end of the day, the only thing I have disagreed on is that the NVMe
> driver rejects ZNS SSDs that do not support append, as opposed to doing this
> instead when an in-kernel user wants to utilize the drive (e.g., formatting a FS
> with zoned support) This would allow _today_
> ioctl() passthru to work for normal writes.
> 
> I still believe the above would be a more inclusive solution with the current ZNS
> specification, but I can see that the general consensus is different.

The comment from the community, including me, is that there is a general requirement for the Zone Append command when utilizing zoned storage devices. This is similar to implementing an API that one wants to support. It is not a general consensus or opinion. It is a hard fact of how the Linux kernel source code is implemented at this point. One must implement support for ZNS SSDs that do not expose the Zone Append command natively. Period.

> 
> So we will go back, apply the feedback that we got and return with an
> approach that better fits the ecosystem.
> 
> >
> >Another approach, is to use SPDK, and bypass the Linux kernel. This
> >might even be an advantage, your customers does not have to wait on the
> >Linux distribution being released with a long term release, before they
> >can even get started and deploy in volume. I.e., they will actually get
> >faster to market, and your company will be able to sell more drives.
> 
> I think I will refrain from discussing our business strategy on an open mailing
> list. Appreciate the feedback though. Very insightful.

I am not asking you to discuss your business strategy on the mailing list. My comment was meant as genuine advice that may save a lot of work, and might even get better results.

> 
> Thanks,
> Javier
Javier González June 17, 2020, 7:09 p.m. UTC | #51
On 17.06.2020 18:55, Matias Bjorling wrote:
>> On 17.06.2020 19:57, Matias Bjørling wrote:
>> >On 17/06/2020 16.42, Javier González wrote:
>> >>On 17.06.2020 09:43, Christoph Hellwig wrote:
>> >>>On Tue, Jun 16, 2020 at 12:41:42PM +0200, Javier González wrote:
>> >>>>On 16.06.2020 08:34, Keith Busch wrote:
>> >>>>>Add support for NVM Express Zoned Namespaces (ZNS) Command Set
>> >>>>>defined in NVM Express TP4053. Zoned namespaces are discovered
>> >>>>>based on their Command Set Identifier reported in the namespaces
>> >>>>>Namespace Identification Descriptor list. A successfully discovered
>> >>>>>Zoned Namespace will be registered with the block layer as a host
>> >>>>>managed zoned block device with Zone Append command support. A
>> >>>>>namespace that does not support append is not supported by the driver.
>> >>>>
>> >>>>Why are we enforcing the append command? Append is optional on the
>> >>>>current ZNS specification, so we should not make this mandatory in
>> >>>>the implementation. See specifics below.
>> >>>
>> >>>Because Append is the way to go and we've moved the Linux zoned block
>> >>>I/O stack to required it, as should have been obvious to anyone
>> >>>following linux-block in the last few months.  I also have to say I'm
>> >>>really tired of the stupid politics tha your company started in the
>> >>>NVMe working group, and will say that these do not matter for Linux
>> >>>development at all.  If you think it is worthwhile to support devices
>> >>>without Zone Append you can contribute support for them on top of
>> >>>this series by porting the SCSI Zone Append Emulation code to NVMe.
>> >>>
>> >>>And I'm not even going to read the rest of this thread as I'm on a
>> >>>vacation that I badly needed because of the Samsung TWG bullshit.
>> >>
>> >>My intention is to support some Samsung ZNS devices that will not
>> >>enable append. I do not think this is an unreasonable thing to do. How
>> >>/ why append ended up being an optional feature in the ZNS TP is
>> >>orthogonal to this conversation. Bullshit or not, it ends up on
>> >>devices that we would like to support one way or another.
>> >
>> >I do not believe any of us have said that it is unreasonable to
>> >support. We've only asked that you make the patches for it.
>> >
>> >All of us have communicated why Zone Append is a great addition to the
>> >Linux kernel. Also, as Christoph points out, this has not been a secret
>> >for the past couple of months, and as Martin pointed out, have been a
>> >wanted feature for the past decade in the Linux community.
>>
>> >
>> >I do want to politely point out, that you've got a very clear signal
>> >from the key storage maintainers. Each of them is part of the planet's
>> >best of the best and most well-respected software developers, that
>> >literally have built the storage stack that most of the world depends
>> >on. The storage stack that recently sent manned rockets into space.
>> >They each unanimously said that the Zone Append command is the right
>> >approach for the Linux kernel to reduce the overhead of I/O tracking
>> >for zoned block devices. It may be worth bringing this information to
>> >your engineering organization, and also potentially consider Zone
>> >Append support for devices that you intend to used with the Linux
>> >kernel storage stack.
>>
>> I understand and I have never said the opposite.
>>
>> Append is a great addition that
>
>One may have interpreted your SDC EMEA talk the opposite. It was not
>very neutral towards Zone Append, but that is of cause one of its least
>problems. But I am happy to hear that you've changed your opinion.

As you are well aware, there are some cases where append introduces
challenges. This is well documented in the literature around nameless
writes. Part of the talk was about presenting an alternative for these
particular use cases.

This said, I am not afraid of changing my point of view when I am proven
wrong.

>
>> we also have been working on for several months (see patches additions from
>> today). We just have a couple of use cases where append is not required and I
>> would like to make sure that they are supported.
>>
>> At the end of the day, the only thing I have disagreed on is that the NVMe
>> driver rejects ZNS SSDs that do not support append, as opposed to doing this
>> instead when an in-kernel user wants to utilize the drive (e.g., formatting a FS
>> with zoned support) This would allow _today_
>> ioctl() passthru to work for normal writes.
>>
>> I still believe the above would be a more inclusive solution with the current ZNS
>> specification, but I can see that the general consensus is different.
>
>The comment from the community, including me, is that there is a
>general requirement for Zone Append command when utilizing Zoned
>storage devices. This is similar to implement an API that one wants to
>support. It is not a general consensus or opinion. It is hard facts and
>how the Linux kernel source code is implemented at this point. One must
>implement support for ZNS SSDs that do not expose the Zone Append
>command natively. Period.

Again, I am not saying the opposite. Read the 2 lines below...

>>
>> So we will go back, apply the feedback that we got and return with an
>> approach that better fits the ecosystem.
>>
>> >
>> >Another approach, is to use SPDK, and bypass the Linux kernel. This
>> >might even be an advantage, your customers does not have to wait on the
>> >Linux distribution being released with a long term release, before they
>> >can even get started and deploy in volume. I.e., they will actually get
>> >faster to market, and your company will be able to sell more drives.
>>
>> I think I will refrain from discussing our business strategy on an open mailing
>> list. Appreciate the feedback though. Very insightful.
>
>I am not asking for you to discuss your business strategy on the mailing list. My comment was to give you genuinely advise that may save a lot of work, and might even get better results.
>
>>
>> Thanks,
>> Javier
Matias Bjorling June 17, 2020, 7:23 p.m. UTC | #52
On 17/06/2020 21.09, Javier González wrote:
> On 17.06.2020 18:55, Matias Bjorling wrote:
>>> On 17.06.2020 19:57, Matias Bjørling wrote:
>>> >On 17/06/2020 16.42, Javier González wrote:
>>> >>On 17.06.2020 09:43, Christoph Hellwig wrote:
>>> >>>On Tue, Jun 16, 2020 at 12:41:42PM +0200, Javier González wrote:
>>> >>>>On 16.06.2020 08:34, Keith Busch wrote:
>>> >>>>>Add support for NVM Express Zoned Namespaces (ZNS) Command Set
>>> >>>>>defined in NVM Express TP4053. Zoned namespaces are discovered
>>> >>>>>based on their Command Set Identifier reported in the namespaces
>>> >>>>>Namespace Identification Descriptor list. A successfully 
>>> discovered
>>> >>>>>Zoned Namespace will be registered with the block layer as a host
>>> >>>>>managed zoned block device with Zone Append command support. A
>>> >>>>>namespace that does not support append is not supported by the 
>>> driver.
>>> >>>>
>>> >>>>Why are we enforcing the append command? Append is optional on the
>>> >>>>current ZNS specification, so we should not make this mandatory in
>>> >>>>the implementation. See specifics below.
>>> >>>
>>> >>>Because Append is the way to go and we've moved the Linux zoned 
>>> block
>>> >>>I/O stack to required it, as should have been obvious to anyone
>>> >>>following linux-block in the last few months.  I also have to say 
>>> I'm
>>> >>>really tired of the stupid politics tha your company started in the
>>> >>>NVMe working group, and will say that these do not matter for Linux
>>> >>>development at all.  If you think it is worthwhile to support 
>>> devices
>>> >>>without Zone Append you can contribute support for them on top of
>>> >>>this series by porting the SCSI Zone Append Emulation code to NVMe.
>>> >>>
>>> >>>And I'm not even going to read the rest of this thread as I'm on a
>>> >>>vacation that I badly needed because of the Samsung TWG bullshit.
>>> >>
>>> >>My intention is to support some Samsung ZNS devices that will not
>>> >>enable append. I do not think this is an unreasonable thing to do. 
>>> How
>>> >>/ why append ended up being an optional feature in the ZNS TP is
>>> >>orthogonal to this conversation. Bullshit or not, it ends up on
>>> >>devices that we would like to support one way or another.
>>> >
>>> >I do not believe any of us have said that it is unreasonable to
>>> >support. We've only asked that you make the patches for it.
>>> >
>>> >All of us have communicated why Zone Append is a great addition to the
>>> >Linux kernel. Also, as Christoph points out, this has not been a 
>>> secret
>>> >for the past couple of months, and as Martin pointed out, have been a
>>> >wanted feature for the past decade in the Linux community.
>>>
>>> >
>>> >I do want to politely point out, that you've got a very clear signal
>>> >from the key storage maintainers. Each of them is part of the planet's
>>> >best of the best and most well-respected software developers, that
>>> >literally have built the storage stack that most of the world depends
>>> >on. The storage stack that recently sent manned rockets into space.
>>> >They each unanimously said that the Zone Append command is the right
>>> >approach for the Linux kernel to reduce the overhead of I/O tracking
>>> >for zoned block devices. It may be worth bringing this information to
>>> >your engineering organization, and also potentially consider Zone
>>> >Append support for devices that you intend to used with the Linux
>>> >kernel storage stack.
>>>
>>> I understand and I have never said the opposite.
>>>
>>> Append is a great addition that
>>
>> One may have interpreted your SDC EMEA talk the opposite. It was not
>> very neutral towards Zone Append, but that is of cause one of its least
>> problems. But I am happy to hear that you've changed your opinion.
>
> As you are well aware, there are some cases where append introduces
> challenges. This is well-documented on the bibliography around nameless
> writes. 

The nameless writes idea is vastly different from Zone Append, which
has few of the drawbacks of nameless writes, so that well-documented
literature does not apply.

> Part of the talk was on presenting an alternative for these
> particular use cases.
>
> This said, I am not afraid of changing my point of view when I am proven
> wrong.
>
>>
>>> we also have been working on for several months (see patches 
>>> additions from
>>> today). We just have a couple of use cases where append is not 
>>> required and I
>>> would like to make sure that they are supported.
>>>
>>> At the end of the day, the only thing I have disagreed on is that 
>>> the NVMe
>>> driver rejects ZNS SSDs that do not support append, as opposed to 
>>> doing this
>>> instead when an in-kernel user wants to utilize the drive (e.g., 
>>> formatting a FS
>>> with zoned support) This would allow _today_
>>> ioctl() passthru to work for normal writes.
>>>
>>> I still believe the above would be a more inclusive solution with 
>>> the current ZNS
>>> specification, but I can see that the general consensus is different.
>>
>> The comment from the community, including me, is that there is a
>> general requirement for Zone Append command when utilizing Zoned
>> storage devices. This is similar to implement an API that one wants to
>> support. It is not a general consensus or opinion. It is hard facts and
>> how the Linux kernel source code is implemented at this point. One must
>> implement support for ZNS SSDs that do not expose the Zone Append
>> command natively. Period.
>
> Again, I am not saying the opposite. Read the 2 lines below...

My point with the above paragraph was to clarify that we are not trying
to be difficult or opinionated, but to point out that the reason we give
you this specific feedback is that this is how the kernel works today.

>
>>>
>>> So we will go back, apply the feedback that we got and return with an
>>> approach that better fits the ecosystem.
>>>
>>> >
>>> >Another approach, is to use SPDK, and bypass the Linux kernel. This
>>> >might even be an advantage, your customers does not have to wait on 
>>> the
>>> >Linux distribution being released with a long term release, before 
>>> they
>>> >can even get started and deploy in volume. I.e., they will actually 
>>> get
>>> >faster to market, and your company will be able to sell more drives.
>>>
>>> I think I will refrain from discussing our business strategy on an 
>>> open mailing
>>> list. Appreciate the feedback though. Very insightful.
>>
>> I am not asking for you to discuss your business strategy on the 
>> mailing list. My comment was to give you genuinely advise that may 
>> save a lot of work, and might even get better results.
>>
>>>
>>> Thanks,
>>> Javier
>
Javier González June 17, 2020, 7:40 p.m. UTC | #53
On 17.06.2020 21:23, Matias Bjørling wrote:
>On 17/06/2020 21.09, Javier González wrote:
>>On 17.06.2020 18:55, Matias Bjorling wrote:
>>>>On 17.06.2020 19:57, Matias Bjørling wrote:
>>>>>On 17/06/2020 16.42, Javier González wrote:
>>>>>>On 17.06.2020 09:43, Christoph Hellwig wrote:
>>>>>>>On Tue, Jun 16, 2020 at 12:41:42PM +0200, Javier González wrote:
>>>>>>>>On 16.06.2020 08:34, Keith Busch wrote:
>>>>>>>>>Add support for NVM Express Zoned Namespaces (ZNS) Command Set
>>>>>>>>>defined in NVM Express TP4053. Zoned namespaces are discovered
>>>>>>>>>based on their Command Set Identifier reported in the namespaces
>>>>>>>>>Namespace Identification Descriptor list. A successfully 
>>>>discovered
>>>>>>>>>Zoned Namespace will be registered with the block layer as a host
>>>>>>>>>managed zoned block device with Zone Append command support. A
>>>>>>>>>namespace that does not support append is not supported by 
>>>>the driver.
>>>>>>>>
>>>>>>>>Why are we enforcing the append command? Append is optional on the
>>>>>>>>current ZNS specification, so we should not make this mandatory in
>>>>>>>>the implementation. See specifics below.
>>>>>>>
>>>>>>>Because Append is the way to go and we've moved the Linux 
>>>>zoned block
>>>>>>>I/O stack to required it, as should have been obvious to anyone
>>>>>>>following linux-block in the last few months.  I also have to 
>>>>say I'm
>>>>>>>really tired of the stupid politics tha your company started in the
>>>>>>>NVMe working group, and will say that these do not matter for Linux
>>>>>>>development at all.  If you think it is worthwhile to support 
>>>>devices
>>>>>>>without Zone Append you can contribute support for them on top of
>>>>>>>this series by porting the SCSI Zone Append Emulation code to NVMe.
>>>>>>>
>>>>>>>And I'm not even going to read the rest of this thread as I'm on a
>>>>>>>vacation that I badly needed because of the Samsung TWG bullshit.
>>>>>>
>>>>>>My intention is to support some Samsung ZNS devices that will not
>>>>>>enable append. I do not think this is an unreasonable thing to 
>>>>do. How
>>>>>>/ why append ended up being an optional feature in the ZNS TP is
>>>>>>orthogonal to this conversation. Bullshit or not, it ends up on
>>>>>>devices that we would like to support one way or another.
>>>>>
>>>>>I do not believe any of us have said that it is unreasonable to
>>>>>support. We've only asked that you make the patches for it.
>>>>>
>>>>>All of us have communicated why Zone Append is a great addition to the
>>>>>Linux kernel. Also, as Christoph points out, this has not been 
>>>>a secret
>>>>>for the past couple of months, and as Martin pointed out, have been a
>>>>>wanted feature for the past decade in the Linux community.
>>>>
>>>>>
>>>>>I do want to politely point out, that you've got a very clear signal
>>>>>from the key storage maintainers. Each of them is part of the planet's
>>>>>best of the best and most well-respected software developers, that
>>>>>literally have built the storage stack that most of the world depends
>>>>>on. The storage stack that recently sent manned rockets into space.
>>>>>They each unanimously said that the Zone Append command is the right
>>>>>approach for the Linux kernel to reduce the overhead of I/O tracking
>>>>>for zoned block devices. It may be worth bringing this information to
>>>>>your engineering organization, and also potentially consider Zone
>>>>>Append support for devices that you intend to used with the Linux
>>>>>kernel storage stack.
>>>>
>>>>I understand and I have never said the opposite.
>>>>
>>>>Append is a great addition that
>>>
>>>One may have interpreted your SDC EMEA talk the opposite. It was not
>>>very neutral towards Zone Append, but that is of cause one of its least
>>>problems. But I am happy to hear that you've changed your opinion.
>>
>>As you are well aware, there are some cases where append introduces
>>challenges. This is well-documented on the bibliography around nameless
>>writes.
>
>The nameless writes idea is vastly different from Zone append, and 
>have little of the drawbacks of nameless writes, which makes the 
>well-documented literature not apply.

You can have that conversation with your customer base.

>
>>Part of the talk was on presenting an alternative for these
>>particular use cases.
>>
>>This said, I am not afraid of changing my point of view when I am proven
>>wrong.
>>
>>>
>>>>we also have been working on for several months (see patches 
>>>>additions from
>>>>today). We just have a couple of use cases where append is not 
>>>>required and I
>>>>would like to make sure that they are supported.
>>>>
>>>>At the end of the day, the only thing I have disagreed on is 
>>>>that the NVMe
>>>>driver rejects ZNS SSDs that do not support append, as opposed 
>>>>to doing this
>>>>instead when an in-kernel user wants to utilize the drive (e.g., 
>>>>formatting a FS
>>>>with zoned support) This would allow _today_
>>>>ioctl() passthru to work for normal writes.
>>>>
>>>>I still believe the above would be a more inclusive solution 
>>>>with the current ZNS
>>>>specification, but I can see that the general consensus is different.
>>>
>>>The comment from the community, including me, is that there is a
>>>general requirement for Zone Append command when utilizing Zoned
>>>storage devices. This is similar to implement an API that one wants to
>>>support. It is not a general consensus or opinion. It is hard facts and
>>>how the Linux kernel source code is implemented at this point. One must
>>>implement support for ZNS SSDs that do not expose the Zone Append
>>>command natively. Period.
>>
>>Again, I am not saying the opposite. Read the 2 lines below...
>
>My point with the above paragraph was to clarify that we are not 
>trying to be difficult or opinionated, but point out that the reason 
>we give you the specific feedback, is that it is the way it is in the 
>kernel as today.

Again, yes, we will apply the feedback and come back with an approach
that fits, so that we can enable the raw ZNS block access that we are
after.

>
>>
>>>>
>>>>So we will go back, apply the feedback that we got and return with an
>>>>approach that better fits the ecosystem.
>>>>
>>>>>
>>>>>Another approach, is to use SPDK, and bypass the Linux kernel. This
>>>>>might even be an advantage, your customers does not have to 
>>>>wait on the
>>>>>Linux distribution being released with a long term release, 
>>>>before they
>>>>>can even get started and deploy in volume. I.e., they will 
>>>>actually get
>>>>>faster to market, and your company will be able to sell more drives.
>>>>
>>>>I think I will refrain from discussing our business strategy on 
>>>>an open mailing
>>>>list. Appreciate the feedback though. Very insightful.
>>>
>>>I am not asking for you to discuss your business strategy on the 
>>>mailing list. My comment was to give you genuinely advise that may 
>>>save a lot of work, and might even get better results.
>>>
>>>>
>>>>Thanks,
>>>>Javier
>>
Heiner Litz June 17, 2020, 11:44 p.m. UTC | #54
Thanks for the interesting discussion, but it made me wonder about the
usefulness of 4K writes in the first place. Append seems to be a
workaround for a problem (single writer/QD) that shouldn't exist in
the first place. If writes need to be sequential, what is the purpose
of allowing 4K writes at all (they provide no placement flexibility)?
Mandating zone-sized writes would address all of these problems with
ease and reduce the request rate and overhead in the kernel. I don't
see why we would disassemble a zone-sized block into smaller writes
just to re-assemble them again on the device. A promise of ZNS is to
move the translation overhead from the device into the FS layer, so
why re-introduce complexity in the bio layer? Managing zone-sized
blocks at the application/FS layer is also much more convenient than
receiving random 4K addresses back from append commands.
Finally, note that splitting zone-sized bios in the kernel serves no
purpose, as interleaving/scheduling within a zone isn't possible. If
we want to interleave accesses to multiple open zones, this should be
done at the device level by exposing queue(s) per zone.
For applications writing large, consecutive blocks (RocksDB), the best
implementation seems to be providing a kernel path that guarantees
non-splittable zone-sized writes.
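
For concreteness, a minimal userspace sketch of the zone-sized write argued
for above (device path, zone size, and alignment are made-up example values;
on current kernels this single call would still be split by the block layer
according to the queue limits discussed later in this thread):

#define _GNU_SOURCE		/* O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define ZONE_BYTES (256UL << 20)	/* hypothetical 256 MiB zone */

int main(void)
{
	void *buf;
	int fd = open("/dev/nvme0n1", O_WRONLY | O_DIRECT);

	if (fd < 0 || posix_memalign(&buf, 4096, ZONE_BYTES))
		return 1;
	memset(buf, 0, ZONE_BYTES);
	/* a single write covering exactly one zone, at the zone start */
	if (pwrite(fd, buf, ZONE_BYTES, 0) != (ssize_t)ZONE_BYTES)
		return 1;
	return 0;
}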

On Wed, Jun 17, 2020 at 12:40 PM Javier González <javier@javigon.com> wrote:
> [...]
Keith Busch June 18, 2020, 1:55 a.m. UTC | #55
On Wed, Jun 17, 2020 at 04:44:23PM -0700, Heiner Litz wrote:
> Mandating zone-sized writes would address all problems with ease and
> reduce request rate and overheads in the kernel.

Yikes, no. Typical zone sizes are much too large for that to be
reasonable.
Heiner Litz June 18, 2020, 4:24 a.m. UTC | #56
What is the purpose of making zones larger than the erase block size
of flash? And why are large writes fundamentally unreasonable?

I don't see why it should be a fundamental problem for e.g. RocksDB to
issue single zone-sized writes (whatever the zone size is, since
RocksDB needs to cope with it anyway). The write buffer exists as a
level in DRAM anyway, and increasing write latency will not matter
either.

On Wed, Jun 17, 2020 at 6:55 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Wed, Jun 17, 2020 at 04:44:23PM -0700, Heiner Litz wrote:
> > Mandating zone-sized writes would address all problems with ease and
> > reduce request rate and overheads in the kernel.
>
> Yikes, no. Typical zone sizes are much to large for that to be
> reasonable.
Damien Le Moal June 18, 2020, 5:15 a.m. UTC | #57
On 2020/06/18 13:24, Heiner Litz wrote:
> What is the purpose of making zones larger than the erase block size
> of flash? And why are large writes fundamentally unreasonable?

It is up to the drive vendor to decide how zones are mapped onto flash media.
Different mappings give different properties for different use cases. Zones, in
many cases, will be much larger than an erase block due to striping across many
dies, for example. And the erase block size also has a tendency to grow over
time with new media generations.
The block layer management of zoned block devices also applies to SMR HDDs,
which can have any zone size they want. This is not all about flash.

As for large writes, they may not be possible due to memory fragmentation
and/or the limited SGL size of the drive interface. E.g. AHCI maxes out at 168
segments; most HBAs are at best 256, etc.
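
To put numbers on this (a back-of-the-envelope example, assuming memory
fragmented down to single 4 KiB pages): 168 segments x 4 KiB is only 672 KiB
per command, while a typical SMR zone is 256 MiB, so a single zone-sized write
would be split into several hundred commands regardless of what the block
layer does.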

> I don't see why it should be a fundamental problem for e.g. RocksDB to
> issue single zone-sized writes (whatever the zone size is because
> RocksDB needs to cope with it). The write buffer exists as a level in
> DRAM anyways and increasing write latency will not matter either.

RocksDB is an application, so of course it is free to issue a single write()
call with a buffer size equal to the zone size. But due to the buffer mapping
limitations stated above, there is a very high probability that this single
zone-sized large write operation will end up being split into multiple write
commands in the kernel.

> 
> On Wed, Jun 17, 2020 at 6:55 PM Keith Busch <kbusch@kernel.org> wrote:
>>
>> On Wed, Jun 17, 2020 at 04:44:23PM -0700, Heiner Litz wrote:
>>> Mandating zone-sized writes would address all problems with ease and
>>> reduce request rate and overheads in the kernel.
>>
>> Yikes, no. Typical zone sizes are much to large for that to be
>> reasonable.
>
Heiner Litz June 18, 2020, 8:47 p.m. UTC | #58
Thanks Damien,
the striping explanation makes sense. In this case I will rephrase to: it
is sufficient to support large enough un-splittable writes to achieve
full per-zone bandwidth with a single writer/single QD.

My main point is: there is no fundamental reason for splitting up
requests in transit just to re-assemble them in the same form
later.

On Wed, Jun 17, 2020 at 10:15 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> [...]
Matias Bjørling June 18, 2020, 9:04 p.m. UTC | #59
> -----Original Message-----
> From: Heiner Litz <hlitz@ucsc.edu>
> Sent: Thursday, 18 June 2020 22.47
> 
> Thanks Damien,
> the striping explanation makes sense. In this case will rephase to: It is sufficient
> to support large enough un-splittable writes to achieve full per-zone bandwidth
> with a single writer/single QD.

Hi Heiner,

For ZNS in general, there is no performance information for a zone, other than what is communicated per namespace. I.e., in a well-developed ZNS drive, the host should not have to stripe zones to get the full performance of the drive. This is important and was one of the learnings from OCSSD. We saw that the main bottlenecks in OCSSD host software implementations were striping, host buffering, and vendor-specific hacks. For ZNS, I wanted to make sure that we did not make the same mistake, and as such, the complexity should be managed solely within the ZNS SSD.

If one does want to expose this kind of architecture, for whatever reason, one can make use of Endurance Groups in NVMe, and as such expose groups that are physically separated in the drive; the host can then stripe zones in separate endurance groups to get the necessary performance.

That being said, some vendors implemented ZNS SSDs as if they were OCSSDs, and as such, one has to stripe zones together to get the expected performance. For Linux, that is not something that'll be supported (other than if a device does it the appropriate way by using the standardized endurance groups). That said, adopters that run custom storage stacks can make use of it at the cost of having to manage the same challenges that OCSSD had, i.e., manually managing striping, host buffering, and even vendor-specific hacks.

> 
> My main point is: There is no fundamental reason for splitting up requests
> intermittently just to re-assemble them in the same form later.
> 
> [...]
Keith Busch June 18, 2020, 9:19 p.m. UTC | #60
On Thu, Jun 18, 2020 at 01:47:20PM -0700, Heiner Litz wrote:
> the striping explanation makes sense. In this case will rephase to: It
> is sufficient to support large enough un-splittable writes to achieve
> full per-zone bandwidth with a single writer/single QD.

This is subject to the capabilities of the device and software's memory
constraints. The maximum DMA size for a single request an nvme device can
handle often ranges anywhere from 64k to 4MB. The pci nvme driver maxes out at
4MB anyway because that's the most we can guarantee forward progress for right
now; otherwise the scatter lists become too big to ensure we'll be able to
allocate one to dispatch a write command.

We do report the size and the alignment constraints so that it won't get split,
but we still have to work with applications that don't abide by those
constraints.
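
For reference, these limits are exported per device through sysfs, so
applications can size and align their I/O accordingly (device name is an
example; attribute availability depends on the kernel version):

  /sys/block/nvme0n1/queue/max_segments           scatter/gather entries per command
  /sys/block/nvme0n1/queue/max_hw_sectors_kb      hardware DMA limit per command
  /sys/block/nvme0n1/queue/chunk_sectors          zone size; I/O is split at this boundary
  /sys/block/nvme0n1/queue/zone_append_max_bytes  largest zone append that won't be split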
 
> My main point is: There is no fundamental reason for splitting up
> requests intermittently just to re-assemble them in the same form
> later.
Heiner Litz June 18, 2020, 10:05 p.m. UTC | #61
Matias, Keith,
thanks, this all sounds good, and it makes total sense to hide striping
from the user.

In the end, the real problem seems to be that ZNS effectively requires
in-order IO delivery, which the kernel cannot guarantee. I think fixing
this problem in the ZNS specification instead of in the communication
substrate (the kernel) is problematic, especially as out-of-order
delivery has absolutely no benefit in the case of ZNS. But I guess this
has been discussed before.

On Thu, Jun 18, 2020 at 2:19 PM Keith Busch <kbusch@kernel.org> wrote:
>
> [...]
Damien Le Moal June 19, 2020, 12:57 a.m. UTC | #62
On 2020/06/19 7:05, Heiner Litz wrote:
> Matias, Keith,
> thanks, this all sounds good and it makes total sense to hide striping
> from the user.
> 
> In the end, the real problem really seems to be that ZNS effectively
> requires in-order IO delivery which the kernel cannot guarantee. I
> think fixing this problem in the ZNS specification instead of in the
> communication substrate (kernel) is problematic, especially as
> out-of-order delivery absolutely has no benefit in the case of ZNS.
> But I guess this has been discussed before..

From the device interface perspective, that is, from the ZNS specification's
point of view, only regular writes require in-order dispatching by the host.
Zone append write commands can be issued in any order and will succeed as long
as there are enough unwritten blocks in the target zone to fit the append
request. And the zone append command processing can happen in any order the
drive sees fit. So there is indeed no guarantee back to the host that zone
append command execution will be done in the same order as issued by the host.

That is from the interface perspective, for the protocol. Now the question that
I think you are after seems to be "does this work for the user"? The answer is
a simple "it depends on the use case". The device user is free to choose
between issuing regular writes or zone append writes. This choice heavily
depends on the answer to the question: "Can I tolerate out-of-order writes?"
For a file system, the answer is yes, since metadata is used to indicate the
mapping of file offsets to on-disk locations. It does not matter, functionally
speaking, if the file data blocks for increasing file offsets are out of order.
That can happen with any file system on any regular disk due to block
allocation/fragmentation today.

For an application using raw block device accesses without a file system, the
usability of zone append will heavily depend on the structure/format of the
data being written. A simple logging application where every write to a device
stores a single independent "record" will likely be fine with zone append. If
the application is writing something like a B-tree with dependencies between
data blocks pointing to each other, zone append may not be the best choice, as
the final location on disk of a write is only approximately known (i.e., one
can only guarantee that it will land "somewhere" in a zone). That, however,
depends on how the application issues IO requests.

Zone append is not a magic command solving all problems. But it certainly does
simplify a lot of things in the kernel IO stack (no need for strong ordering)
and can also simplify file system implementation (no need to control write
issuing order).
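
To make the append semantics above concrete, here is a minimal sketch (not
part of this patch series) of issuing a Zone Append through the NVMe passthru
ioctl and reading back where the drive placed the data. The opcode and
cdw10..12 layout follow TP4053; the device path, nsid, logical block size and
ZSLBA are made-up example values:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	void *buf;
	int fd = open("/dev/nvme0n1", O_RDWR);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);

	uint64_t zslba = 0x80000;		/* made-up zone start LBA */
	struct nvme_passthru_cmd64 cmd = {
		.opcode   = 0x7d,		/* Zone Append */
		.nsid     = 1,
		.addr     = (uintptr_t)buf,
		.data_len = 4096,		/* assumes 4K logical blocks */
		.cdw10    = zslba & 0xffffffff,	/* ZSLBA, low 32 bits */
		.cdw11    = zslba >> 32,	/* ZSLBA, high 32 bits */
		.cdw12    = 0,			/* NLB is 0-based: one block */
	};

	if (ioctl(fd, NVME_IOCTL_IO64_CMD, &cmd))
		return 1;
	/* CQE DW0/DW1 carry the LBA the drive actually assigned */
	printf("appended at LBA 0x%llx\n", (unsigned long long)cmd.result);
	return 0;
}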

> 
> On Thu, Jun 18, 2020 at 2:19 PM Keith Busch <kbusch@kernel.org> wrote:
>> [...]
Matias Bjørling June 19, 2020, 10:29 a.m. UTC | #63
> -----Original Message-----
> From: Heiner Litz <hlitz@ucsc.edu>
> Sent: Friday, 19 June 2020 00.05
> 
> Matias, Keith,
> thanks, this all sounds good and it makes total sense to hide striping from the
> user.
> 
> In the end, the real problem really seems to be that ZNS effectively requires in-
> order IO delivery which the kernel cannot guarantee. I think fixing this problem
> in the ZNS specification instead of in the communication substrate (kernel) is
> problematic, especially as out-of-order delivery absolutely has no benefit in the
> case of ZNS.
> But I guess this has been discussed before..

I'm a bit dense, but from the above, is your conclusion that ZNS has a deficit/feature which OCSSD didn't already have? They both have the same requirement that a chunk/zone must be written sequentially. That is the name of the game when deploying NAND-based media; I am not sure how ZNS could help with this. The goal of ZNS is to align with the media (and OCSSD), which requires writes to be sequential, and one thereby gets a bunch of benefits.

If there was an understanding that ZNS would allow one to write randomly, I must disappoint. For random writes, typical implementations either use a write-back scheme that stores data on random-write media first and then later writes it out sequentially, or implement a host-side FTL (with its usual overheads).
Heiner Litz June 19, 2020, 6:08 p.m. UTC | #64
Hi Matias,
no, I am rather saying that the Linux kernel has a deficit, or at least
is not a good fit for ZNS, because it cannot enforce in-order delivery.
The requirement of sequential writes basically imposes in-order
delivery. Append is essentially a Linux-specific fix at the ZNS level,
and enforcing ordering would be a cleaner way to enable QD>1.

On Fri, Jun 19, 2020 at 3:29 AM Matias Bjorling <Matias.Bjorling@wdc.com> wrote:
>
> [...]
Keith Busch June 19, 2020, 6:10 p.m. UTC | #65
On Fri, Jun 19, 2020 at 11:08:26AM -0700, Heiner Litz wrote:
> Hi Matias,
> no, I am rather saying that the Linux kernel has a deficit or at least
> is not a good fit for ZNS because it cannot enforce in-order delivery.

FYI, the nvme protocol can't even enforce in-order delivery, so calling
out Linux for this is a moot point.

> The requirement of sequential writes basically imposes this
> requirement. Append essentially a Linux specific fix on the ZNS level
> and that enforcing ordering would be a cleaner way to enable QD>1.
Heiner Litz June 19, 2020, 6:17 p.m. UTC | #66
> On Fri, Jun 19, 2020 at 11:08:26AM -0700, Heiner Litz wrote:
> > Hi Matias,
> > no, I am rather saying that the Linux kernel has a deficit or at least
> > is not a good fit for ZNS because it cannot enforce in-order delivery.
>
> FYI, the nvme protocol can't even enforce in-order delivery, so calling
> out linux for this is a moot point.

How does it work in SPDK then? I had understood that SPDK supported
QD>1 for ZNS devices.
I am not saying that Linux is the only problem. The fact remains that
out-of-order delivery is not a good fit for an interface that requires
sequential writes.

>
> > The requirement of sequential writes basically imposes this
> > requirement. Append essentially a Linux specific fix on the ZNS level
> > and that enforcing ordering would be a cleaner way to enable QD>1.
Matias Bjorling June 19, 2020, 6:18 p.m. UTC | #67
On 19/06/2020 20.08, Heiner Litz wrote:
> Hi Matias,
> no, I am rather saying that the Linux kernel has a deficit or at least
> is not a good fit for ZNS because it cannot enforce in-order delivery.
> The requirement of sequential writes basically imposes in-order
> delivery. Append is essentially a Linux-specific fix at the ZNS level,
> and enforcing ordering would be a cleaner way to enable QD>1.

Ah, I am not sure I agree with that statement. As Keith points out,
there is not even in-order delivery in NVMe. Any system where high
performance is required has out-of-order mechanisms that improve
parallelism and performance. If one wants to issue I/Os in order, it is
as easy as supplying a write-back cache. Linux, like any other system,
is able to do that.
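
As a sketch of what "supplying a write-back cache" boils down to
(hypothetical names, user-space pthreads for illustration only):
callers may submit in any order, but a single issuer drains each zone
strictly at its write pointer, i.e. QD=1 per zone at the device.

#include <pthread.h>
#include <stdint.h>

struct zone_stream {
	pthread_mutex_t lock;	/* single issuer per zone */
	uint64_t wp;		/* next LBA the zone will accept */
};

/* Stand-in for a real sequential write (e.g. an nvme_cmd_write). */
static int dev_write(uint64_t lba, const void *buf, uint32_t nlb)
{
	(void)lba; (void)buf; (void)nlb;
	return 0;
}

/* Callers may race; the lock guarantees each write lands at the wp. */
static int zone_stream_write(struct zone_stream *zs, const void *buf,
			     uint32_t nlb, uint64_t *lba_out)
{
	int ret;

	pthread_mutex_lock(&zs->lock);
	ret = dev_write(zs->wp, buf, nlb);
	if (!ret) {
		*lba_out = zs->wp;
		zs->wp += nlb;	/* advance only on success */
	}
	pthread_mutex_unlock(&zs->lock);
	return ret;
}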

> On Fri, Jun 19, 2020 at 3:29 AM Matias Bjorling <Matias.Bjorling@wdc.com> wrote:
>>> -----Original Message-----
>>> From: Heiner Litz <hlitz@ucsc.edu>
>>> Sent: Friday, 19 June 2020 00.05
>>> To: Keith Busch <kbusch@kernel.org>
>>> Cc: Damien Le Moal <Damien.LeMoal@wdc.com>; Javier González
>>> <javier@javigon.com>; Matias Bjørling <mb@lightnvm.io>; Matias Bjorling
>>> <Matias.Bjorling@wdc.com>; Christoph Hellwig <hch@lst.de>; Keith Busch
>>> <Keith.Busch@wdc.com>; linux-nvme@lists.infradead.org; linux-
>>> block@vger.kernel.org; Sagi Grimberg <sagi@grimberg.me>; Jens Axboe
>>> <axboe@kernel.dk>; Hans Holmberg <Hans.Holmberg@wdc.com>; Dmitry
>>> Fomichev <Dmitry.Fomichev@wdc.com>; Ajay Joshi <Ajay.Joshi@wdc.com>;
>>> Aravind Ramesh <Aravind.Ramesh@wdc.com>; Niklas Cassel
>>> <Niklas.Cassel@wdc.com>; Judy Brock <judy.brock@samsung.com>
>>> Subject: Re: [PATCH 5/5] nvme: support for zoned namespaces
>>>
>>> Matias, Keith,
>>> thanks, this all sounds good and it makes total sense to hide striping from the
>>> user.
>>>
>>> In the end, the real problem seems to be that ZNS effectively requires
>>> in-order IO delivery, which the kernel cannot guarantee. I think fixing
>>> this problem in the ZNS specification instead of in the communication
>>> substrate (the kernel) is problematic, especially as out-of-order
>>> delivery has absolutely no benefit in the case of ZNS.
>>> But I guess this has been discussed before...
>> I'm a bit dense; from the above, is your conclusion that ZNS has a deficit/feature that OCSSD didn't already have? They both had the same requirement that a chunk/zone must be written sequentially. That is the name of the game when deploying NAND-based media, and I am not sure how ZNS could help with this. The goal of ZNS is to align with the media (as OCSSD did), which requires writes to be sequential, and one thereby gets a bunch of benefits.
>>
>> If there was an understanding that ZNS would allow one to write randomly, I must disappoint. For random writes, typical implementations either use a write-back scheme that stores data in random-write media first and later writes it out sequentially, or implement a host-side FTL (with its usual overheads).
Keith Busch June 19, 2020, 6:22 p.m. UTC | #68
On Fri, Jun 19, 2020 at 11:17:02AM -0700, Heiner Litz wrote:
> > On Fri, Jun 19, 2020 at 11:08:26AM -0700, Heiner Litz wrote:
> > > Hi Matias,
> > > no, I am rather saying that the Linux kernel has a deficit or at least
> > > is not a good fit for ZNS because it cannot enforce in-order delivery.
> >
> > FYI, the nvme protocol can't even enforce in-order delivery, so calling
> > out linux for this is a moot point.
> 
> How does it work in SPDK then? I had understood that SPDK supported
> QD>1 for ZNS devices.
> I am not saying that Linux is the only problem. The fact remains that
> out-of-order delivery is not a good fit for an interface that requires
> sequential writes.

The nvme protocol is absolutely clear that multiple outstanding
commands can be executed in any order. This is made even harder if
you're dispatching these commands across multiple queues. If SPDK is
dispatching multiple commands and expecting them to execute in order,
then they're doing it wrong.

Further, you're not even guaranteed the first write in a sequence will
be successful. If you've already dispatched a subsequent write and the
first one fails, the second one may also fail because it's now at the
wrong write pointer.
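
As a toy model of the rule being described (an illustration, not driver
code; the status values mirror the ones this patch adds to
include/linux/nvme.h): a sequential zone accepts a write only at its
current write pointer, so a reordered write, or the successor of a
failed one, is rejected.

#include <stdint.h>

enum {
	ZNS_OK				= 0,
	ZNS_SC_ZONE_FULL		= 0x1b9,
	ZNS_SC_ZONE_INVALID_WRITE	= 0x1bc,
};

struct toy_zone {
	uint64_t zslba;	/* zone start LBA */
	uint64_t zcap;	/* writable capacity, in LBAs */
	uint64_t wp;	/* current write pointer */
};

static int toy_zone_write(struct toy_zone *z, uint64_t slba, uint32_t nlb)
{
	if (z->wp + nlb > z->zslba + z->zcap)
		return ZNS_SC_ZONE_FULL;
	if (slba != z->wp)	/* reordered, or predecessor failed */
		return ZNS_SC_ZONE_INVALID_WRITE;
	z->wp += nlb;		/* accept and advance */
	return ZNS_OK;
}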
Matias Bjorling June 19, 2020, 6:25 p.m. UTC | #69
On 19/06/2020 20.17, Heiner Litz wrote:
>> On Fri, Jun 19, 2020 at 11:08:26AM -0700, Heiner Litz wrote:
>>> Hi Matias,
>>> no, I am rather saying that the Linux kernel has a deficit or at least
>>> is not a good fit for ZNS because it cannot enforce in-order delivery.
>> FYI, the nvme protocol can't even enforce in-order delivery, so calling
>> out linux for this is a moot point.
> How does it work in SPDK then? I had understood that SPDK supported
> QD>1 for ZNS devices.
It doesn't. In-order delivery is not guaranteed in NVMe.
> I am not saying that Linux is the only problem. The fact remains that
> out-of-order delivery is not a good fit for an interface that requires
> sequential writes.

That's why zone append was introduced in ZNS. It removes this
constraint and makes it possible for any process (or host) to write to
a specific zone. It's neat!

It is not Linux-specific - it applies to everyone that wants to use it.
It solves a fundamental distributed-systems problem, as it removes the
need for fine-grained coordination between processes or hosts. It
allows the SSD to coordinate data placement, which historically has
been done by the host. It is awesome!
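
As a toy model of those semantics (again an illustration, with the same
hypothetical toy_zone shape as the sketch above): the host addresses
the zone by its start LBA, the device picks the placement at the write
pointer, and the assigned LBA comes back in the completion -- which is
what the nvme_complete_rq() hunk in the patch below copies into the
request for REQ_OP_ZONE_APPEND.

#include <stdint.h>

enum { ZNS_OK = 0, ZNS_SC_ZONE_FULL = 0x1b9 };

struct toy_zone {
	uint64_t zslba;	/* zone start LBA, the address appenders target */
	uint64_t zcap;	/* writable capacity, in LBAs */
	uint64_t wp;	/* current write pointer */
};

/* Concurrent appenders need no coordination: placement is the
 * device's problem, and each caller learns where its data landed. */
static int toy_zone_append(struct toy_zone *z, uint32_t nlb,
			   uint64_t *assigned_lba)
{
	if (z->wp + nlb > z->zslba + z->zcap)
		return ZNS_SC_ZONE_FULL;
	*assigned_lba = z->wp;	/* device-chosen LBA, returned in the CQE */
	z->wp += nlb;
	return ZNS_OK;
}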

>
>>> The requirement of sequential writes basically imposes in-order
>>> delivery. Append is essentially a Linux-specific fix at the ZNS level,
>>> and enforcing ordering would be a cleaner way to enable QD>1.
Heiner Litz June 19, 2020, 6:40 p.m. UTC | #70
That makes sense. We are generally paying a high price for
implementing in-order interfaces over out-of-order communication
channels (e.g. TCP buffers), and append() seems to be a much more
lightweight solution.

On Fri, Jun 19, 2020 at 11:25 AM Matias Bjørling <mb@lightnvm.io> wrote:
>
> On 19/06/2020 20.17, Heiner Litz wrote:
> >> On Fri, Jun 19, 2020 at 11:08:26AM -0700, Heiner Litz wrote:
> >>> Hi Matias,
> >>> no, I am rather saying that the Linux kernel has a deficit or at least
> >>> is not a good fit for ZNS because it cannot enforce in-order delivery.
> >> FYI, the nvme protocol can't even enforce in-order delivery, so calling
> >> out linux for this is a moot point.
> > How does it work in SPDK then? I had understood that SPDK supported
> > QD>1 for ZNS devices.
> It doesn't. In-order delivery is not guaranteed in NVMe.
> > I am not saying that Linux is the only problem. The fact remains that
> > out-of-order delivery is not a good fit for an interface that requires
> > sequential writes.
>
> That's why zone append was introduced in ZNS. It removes this
> constraint and makes it possible for any process (or host) to write to
> a specific zone. It's neat!
>
> It is not Linux-specific - it applies to everyone that wants to use it.
> It solves a fundamental distributed-systems problem, as it removes the
> need for fine-grained coordination between processes or hosts. It
> allows the SSD to coordinate data placement, which historically has
> been done by the host. It is awesome!
>
> >
> >>> The requirement of sequential writes basically imposes in-order
> >>> delivery. Append is essentially a Linux-specific fix at the ZNS level,
> >>> and enforcing ordering would be a cleaner way to enable QD>1.
>
>
Christoph Hellwig June 20, 2020, 6:33 a.m. UTC | #71
On Fri, Jun 19, 2020 at 11:08:26AM -0700, Heiner Litz wrote:
> Hi Matias,
> no, I am rather saying that the Linux kernel has a deficit or at least
> is not a good fit for ZNS because it cannot enforce in-order delivery.

Seriously, if you don't understand the basics of NVMe, can you please
stop spamming this list with your crap?
Heiner Litz June 20, 2020, 5:52 p.m. UTC | #72
I don't remember saying that I don't understand the basics of NVMe, so
I am not sure where you got this from.

That being said, the point I am trying to discuss is not about NVMe in
particular. It is the general question: what is the benefit of
splitting and reordering (at whatever layer) for a hardware device
that requires sequential writes? I claim that there is no benefit.

I have worked with SSDs in the past that exposed raw flash blocks over
NVMe and that achieved maximum write bandwidth without append by
enforcing splitting/ordering guarantees, so I know it is possible.

I will accept that there is no interest in discussing the question
above, so I'll stop here.

On Fri, Jun 19, 2020 at 11:33 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Jun 19, 2020 at 11:08:26AM -0700, Heiner Litz wrote:
> > Hi Matias,
> > no, I am rather saying that the Linux kernel has a deficit or at least
> > is not a good fit for ZNS because it cannot enforce in-order delivery.
>
> Seriously, if you don't understand the basics of NVMe, can you please
> stop spamming this list with your crap?
Christoph Hellwig June 22, 2020, 2:01 p.m. UTC | #73
On Sat, Jun 20, 2020 at 10:52:21AM -0700, Heiner Litz wrote:
> I don't remember saying that I don't understand the basics of NVMe, so
> I am not sure where you got this from.

You didn't say that.  But from your incoherent comments on the list it
is completely obvious.
Luis Chamberlain March 2, 2022, 9:11 p.m. UTC | #74
On Wed, Jun 17, 2020 at 09:23:05PM +0200, Matias Bjørling wrote:
> On 17/06/2020 21.09, Javier González wrote:
> > As you are well aware, there are some cases where append introduces
> > challenges. This is well documented in the literature around nameless
> > writes.
> 
> The nameless writes idea is vastly different from Zone Append, and Zone
> Append has few of the drawbacks of nameless writes, which makes the
> well-documented literature not apply.

Sorry for joining late to the party!

Just curious: do we have any public analysis of the differences between
zone append and nameless writes?

If we don't, then this would seem to be new territory? I'd expect many
drawbacks are not known yet; with new filesystem APIs these things
always creep up. Not that there couldn't be unexpected issues for
nameless writes too, if and when someone tried to add support for them
in a real OS. But if nameless writes are pretty much append, it gives
me a good idea of where to expect issues ahead of time.

Thanks!

  Luis
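
For what it's worth, the contrast under discussion as a toy sketch
(semantics for nameless writes assumed from the de-indirection
literature, not from any shipping device; all names hypothetical): a
nameless write returns a device-chosen physical address anywhere on
the device and obliges the host to handle migration upcalls when the
device later moves the data, while zone append confines the device's
choice to one zone and returns an LBA that never moves.

#include <stdint.h>

/* Nameless write: the device picks any physical address; the host
 * must record it and service migration upcalls when data is moved
 * (GC, wear leveling). */
struct nameless_dev {
	uint64_t next_ppa;
	void (*migrated)(uint64_t old_ppa, uint64_t new_ppa);	/* upcall */
};

static uint64_t nameless_write(struct nameless_dev *d, uint32_t nlb)
{
	uint64_t ppa = d->next_ppa;

	d->next_ppa += nlb;
	return ppa;	/* host must persist this physical address */
}

/* Zone append: the device picks the LBA, but only within the target
 * zone, and the LBA is stable -- no migration upcall is ever needed. */
struct zns_zone {
	uint64_t zslba;
	uint64_t wp;
};

static uint64_t zone_append(struct zns_zone *z, uint32_t nlb)
{
	uint64_t lba = z->wp;	/* in [zslba, zslba + zcap) */

	z->wp += nlb;
	return lba;	/* stable LBA, returned in the CQE */
}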
diff mbox series

Patch

diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index fc7b26be692d..d7f6a87687b8 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -13,6 +13,7 @@  nvme-core-y				:= core.o
 nvme-core-$(CONFIG_TRACING)		+= trace.o
 nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_NVM)			+= lightnvm.o
+nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
 nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 58f137b9f2c5..e961910da4ac 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,7 +89,7 @@  static dev_t nvme_chr_devt;
 static struct class *nvme_class;
 static struct class *nvme_subsys_class;
 
-static int nvme_revalidate_disk(struct gendisk *disk);
+static int _nvme_revalidate_disk(struct gendisk *disk);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 					   unsigned nsid);
@@ -287,6 +287,10 @@  void nvme_complete_rq(struct request *req)
 			nvme_retry_req(req);
 			return;
 		}
+	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+		   req_op(req) == REQ_OP_ZONE_APPEND) {
+		req->__sector = nvme_lba_to_sect(req->q->queuedata,
+			le64_to_cpu(nvme_req(req)->result.u64));
 	}
 
 	nvme_trace_bio_complete(req, status);
@@ -673,7 +677,8 @@  static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 }
 
 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
-		struct request *req, struct nvme_command *cmnd)
+		struct request *req, struct nvme_command *cmnd,
+		enum nvme_opcode op)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	u16 control = 0;
@@ -687,7 +692,7 @@  static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
-	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+	cmnd->rw.opcode = op;
 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
 	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -716,6 +721,8 @@  static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 		case NVME_NS_DPS_PI_TYPE2:
 			control |= NVME_RW_PRINFO_PRCHK_GUARD |
 					NVME_RW_PRINFO_PRCHK_REF;
+			if (op == nvme_cmd_zone_append)
+				control |= NVME_RW_APPEND_PIREMAP;
 			cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
 			break;
 		}
@@ -756,6 +763,19 @@  blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 	case REQ_OP_FLUSH:
 		nvme_setup_flush(ns, cmd);
 		break;
+	case REQ_OP_ZONE_RESET_ALL:
+	case REQ_OP_ZONE_RESET:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
+		break;
+	case REQ_OP_ZONE_OPEN:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
+		break;
+	case REQ_OP_ZONE_FINISH:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
+		break;
 	case REQ_OP_WRITE_ZEROES:
 		ret = nvme_setup_write_zeroes(ns, req, cmd);
 		break;
@@ -763,8 +783,13 @@  blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 		ret = nvme_setup_discard(ns, req, cmd);
 		break;
 	case REQ_OP_READ:
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
+		break;
 	case REQ_OP_WRITE:
-		ret = nvme_setup_rw(ns, req, cmd);
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+		break;
+	case REQ_OP_ZONE_APPEND:
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1392,14 +1417,23 @@  static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	return effects;
 }
 
-static void nvme_update_formats(struct nvme_ctrl *ctrl)
+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
 {
 	struct nvme_ns *ns;
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		if (ns->disk && nvme_revalidate_disk(ns->disk))
+		if (ns->disk && _nvme_revalidate_disk(ns->disk))
 			nvme_set_queue_dying(ns);
+		else if (blk_queue_is_zoned(ns->disk->queue)) {
+			/*
+			 * IO commands are required to fully revalidate a zoned
+			 * device. Force the command effects to trigger rescan
+			 * work so report zones can run in a context with
+			 * unfrozen IO queues.
+			 */
+			*effects |= NVME_CMD_EFFECTS_NCC;
+		}
 	up_read(&ctrl->namespaces_rwsem);
 }
 
@@ -1411,7 +1445,7 @@  static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
 	 * this command.
 	 */
 	if (effects & NVME_CMD_EFFECTS_LBCC)
-		nvme_update_formats(ctrl);
+		nvme_update_formats(ctrl, &effects);
 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
 		nvme_unfreeze(ctrl);
 		nvme_mpath_unfreeze(ctrl->subsys);
@@ -1526,7 +1560,7 @@  static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
  * Issue ioctl requests on the first available path.  Note that unlike normal
  * block layer requests we will not retry failed request on another controller.
  */
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
 		struct nvme_ns_head **head, int *srcu_idx)
 {
 #ifdef CONFIG_NVME_MULTIPATH
@@ -1546,7 +1580,7 @@  static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
 	return disk->private_data;
 }
 
-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
 {
 	if (head)
 		srcu_read_unlock(&head->srcu, idx);
@@ -1939,21 +1973,28 @@  static void nvme_update_disk_info(struct gendisk *disk,
 
 static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 {
+	unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_ctrl *ctrl = ns->ctrl;
+	int ret;
 	u32 iob;
 
 	/*
 	 * If identify namespace failed, use default 512 byte block size so
 	 * block layer can use before failing read/write for 0 capacity.
 	 */
-	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
+	ns->lba_shift = id->lbaf[lbaf].ds;
 	if (ns->lba_shift == 0)
 		ns->lba_shift = 9;
 
 	switch (ns->head->ids.csi) {
 	case NVME_CSI_NVM:
 		break;
+	case NVME_CSI_ZNS:
+		ret = nvme_update_zone_info(disk, ns, lbaf);
+		if (ret)
+			return ret;
+		break;
 	default:
 		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
 			ns->head->ids.csi, ns->head->ns_id);
@@ -1967,7 +2008,7 @@  static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
 
 	ns->features = 0;
-	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
 	/* the PI implementation requires metadata equal t10 pi tuple size */
 	if (ns->ms == sizeof(struct t10_pi_tuple))
 		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
@@ -2010,7 +2051,7 @@  static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	return 0;
 }
 
-static int nvme_revalidate_disk(struct gendisk *disk)
+static int _nvme_revalidate_disk(struct gendisk *disk)
 {
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_ctrl *ctrl = ns->ctrl;
@@ -2058,6 +2099,28 @@  static int nvme_revalidate_disk(struct gendisk *disk)
 	return ret;
 }
 
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+	int ret;
+
+	ret = _nvme_revalidate_disk(disk);
+	if (ret)
+		return ret;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+	if (blk_queue_is_zoned(disk->queue)) {
+		struct nvme_ns *ns = disk->private_data;
+		struct nvme_ctrl *ctrl = ns->ctrl;
+
+		ret = blk_revalidate_disk_zones(disk, NULL);
+		if (!ret)
+			blk_queue_max_zone_append_sectors(disk->queue,
+							  ctrl->max_zone_append);
+	}
+#endif
+	return ret;
+}
+
 static char nvme_pr_type(enum pr_type type)
 {
 	switch (type) {
@@ -2188,6 +2251,7 @@  static const struct block_device_operations nvme_fops = {
 	.release	= nvme_release,
 	.getgeo		= nvme_getgeo,
 	.revalidate_disk= nvme_revalidate_disk,
+	.report_zones	= nvme_report_zones,
 	.pr_ops		= &nvme_pr_ops,
 };
 
@@ -2213,6 +2277,7 @@  const struct block_device_operations nvme_ns_head_ops = {
 	.ioctl		= nvme_ioctl,
 	.compat_ioctl	= nvme_compat_ioctl,
 	.getgeo		= nvme_getgeo,
+	.report_zones	= nvme_report_zones,
 	.pr_ops		= &nvme_pr_ops,
 };
 #endif /* CONFIG_NVME_MULTIPATH */
@@ -4439,6 +4504,8 @@  static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 58428e3a590e..662f95fbd909 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -239,6 +239,9 @@  struct nvme_ctrl {
 	u32 max_hw_sectors;
 	u32 max_segments;
 	u32 max_integrity_segments;
+#ifdef CONFIG_BLK_DEV_ZONED
+	u32 max_zone_append;
+#endif
 	u16 crdt[3];
 	u16 oncs;
 	u16 oacs;
@@ -403,6 +406,9 @@  struct nvme_ns {
 	u16 sgs;
 	u32 sws;
 	u8 pi_type;
+#ifdef CONFIG_BLK_DEV_ZONED
+	u64 zsze;
+#endif
 	unsigned long features;
 	unsigned long flags;
 #define NVME_NS_REMOVING	0
@@ -568,6 +574,9 @@  int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
 
 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
 		void *log, size_t size, u64 offset);
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+		struct nvme_ns_head **head, int *srcu_idx);
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
 
 extern const struct attribute_group *nvme_ns_id_attr_groups[];
 extern const struct block_device_operations nvme_ns_head_ops;
@@ -689,6 +698,36 @@  static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 }
 #endif /* CONFIG_NVME_MULTIPATH */
 
+#ifdef CONFIG_BLK_DEV_ZONED
+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
+			  unsigned lbaf);
+
+int nvme_report_zones(struct gendisk *disk, sector_t sector,
+		      unsigned int nr_zones, report_zones_cb cb, void *data);
+
+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
+				       struct nvme_command *cmnd,
+				       enum nvme_zone_mgmt_action action);
+#else
+#define nvme_report_zones NULL
+
+static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
+		struct request *req, struct nvme_command *cmnd,
+		enum nvme_zone_mgmt_action action)
+{
+	return BLK_STS_NOTSUPP;
+}
+
+static inline int nvme_update_zone_info(struct gendisk *disk,
+					struct nvme_ns *ns,
+					unsigned lbaf)
+{
+	dev_warn(ns->ctrl->device,
+		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
+	return -EPROTONOSUPPORT;
+}
+#endif
+
 #ifdef CONFIG_NVM
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
 void nvme_nvm_unregister(struct nvme_ns *ns);
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
new file mode 100644
index 000000000000..c08f6281b614
--- /dev/null
+++ b/drivers/nvme/host/zns.c
@@ -0,0 +1,238 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include "nvme.h"
+
+static int nvme_set_max_append(struct nvme_ctrl *ctrl)
+{
+	struct nvme_command c = { };
+	struct nvme_id_ctrl_zns *id;
+	int status;
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id)
+		return -ENOMEM;
+
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = NVME_ID_CNS_CS_CTRL;
+	c.identify.csi = NVME_CSI_ZNS;
+
+	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+	if (status) {
+		kfree(id);
+		return status;
+	}
+
+	ctrl->max_zone_append = 1 << (id->zamds + 3);
+	kfree(id);
+	return 0;
+}
+
+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
+			  unsigned lbaf)
+{
+	struct nvme_effects_log *log = ns->head->effects;
+	struct request_queue *q = disk->queue;
+	struct nvme_command c = { };
+	struct nvme_id_ns_zns *id;
+	int status;
+
+	/* Driver requires zone append support */
+	if (!(log->iocs[nvme_cmd_zone_append] & NVME_CMD_EFFECTS_CSUPP))
+		return -ENODEV;
+
+	/* Lazily query controller append limit for the first zoned namespace */
+	if (!ns->ctrl->max_zone_append) {
+		status = nvme_set_max_append(ns->ctrl);
+		if (status)
+			return status;
+	}
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id)
+		return -ENOMEM;
+
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(ns->head->ns_id);
+	c.identify.cns = NVME_ID_CNS_CS_NS;
+	c.identify.csi = NVME_CSI_ZNS;
+
+	status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
+	if (status)
+		goto free_data;
+
+	/*
+	 * We currently do not handle devices requiring any of the zoned
+	 * operation characteristics.
+	 */
+	if (id->zoc) {
+		status = -EINVAL;
+		goto free_data;
+	}
+
+	ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
+	if (!ns->zsze) {
+		status = -EINVAL;
+		goto free_data;
+	}
+
+	q->limits.zoned = BLK_ZONED_HM;
+	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
+free_data:
+	kfree(id);
+	return status;
+}
+
+static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
+					  unsigned int nr_zones, size_t *buflen)
+{
+	struct request_queue *q = ns->disk->queue;
+	size_t bufsize;
+	void *buf;
+
+	const size_t min_bufsize = sizeof(struct nvme_zone_report) +
+				   sizeof(struct nvme_zone_descriptor);
+
+	nr_zones = min_t(unsigned int, nr_zones,
+			 get_capacity(ns->disk) >> ilog2(ns->zsze));
+
+	bufsize = sizeof(struct nvme_zone_report) +
+		nr_zones * sizeof(struct nvme_zone_descriptor);
+	bufsize = min_t(size_t, bufsize,
+			queue_max_hw_sectors(q) << SECTOR_SHIFT);
+	bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+
+	while (bufsize >= min_bufsize) {
+		buf = __vmalloc(bufsize,
+				GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
+		if (buf) {
+			*buflen = bufsize;
+			return buf;
+		}
+		bufsize >>= 1;
+	}
+	return NULL;
+}
+
+static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
+				  struct nvme_zone_report *report,
+				  size_t buflen)
+{
+	struct nvme_command c = { };
+	int ret;
+
+	c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
+	c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
+	c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
+	c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
+	c.zmr.zra = NVME_ZRA_ZONE_REPORT;
+	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
+	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
+
+	ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
+	if (ret)
+		return ret;
+
+	return le64_to_cpu(report->nr_zones);
+}
+
+static int nvme_zone_parse_entry(struct nvme_ns *ns,
+				 struct nvme_zone_descriptor *entry,
+				 unsigned int idx, report_zones_cb cb,
+				 void *data)
+{
+	struct blk_zone zone = { };
+
+	if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
+		dev_err(ns->ctrl->device, "invalid zone type %#x\n",
+				entry->zt);
+		return -EINVAL;
+	}
+
+	zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+	zone.cond = entry->zs >> 4;
+	zone.len = ns->zsze;
+	zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
+	zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
+	zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
+
+	return cb(&zone, idx, data);
+}
+
+static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
+			unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct nvme_zone_report *report;
+	int ret, zone_idx = 0;
+	unsigned int nz, i;
+	size_t buflen;
+
+	report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
+	if (!report)
+		return -ENOMEM;
+
+	sector &= ~(ns->zsze - 1);
+	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
+		memset(report, 0, buflen);
+		ret = __nvme_ns_report_zones(ns, sector, report, buflen);
+		if (ret < 0)
+			goto out_free;
+
+		nz = min_t(unsigned int, ret, nr_zones);
+		if (!nz)
+			break;
+
+		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
+			ret = nvme_zone_parse_entry(ns, &report->entries[i],
+						    zone_idx, cb, data);
+			if (ret)
+				goto out_free;
+			zone_idx++;
+		}
+
+		sector += ns->zsze * nz;
+	}
+
+	ret = zone_idx;
+out_free:
+	kvfree(report);
+	return ret;
+}
+
+int nvme_report_zones(struct gendisk *disk, sector_t sector,
+		      unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct nvme_ns_head *head = NULL;
+	struct nvme_ns *ns;
+	int srcu_idx, ret;
+
+	ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
+	if (unlikely(!ns))
+		return -EWOULDBLOCK;
+
+	if (ns->head->ids.csi == NVME_CSI_ZNS)
+		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
+	else
+		ret = -EINVAL;
+	nvme_put_ns_from_disk(head, srcu_idx);
+
+	return ret;
+}
+
+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
+		struct nvme_command *c, enum nvme_zone_mgmt_action action)
+{
+	c->zms.opcode = nvme_cmd_zone_mgmt_send;
+	c->zms.nsid = cpu_to_le32(ns->head->ns_id);
+	c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+	c->zms.action = action;
+
+	if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
+		c->zms.select = 1;
+
+	return BLK_STS_OK;
+}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index ea25da572eed..7b3fa7de07bd 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -374,6 +374,30 @@  struct nvme_id_ns {
 	__u8			vs[3712];
 };
 
+struct nvme_zns_lbafe {
+	__le64			zsze;
+	__u8			zdes;
+	__u8			rsvd9[7];
+};
+
+struct nvme_id_ns_zns {
+	__le16			zoc;
+	__le16			ozcs;
+	__le32			mar;
+	__le32			mor;
+	__le32			rrl;
+	__le32			frl;
+	__u8			rsvd20[2796];
+	struct nvme_zns_lbafe	lbafe[16];
+	__u8			rsvd3072[768];
+	__u8			vs[256];
+};
+
+struct nvme_id_ctrl_zns {
+	__u8	zamds;
+	__u8	rsvd1[4095];
+};
+
 enum {
 	NVME_ID_CNS_NS			= 0x00,
 	NVME_ID_CNS_CTRL		= 0x01,
@@ -392,6 +416,7 @@  enum {
 
 enum {
 	NVME_CSI_NVM			= 0,
+	NVME_CSI_ZNS			= 2,
 };
 
 enum {
@@ -532,6 +557,27 @@  struct nvme_ana_rsp_hdr {
 	__le16	rsvd10[3];
 };
 
+struct nvme_zone_descriptor {
+	__u8		zt;
+	__u8		zs;
+	__u8		za;
+	__u8		rsvd3[5];
+	__le64		zcap;
+	__le64		zslba;
+	__le64		wp;
+	__u8		rsvd32[32];
+};
+
+enum {
+	NVME_ZONE_TYPE_SEQWRITE_REQ	= 0x2,
+};
+
+struct nvme_zone_report {
+	__le64		nr_zones;
+	__u8		resv8[56];
+	struct nvme_zone_descriptor entries[];
+};
+
 enum {
 	NVME_SMART_CRIT_SPARE		= 1 << 0,
 	NVME_SMART_CRIT_TEMPERATURE	= 1 << 1,
@@ -626,6 +672,9 @@  enum nvme_opcode {
 	nvme_cmd_resv_report	= 0x0e,
 	nvme_cmd_resv_acquire	= 0x11,
 	nvme_cmd_resv_release	= 0x15,
+	nvme_cmd_zone_mgmt_send	= 0x79,
+	nvme_cmd_zone_mgmt_recv	= 0x7a,
+	nvme_cmd_zone_append	= 0x7d,
 };
 
 #define nvme_opcode_name(opcode)	{ opcode, #opcode }
@@ -764,6 +813,7 @@  struct nvme_rw_command {
 enum {
 	NVME_RW_LR			= 1 << 15,
 	NVME_RW_FUA			= 1 << 14,
+	NVME_RW_APPEND_PIREMAP		= 1 << 9,
 	NVME_RW_DSM_FREQ_UNSPEC		= 0,
 	NVME_RW_DSM_FREQ_TYPICAL	= 1,
 	NVME_RW_DSM_FREQ_RARE		= 2,
@@ -829,6 +879,53 @@  struct nvme_write_zeroes_cmd {
 	__le16			appmask;
 };
 
+enum nvme_zone_mgmt_action {
+	NVME_ZONE_CLOSE		= 0x1,
+	NVME_ZONE_FINISH	= 0x2,
+	NVME_ZONE_OPEN		= 0x3,
+	NVME_ZONE_RESET		= 0x4,
+	NVME_ZONE_OFFLINE	= 0x5,
+	NVME_ZONE_SET_DESC_EXT	= 0x10,
+};
+
+struct nvme_zone_mgmt_send_cmd {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__le32			cdw2[2];
+	__le64			metadata;
+	union nvme_data_ptr	dptr;
+	__le64			slba;
+	__le32			cdw12;
+	__u8			action;
+	__u8			select;
+	__u8			rsvd13[2];
+	__le32			cdw14[2];
+};
+
+struct nvme_zone_mgmt_recv_cmd {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__le64			rsvd2[2];
+	union nvme_data_ptr	dptr;
+	__le64			slba;
+	__le32			numd;
+	__u8			zra;
+	__u8			zrasf;
+	__u8			pr;
+	__u8			rsvd13;
+	__le32			cdw14[2];
+};
+
+enum {
+	NVME_ZRA_ZONE_REPORT		= 0,
+	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
+	NVME_REPORT_ZONE_PARTIAL	= 1,
+};
+
 /* Features */
 
 enum {
@@ -1300,6 +1397,8 @@  struct nvme_command {
 		struct nvme_format_cmd format;
 		struct nvme_dsm_cmd dsm;
 		struct nvme_write_zeroes_cmd write_zeroes;
+		struct nvme_zone_mgmt_send_cmd zms;
+		struct nvme_zone_mgmt_recv_cmd zmr;
 		struct nvme_abort_cmd abort;
 		struct nvme_get_log_page_command get_log_page;
 		struct nvmf_common_command fabrics;
@@ -1433,6 +1532,18 @@  enum {
 	NVME_SC_DISCOVERY_RESTART	= 0x190,
 	NVME_SC_AUTH_REQUIRED		= 0x191,
 
+	/*
+	 * I/O Command Set Specific - Zoned commands:
+	 */
+	NVME_SC_ZONE_BOUNDARY_ERROR	= 0x1b8,
+	NVME_SC_ZONE_FULL		= 0x1b9,
+	NVME_SC_ZONE_READ_ONLY		= 0x1ba,
+	NVME_SC_ZONE_OFFLINE		= 0x1bb,
+	NVME_SC_ZONE_INVALID_WRITE	= 0x1bc,
+	NVME_SC_ZONE_TOO_MANY_ACTIVE	= 0x1bd,
+	NVME_SC_ZONE_TOO_MANY_OPEN	= 0x1be,
+	NVME_SC_ZONE_INVALID_TRANSITION	= 0x1bf,
+
 	/*
 	 * Media and Data Integrity Errors:
 	 */