
[V2,2/9] nvmet: add ZNS support for bdev-ns

Message ID 20201130032909.40638-3-chaitanya.kulkarni@wdc.com (mailing list archive)
State New, archived
Series nvmet: add genblk ZBD backend

Commit Message

Chaitanya Kulkarni Nov. 30, 2020, 3:29 a.m. UTC
Add zns-bdev-config, id-ctrl, id-ns, zns-cmd-effects, zone-mgmt-send,
zone-mgmt-recv and zone-append handlers for NVMeOF target to enable ZNS
support for bdev.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
---
 drivers/nvme/target/Makefile      |   2 +-
 drivers/nvme/target/admin-cmd.c   |   4 +-
 drivers/nvme/target/io-cmd-file.c |   2 +-
 drivers/nvme/target/nvmet.h       |  19 ++
 drivers/nvme/target/zns.c         | 463 ++++++++++++++++++++++++++++++
 5 files changed, 486 insertions(+), 4 deletions(-)
 create mode 100644 drivers/nvme/target/zns.c

Comments

Damien Le Moal Nov. 30, 2020, 11:57 a.m. UTC | #1
On 2020/11/30 12:29, Chaitanya Kulkarni wrote:
> Add zns-bdev-config, id-ctrl, id-ns, zns-cmd-effects, zone-mgmt-send,
> zone-mgmt-recv and zone-append handlers for NVMeOF target to enable ZNS
> support for bdev.
> 
> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
> ---
>  drivers/nvme/target/Makefile      |   2 +-
>  drivers/nvme/target/admin-cmd.c   |   4 +-
>  drivers/nvme/target/io-cmd-file.c |   2 +-
>  drivers/nvme/target/nvmet.h       |  19 ++
>  drivers/nvme/target/zns.c         | 463 ++++++++++++++++++++++++++++++
>  5 files changed, 486 insertions(+), 4 deletions(-)
>  create mode 100644 drivers/nvme/target/zns.c
> 
> diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
> index ebf91fc4c72e..d050f829b43a 100644
> --- a/drivers/nvme/target/Makefile
> +++ b/drivers/nvme/target/Makefile
> @@ -10,7 +10,7 @@ obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
>  obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
>  
>  nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
> -			discovery.o io-cmd-file.o io-cmd-bdev.o
> +		   zns.o discovery.o io-cmd-file.o io-cmd-bdev.o
>  nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
>  nvme-loop-y	+= loop.o
>  nvmet-rdma-y	+= rdma.o
> diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
> index dca34489a1dc..509fd8dcca0c 100644
> --- a/drivers/nvme/target/admin-cmd.c
> +++ b/drivers/nvme/target/admin-cmd.c
> @@ -579,8 +579,8 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
>  	nvmet_req_complete(req, status);
>  }
>  
> -static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
> -				    void *id, off_t *off)
> +u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
> +			     void *id, off_t *off)
>  {
>  	struct nvme_ns_id_desc desc = {
>  		.nidt = type,
> diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
> index 0abbefd9925e..2bd10960fa50 100644
> --- a/drivers/nvme/target/io-cmd-file.c
> +++ b/drivers/nvme/target/io-cmd-file.c
> @@ -89,7 +89,7 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns)
>  	return ret;
>  }
>  
> -static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
> +void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
>  {
>  	bv->bv_page = sg_page(sg);
>  	bv->bv_offset = sg->offset;
> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
> index 592763732065..eee7866ae512 100644
> --- a/drivers/nvme/target/nvmet.h
> +++ b/drivers/nvme/target/nvmet.h
> @@ -81,6 +81,10 @@ struct nvmet_ns {
>  	struct pci_dev		*p2p_dev;
>  	int			pi_type;
>  	int			metadata_size;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	struct nvme_id_ns_zns	id_zns;
> +	unsigned int		zasl;
> +#endif
>  };
>  
>  static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
> @@ -251,6 +255,10 @@ struct nvmet_subsys {
>  	unsigned int		admin_timeout;
>  	unsigned int		io_timeout;
>  #endif /* CONFIG_NVME_TARGET_PASSTHRU */
> +
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	struct nvme_id_ctrl_zns	id_ctrl_zns;
> +#endif
>  };
>  
>  static inline struct nvmet_subsys *to_subsys(struct config_item *item)
> @@ -603,4 +611,15 @@ static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns)
>  	return ns->pi_type && ns->metadata_size == sizeof(struct t10_pi_tuple);
>  }
>  
> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req);
> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req);
> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off);
> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns);
> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req);
> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req);
> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req);
> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log);
> +u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
> +			     void *id, off_t *off);
> +void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg);
>  #endif /* _NVMET_H */
> diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
> new file mode 100644
> index 000000000000..40dedfd51fd6
> --- /dev/null
> +++ b/drivers/nvme/target/zns.c
> @@ -0,0 +1,463 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * NVMe ZNS-ZBD command implementation.
> + * Copyright (c) 2020-2021 HGST, a Western Digital Company.
> + */
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/uio.h>
> +#include <linux/nvme.h>
> +#include <linux/xarray.h>
> +#include <linux/blkdev.h>
> +#include <linux/module.h>
> +#include "nvmet.h"
> +
> +#ifdef CONFIG_BLK_DEV_ZONED
> +#define NVMET_MPSMIN_SHIFT	12
> +
> +static u16 nvmet_bdev_zns_checks(struct nvmet_req *req)
> +{
> +	u16 status = 0;
> +
> +	if (!bdev_is_zoned(req->ns->bdev)) {
> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
> +		goto out;
> +	}
> +
> +	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
> +		status = NVME_SC_INVALID_FIELD;
> +		goto out;
> +	}
> +
> +	if (req->cmd->zmr.zrasf != NVME_ZRASF_ZONE_REPORT_ALL) {
> +		status = NVME_SC_INVALID_FIELD;
> +		goto out;
> +	}
> +
> +	if (req->cmd->zmr.pr != NVME_REPORT_ZONE_PARTIAL)
> +		status = NVME_SC_INVALID_FIELD;
> +out:
> +	return status;
> +}
> +
> +static inline struct block_device *nvmet_bdev(struct nvmet_req *req)
> +{
> +	return req->ns->bdev;
> +}
> +
> +static inline  u64 nvmet_zones_to_desc_size(unsigned int nr_zones)
> +{
> +	return sizeof(struct nvme_zone_report) +
> +		(sizeof(struct nvme_zone_descriptor) * nr_zones);
> +}
> +
> +static inline u64 nvmet_sect_to_lba(struct nvmet_ns *ns, sector_t sect)
> +{
> +	return sect >> (ns->blksize_shift - SECTOR_SHIFT);
> +}
> +
> +static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba)
> +{
> +	return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT);
> +}
> +
> +/*
> + *  ZNS related command implementation and helpers.
> + */
> +
> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
> +{
> +	u16 nvme_cis_zns = NVME_CSI_ZNS;
> +
> +	if (!bdev_is_zoned(nvmet_bdev(req)))
> +		return NVME_SC_SUCCESS;
> +
> +	return nvmet_copy_ns_identifier(req, NVME_NIDT_CSI, NVME_NIDT_CSI_LEN,
> +					&nvme_cis_zns, off);
> +}
> +
> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
> +{
> +	log->iocs[nvme_cmd_zone_append]		= cpu_to_le32(1 << 0);
> +	log->iocs[nvme_cmd_zone_mgmt_send]	= cpu_to_le32(1 << 0);
> +	log->iocs[nvme_cmd_zone_mgmt_recv]	= cpu_to_le32(1 << 0);
> +}
> +
> +static int nvmet_bdev_validate_zns_zones_cb(struct blk_zone *z,
> +					    unsigned int idx, void *data)
> +{
> +	struct blk_zone *zone = data;
> +
> +	memcpy(zone, z, sizeof(struct blk_zone));
> +
> +	return 0;
> +}
> +
> +static inline bool nvmet_bdev_validate_zns_zones(struct nvmet_ns *ns)
> +{
> +	sector_t last_sect = get_capacity(ns->bdev->bd_disk) - 1;
> +	struct blk_zone last_zone, first_zone;
> +	int reported_zones;
> +
> +	reported_zones = blkdev_report_zones(ns->bdev, 0, 1,
> +					     nvmet_bdev_validate_zns_zones_cb,
> +					     &first_zone);
> +	if (reported_zones != 1)
> +		return false;
> +
> +	reported_zones = blkdev_report_zones(ns->bdev, last_sect, 1,
> +					     nvmet_bdev_validate_zns_zones_cb,
> +					     &last_zone);
> +	if (reported_zones != 1)
> +		return false;
> +
> +	return first_zone.capacity == last_zone.capacity ? true : false;
> +}

That is really heavy handed... Why not just simply:

return !(get_capacity(ns->bdev->bd_disk) & (bdev_zone_sectors(ns->bdev) - 1));

That does the same: true if capacity is a multiple of the zone size (no runt
zone), false otherwise.
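
The whole helper could then shrink to something like this (untested sketch):

static inline bool nvmet_bdev_validate_zns_zones(struct nvmet_ns *ns)
{
	/* true if the capacity is a multiple of the zone size (no runt zone) */
	return !(get_capacity(ns->bdev->bd_disk) &
		 (bdev_zone_sectors(ns->bdev) - 1));
}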

> +
> +static inline u8 nvmet_zasl(unsigned int zone_append_sects)
> +{
> +	unsigned int npages = (zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT;
> +	u8 zasl = ilog2(npages);
> +
> +	/*
> +	 * Zone Append Size Limit is the value experessed in the units
> +	 * of minimum memory page size (i.e. 12) and is reported power of 2.
> +	 */
> +	return zasl;

Why not just:

	return ilog2((zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT);

And are you sure that there is no possibility that the u8 type overflows here ?

> +}
> +
> +static inline void nvmet_zns_update_zasl(struct nvmet_ns *ns)
> +{
> +	u8 bio_max_zasl = nvmet_zasl((BIO_MAX_PAGES * PAGE_SIZE) >> 9);
> +	struct request_queue *q = ns->bdev->bd_disk->queue;
> +	struct nvmet_ns *ins;
> +	unsigned long idx;
> +	u8 min_zasl;
> +
> +	/*
> +	 * Calculate new ctrl->zasl value when enabling the new ns. This value
> +	 * has to be the minimum of the max_zone appned values from available

s/appned/append

> +	 * namespaces.
> +	 */
> +	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
> +
> +	xa_for_each(&(ns->subsys->namespaces), idx, ins) {
> +		struct request_queue *iq = ins->bdev->bd_disk->queue;
> +		unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
> +		u8 izasl = nvmet_zasl(imax_za_sects);
> +
> +		if (!bdev_is_zoned(ins->bdev))
> +			continue;
> +
> +		min_zasl = min_zasl > izasl ? izasl : min_zasl;
> +	}
> +
> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);

I do not understand what bio_max_zasl is used for here, as it does not represent
anything related to the zoned namespaces backends.

> +}
> +
> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
> +{
> +	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
> +		pr_err("block devices with conventional zones are not supported.");
> +		return false;
> +	}

It may be better to move this test in nvmet_bdev_validate_zns_zones(), so that
all backend zone configuration checks are together.

> +
> +	if (!nvmet_bdev_validate_zns_zones(ns))
> +		return false;
> +
> +	/*
> +	 * For ZBC and ZAC devices, writes into sequential zones must be aligned
> +	 * to the device physical block size. So use this value as the logical
> +	 * block size to avoid errors.
> +	 */
> +	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
> +
> +	nvmet_zns_update_zasl(ns);
> +
> +	return true;
> +}
> +
> +/*
> + * ZNS related Admin and I/O command handlers.
> + */
> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
> +{
> +	struct nvmet_ctrl *ctrl = req->sq->ctrl;
> +	struct nvme_id_ctrl_zns *id;
> +	u16 status = 0;
> +	u8 mdts;
> +
> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
> +	if (!id) {
> +		status = NVME_SC_INTERNAL;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Even though this function sets Zone Append Size Limit to 0,
> +	 * the 0 value here indicates that the maximum data transfer size for
> +	 * the Zone Append command is indicated by the ctrl
> +	 * Maximum Data Transfer Size (MDTS).
> +	 */

I do not understand this comment. It does not really exactly match what the code
below is doing.

> +
> +	mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
> +
> +	id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
> +
> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
> +
> +	kfree(id);
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
> +{
> +	struct nvme_id_ns_zns *id_zns;
> +	u16 status = 0;
> +	u64 zsze;
> +
> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
> +		req->error_loc = offsetof(struct nvme_identify, nsid);
> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
> +		goto out;
> +	}
> +
> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
> +	if (!id_zns) {
> +		status = NVME_SC_INTERNAL;
> +		goto out;
> +	}
> +
> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
> +	if (!req->ns) {
> +		status = NVME_SC_INTERNAL;
> +		goto done;
> +	}
> +
> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
> +		req->error_loc = offsetof(struct nvme_identify, nsid);
> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
> +		goto done;
> +	}
> +
> +	nvmet_ns_revalidate(req->ns);
> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
> +					req->ns->blksize_shift;
> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
> +
> +done:
> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
> +	kfree(id_zns);
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +struct nvmet_report_zone_data {
> +	struct nvmet_ns *ns;
> +	struct nvme_zone_report *rz;
> +};
> +
> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
> +				     void *data)
> +{
> +	struct nvmet_report_zone_data *report_zone_data = data;
> +	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
> +	struct nvmet_ns *ns = report_zone_data->ns;
> +
> +	entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
> +	entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
> +	entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
> +	entries[idx].za = z->reset ? 1 << 2 : 0;
> +	entries[idx].zt = z->type;
> +	entries[idx].zs = z->cond << 4;
> +
> +	return 0;
> +}
> +
> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
> +{
> +	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;

size_t ?

> +	struct nvmet_report_zone_data data = { .ns = req->ns };
> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
> +	sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
> +	unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);

I think this is wrong since nvmet_zones_to_desc_size includes sizeof(struct
nvme_zone_report). I think what you want here is:

nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
	sizeof(struct nvme_zone_descriptor);

And then you can get rid of nvmet_zones_to_desc_size();

> +	int reported_zones;
> +	u16 status;
> +
> +	status = nvmet_bdev_zns_checks(req);
> +	if (status)
> +		goto out;
> +
> +	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> +	if (!data.rz) {
> +		status = NVME_SC_INTERNAL;
> +		goto out;
> +	}
> +
> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
> +					     nvmet_bdev_report_zone_cb,
> +					     &data);
> +	if (reported_zones < 0) {
> +		status = NVME_SC_INTERNAL;
> +		goto out_free_report_zones;
> +	}
> +
> +	data.rz->nr_zones = cpu_to_le64(reported_zones);
> +
> +	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
> +
> +out_free_report_zones:
> +	kvfree(data.rz);
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
> +{
> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
> +	enum req_opf op = REQ_OP_LAST;
> +	u16 status = NVME_SC_SUCCESS;
> +	sector_t sect;
> +	int ret;
> +
> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
> +
> +	switch (c->zsa) {
> +	case NVME_ZONE_OPEN:
> +		op = REQ_OP_ZONE_OPEN;
> +		break;
> +	case NVME_ZONE_CLOSE:
> +		op = REQ_OP_ZONE_CLOSE;
> +		break;
> +	case NVME_ZONE_FINISH:
> +		op = REQ_OP_ZONE_FINISH;
> +		break;
> +	case NVME_ZONE_RESET:
> +		if (c->select_all)
> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
> +		op = REQ_OP_ZONE_RESET;
> +		break;
> +	default:
> +		status = NVME_SC_INVALID_FIELD;
> +		goto out;
> +	}
> +
> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
> +	if (ret)
> +		status = NVME_SC_INTERNAL;
> +
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
> +{
> +	unsigned long bv_cnt = req->sg_cnt;
> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
> +	u16 status = NVME_SC_SUCCESS;
> +	size_t mapped_data_len = 0;
> +	int sg_cnt = req->sg_cnt;
> +	struct scatterlist *sg;
> +	struct iov_iter from;
> +	struct bio_vec *bvec;
> +	size_t mapped_cnt;
> +	struct bio *bio;
> +	int ret;
> +
> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
> +		return;
> +
> +	/*
> +	 * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
> +	 * don't have to split the bio, i.e. we shouldn't get
> +	 * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
> +	 * with the size that considers the BIO_MAX_PAGES.
> +	 */

What ? Unclear... You advertize max zone append as ctrl->zasl = min(all ns max
zone append sectors). What does BIO_MAX_PAGES has to do with anything ?

> +	if (!req->sg_cnt)
> +		goto out;
> +
> +	if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
> +		goto out;
> +	}
> +
> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
> +	if (!bvec) {
> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
> +		goto out;
> +	}
> +
> +	for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
> +		nvmet_file_init_bvec(bvec, sg);
> +		mapped_data_len += bvec[mapped_cnt].bv_len;
> +		sg_cnt--;
> +		if (mapped_cnt == bv_cnt)
> +			break;
> +	}
> +
> +	if (WARN_ON(sg_cnt)) {
> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
> +		goto out;
> +	}
> +
> +	iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
> +
> +	bio = bio_alloc(GFP_KERNEL, bv_cnt);
> +	bio_set_dev(bio, nvmet_bdev(req));
> +	bio->bi_iter.bi_sector = sect;
> +	bio->bi_opf = op;

The op variable is used only here. It is not necessary.

> +
> +	ret =  __bio_iov_append_get_pages(bio, &from);

I still do not see why bio_iov_iter_get_pages() does not work here. Would you
care to explain please ?

> +	if (unlikely(ret)) {
> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
> +		bio_io_error(bio);
> +		goto bvec_free;
> +	}
> +
> +	ret = submit_bio_wait(bio);
> +	status = ret < 0 ? NVME_SC_INTERNAL : status;
> +	bio_put(bio);
> +
> +	sect += (mapped_data_len >> 9);
> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
> +
> +bvec_free:
> +	kfree(bvec);
> +
> +out:
> +	nvmet_req_complete(req, status);
> +}
> +
> +#else  /* CONFIG_BLK_DEV_ZONED */
> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
> +{
> +}
> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
> +{
> +}
> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
> +{
> +	return 0;
> +}
> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
> +{
> +	return false;
> +}
> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
> +{
> +}
> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
> +{
> +}
> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
> +{
> +}
> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
> +{
> +}
> +#endif /* CONFIG_BLK_DEV_ZONED */
>
Johannes Thumshirn Nov. 30, 2020, 12:29 p.m. UTC | #2
On 30/11/2020 04:32, Chaitanya Kulkarni wrote:
> +	ret =  __bio_iov_append_get_pages(bio, &from);

Can't you just use bio_iov_iter_get_pages() here?

It does have a 

if (WARN_ON_ONCE(is_bvec))
	return -EINVAL;

in it but I think that can be deleted.
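
The call site would then be just (untested sketch, assuming that
WARN_ON_ONCE is dropped):

	ret = bio_iov_iter_get_pages(bio, &from);
	if (unlikely(ret)) {
		status = NVME_SC_INTERNAL | NVME_SC_DNR;
		bio_io_error(bio);
		goto bvec_free;
	}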
Chaitanya Kulkarni Dec. 1, 2020, 3:38 a.m. UTC | #3
On 11/30/20 03:57, Damien Le Moal wrote:
> On 2020/11/30 12:29, Chaitanya Kulkarni wrote:
>> Add zns-bdev-config, id-ctrl, id-ns, zns-cmd-effects, zone-mgmt-send,
>> zone-mgmt-recv and zone-append handlers for NVMeOF target to enable ZNS
>> support for bdev.
>>
>> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
>> ---
>>  drivers/nvme/target/Makefile      |   2 +-
>>  drivers/nvme/target/admin-cmd.c   |   4 +-
>>  drivers/nvme/target/io-cmd-file.c |   2 +-
>>  drivers/nvme/target/nvmet.h       |  19 ++
>>  drivers/nvme/target/zns.c         | 463 ++++++++++++++++++++++++++++++
>>  5 files changed, 486 insertions(+), 4 deletions(-)
>>  create mode 100644 drivers/nvme/target/zns.c
>>
>> diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
>> index ebf91fc4c72e..d050f829b43a 100644
>> --- a/drivers/nvme/target/Makefile
>> +++ b/drivers/nvme/target/Makefile
>> @@ -10,7 +10,7 @@ obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
>>  obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
>>  
>>  nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
>> -			discovery.o io-cmd-file.o io-cmd-bdev.o
>> +		   zns.o discovery.o io-cmd-file.o io-cmd-bdev.o
>>  nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
>>  nvme-loop-y	+= loop.o
>>  nvmet-rdma-y	+= rdma.o
>> diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
>> index dca34489a1dc..509fd8dcca0c 100644
>> --- a/drivers/nvme/target/admin-cmd.c
>> +++ b/drivers/nvme/target/admin-cmd.c
>> @@ -579,8 +579,8 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
>>  	nvmet_req_complete(req, status);
>>  }
>>  
>> -static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
>> -				    void *id, off_t *off)
>> +u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
>> +			     void *id, off_t *off)
>>  {
>>  	struct nvme_ns_id_desc desc = {
>>  		.nidt = type,
>> diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
>> index 0abbefd9925e..2bd10960fa50 100644
>> --- a/drivers/nvme/target/io-cmd-file.c
>> +++ b/drivers/nvme/target/io-cmd-file.c
>> @@ -89,7 +89,7 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns)
>>  	return ret;
>>  }
>>  
>> -static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
>> +void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
>>  {
>>  	bv->bv_page = sg_page(sg);
>>  	bv->bv_offset = sg->offset;
>> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
>> index 592763732065..eee7866ae512 100644
>> --- a/drivers/nvme/target/nvmet.h
>> +++ b/drivers/nvme/target/nvmet.h
>> @@ -81,6 +81,10 @@ struct nvmet_ns {
>>  	struct pci_dev		*p2p_dev;
>>  	int			pi_type;
>>  	int			metadata_size;
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	struct nvme_id_ns_zns	id_zns;
>> +	unsigned int		zasl;
>> +#endif
>>  };
>>  
>>  static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
>> @@ -251,6 +255,10 @@ struct nvmet_subsys {
>>  	unsigned int		admin_timeout;
>>  	unsigned int		io_timeout;
>>  #endif /* CONFIG_NVME_TARGET_PASSTHRU */
>> +
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	struct nvme_id_ctrl_zns	id_ctrl_zns;
>> +#endif
>>  };
>>  
>>  static inline struct nvmet_subsys *to_subsys(struct config_item *item)
>> @@ -603,4 +611,15 @@ static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns)
>>  	return ns->pi_type && ns->metadata_size == sizeof(struct t10_pi_tuple);
>>  }
>>  
>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req);
>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req);
>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off);
>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns);
>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req);
>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req);
>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req);
>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log);
>> +u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
>> +			     void *id, off_t *off);
>> +void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg);
>>  #endif /* _NVMET_H */
>> diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
>> new file mode 100644
>> index 000000000000..40dedfd51fd6
>> --- /dev/null
>> +++ b/drivers/nvme/target/zns.c
>> @@ -0,0 +1,463 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * NVMe ZNS-ZBD command implementation.
>> + * Copyright (c) 2020-2021 HGST, a Western Digital Company.
>> + */
>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>> +#include <linux/uio.h>
>> +#include <linux/nvme.h>
>> +#include <linux/xarray.h>
>> +#include <linux/blkdev.h>
>> +#include <linux/module.h>
>> +#include "nvmet.h"
>> +
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +#define NVMET_MPSMIN_SHIFT	12
>> +
>> +static u16 nvmet_bdev_zns_checks(struct nvmet_req *req)
>> +{
>> +	u16 status = 0;
>> +
>> +	if (!bdev_is_zoned(req->ns->bdev)) {
>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>> +		goto out;
>> +	}
>> +
>> +	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
>> +		status = NVME_SC_INVALID_FIELD;
>> +		goto out;
>> +	}
>> +
>> +	if (req->cmd->zmr.zrasf != NVME_ZRASF_ZONE_REPORT_ALL) {
>> +		status = NVME_SC_INVALID_FIELD;
>> +		goto out;
>> +	}
>> +
>> +	if (req->cmd->zmr.pr != NVME_REPORT_ZONE_PARTIAL)
>> +		status = NVME_SC_INVALID_FIELD;
>> +out:
>> +	return status;
>> +}
>> +
>> +static inline struct block_device *nvmet_bdev(struct nvmet_req *req)
>> +{
>> +	return req->ns->bdev;
>> +}
>> +
>> +static inline  u64 nvmet_zones_to_desc_size(unsigned int nr_zones)
>> +{
>> +	return sizeof(struct nvme_zone_report) +
>> +		(sizeof(struct nvme_zone_descriptor) * nr_zones);
>> +}
>> +
>> +static inline u64 nvmet_sect_to_lba(struct nvmet_ns *ns, sector_t sect)
>> +{
>> +	return sect >> (ns->blksize_shift - SECTOR_SHIFT);
>> +}
>> +
>> +static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba)
>> +{
>> +	return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT);
>> +}
>> +
>> +/*
>> + *  ZNS related command implementation and helpers.
>> + */
>> +
>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>> +{
>> +	u16 nvme_cis_zns = NVME_CSI_ZNS;
>> +
>> +	if (!bdev_is_zoned(nvmet_bdev(req)))
>> +		return NVME_SC_SUCCESS;
>> +
>> +	return nvmet_copy_ns_identifier(req, NVME_NIDT_CSI, NVME_NIDT_CSI_LEN,
>> +					&nvme_cis_zns, off);
>> +}
>> +
>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>> +{
>> +	log->iocs[nvme_cmd_zone_append]		= cpu_to_le32(1 << 0);
>> +	log->iocs[nvme_cmd_zone_mgmt_send]	= cpu_to_le32(1 << 0);
>> +	log->iocs[nvme_cmd_zone_mgmt_recv]	= cpu_to_le32(1 << 0);
>> +}
>> +
>> +static int nvmet_bdev_validate_zns_zones_cb(struct blk_zone *z,
>> +					    unsigned int idx, void *data)
>> +{
>> +	struct blk_zone *zone = data;
>> +
>> +	memcpy(zone, z, sizeof(struct blk_zone));
>> +
>> +	return 0;
>> +}
>> +
>> +static inline bool nvmet_bdev_validate_zns_zones(struct nvmet_ns *ns)
>> +{
>> +	sector_t last_sect = get_capacity(ns->bdev->bd_disk) - 1;
>> +	struct blk_zone last_zone, first_zone;
>> +	int reported_zones;
>> +
>> +	reported_zones = blkdev_report_zones(ns->bdev, 0, 1,
>> +					     nvmet_bdev_validate_zns_zones_cb,
>> +					     &first_zone);
>> +	if (reported_zones != 1)
>> +		return false;
>> +
>> +	reported_zones = blkdev_report_zones(ns->bdev, last_sect, 1,
>> +					     nvmet_bdev_validate_zns_zones_cb,
>> +					     &last_zone);
>> +	if (reported_zones != 1)
>> +		return false;
>> +
>> +	return first_zone.capacity == last_zone.capacity ? true : false;
>> +}
> That is really heavy handed... Why not just simply:
>
> return !(get_capacity(ns->bdev->bd_disk) & (bdev_zone_sectors(ns->bdev) - 1));
>
> That does the same: true if capacity is a multiple of the zone size (no runt
> zone), false otherwise.
Okay.
>
>> +
>> +static inline u8 nvmet_zasl(unsigned int zone_append_sects)
>> +{
>> +	unsigned int npages = (zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT;
>> +	u8 zasl = ilog2(npages);
>> +
>> +	/*
>> +	 * Zone Append Size Limit is the value experessed in the units
>> +	 * of minimum memory page size (i.e. 12) and is reported power of 2.
>> +	 */
>> +	return zasl;
> Why not just:
>
> 	return ilog2((zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT);
Okay.
> And are you sure that there is no possibility that the u8 type overflows here ?
nvmet_zasl() is called with arguments that are unsigned ints:-
1. (BIO_MAX_PAGES * PAGE_SIZE) >> 9.
2. queue_max_zone_append_sectors().
3. queue_max_zone_append_sectors().

The size of unsigned int on a 32-bit machine should be 4 bytes, so
0xFFFFFFFF >> 12 = 0xFFFFF and log2(0xFFFFF) should fit into a u8,
unless I'm completely wrong.
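
To spell out the worst case (a sketch of the arithmetic only, assuming the
shifts stay in 32-bit unsigned int):

	/*
	 * zone_append_sects = 0xFFFFFFFF (UINT_MAX sectors)
	 * npages = (0xFFFFFFFF << 9) >> 12
	 *        = 0xFFFFFE00 >> 12	(the << 9 wraps in unsigned int)
	 *        = 0x000FFFFF
	 * zasl   = ilog2(0x000FFFFF) = 19, which fits in a u8
	 */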
>> +}
>> +
>> +static inline void nvmet_zns_update_zasl(struct nvmet_ns *ns)
>> +{
>> +	u8 bio_max_zasl = nvmet_zasl((BIO_MAX_PAGES * PAGE_SIZE) >> 9);
>> +	struct request_queue *q = ns->bdev->bd_disk->queue;
>> +	struct nvmet_ns *ins;
>> +	unsigned long idx;
>> +	u8 min_zasl;
>> +
>> +	/*
>> +	 * Calculate new ctrl->zasl value when enabling the new ns. This value
>> +	 * has to be the minimum of the max_zone appned values from available
> s/appned/append
Okay.
>> +	 * namespaces.
>> +	 */
>> +	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
>> +
>> +	xa_for_each(&(ns->subsys->namespaces), idx, ins) {
>> +		struct request_queue *iq = ins->bdev->bd_disk->queue;
>> +		unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
>> +		u8 izasl = nvmet_zasl(imax_za_sects);
>> +
>> +		if (!bdev_is_zoned(ins->bdev))
>> +			continue;
>> +
>> +		min_zasl = min_zasl > izasl ? izasl : min_zasl;
>> +	}
>> +
>> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
> I do not understand what bio_max_zasl is used for here, as it does not represent
> anything related to the zoned namespaces backends.
If we don't consider the bio max zasl then we can get a zone append request
larger than one bio can handle, which will lead to a bio split that needs to
be avoided.
>> +}
>> +
>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>> +{
>> +	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
>> +		pr_err("block devices with conventional zones are not supported.");
>> +		return false;
>> +	}
> It may be better to move this test in nvmet_bdev_validate_zns_zones(), so that
> all backend zone configuration checks are together.
Okay.
>> +
>> +	if (!nvmet_bdev_validate_zns_zones(ns))
>> +		return false;
>> +
>> +	/*
>> +	 * For ZBC and ZAC devices, writes into sequential zones must be aligned
>> +	 * to the device physical block size. So use this value as the logical
>> +	 * block size to avoid errors.
>> +	 */
>> +	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
>> +
>> +	nvmet_zns_update_zasl(ns);
>> +
>> +	return true;
>> +}
>> +
>> +/*
>> + * ZNS related Admin and I/O command handlers.
>> + */
>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>> +{
>> +	struct nvmet_ctrl *ctrl = req->sq->ctrl;
>> +	struct nvme_id_ctrl_zns *id;
>> +	u16 status = 0;
>> +	u8 mdts;
>> +
>> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
>> +	if (!id) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * Even though this function sets Zone Append Size Limit to 0,
>> +	 * the 0 value here indicates that the maximum data transfer size for
>> +	 * the Zone Append command is indicated by the ctrl
>> +	 * Maximum Data Transfer Size (MDTS).
>> +	 */
> I do not understand this comment. It does not really exactly match what the code
> below is doing.
Let me fix the code and the comment in next version.
>> +
>> +	mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
>> +
>> +	id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
>> +
>> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>> +
>> +	kfree(id);
>> +out:
>> +	nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>> +{
>> +	struct nvme_id_ns_zns *id_zns;
>> +	u16 status = 0;
>> +	u64 zsze;
>> +
>> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>> +		goto out;
>> +	}
>> +
>> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>> +	if (!id_zns) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto out;
>> +	}
>> +
>> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>> +	if (!req->ns) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto done;
>> +	}
>> +
>> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>> +		goto done;
>> +	}
>> +
>> +	nvmet_ns_revalidate(req->ns);
>> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
>> +					req->ns->blksize_shift;
>> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
>> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
>> +
>> +done:
>> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>> +	kfree(id_zns);
>> +out:
>> +	nvmet_req_complete(req, status);
>> +}
>> +
>> +struct nvmet_report_zone_data {
>> +	struct nvmet_ns *ns;
>> +	struct nvme_zone_report *rz;
>> +};
>> +
>> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
>> +				     void *data)
>> +{
>> +	struct nvmet_report_zone_data *report_zone_data = data;
>> +	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
>> +	struct nvmet_ns *ns = report_zone_data->ns;
>> +
>> +	entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
>> +	entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
>> +	entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
>> +	entries[idx].za = z->reset ? 1 << 2 : 0;
>> +	entries[idx].zt = z->type;
>> +	entries[idx].zs = z->cond << 4;
>> +
>> +	return 0;
>> +}
>> +
>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>> +{
>> +	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
> size_t ?
Yes.
>> +	struct nvmet_report_zone_data data = { .ns = req->ns };
>> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
>> +	sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
>> +	unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);
> I think this is wrong since nvmet_zones_to_desc_size includes sizeof(struct
> nvme_zone_report). I think what you want here is:
>
> nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
> 	sizeof(struct nvme_zone_descriptor);
>
> And then you can get rid of nvmet_zones_to_desc_size();
Yes, it doesn't need nvmet_zones_to_desc_size() anymore from v1.
>> +	int reported_zones;
>> +	u16 status;
>> +
>> +	status = nvmet_bdev_zns_checks(req);
>> +	if (status)
>> +		goto out;
>> +
>> +	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>> +	if (!data.rz) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto out;
>> +	}
>> +
>> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
>> +					     nvmet_bdev_report_zone_cb,
>> +					     &data);
>> +	if (reported_zones < 0) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto out_free_report_zones;
>> +	}
>> +
>> +	data.rz->nr_zones = cpu_to_le64(reported_zones);
>> +
>> +	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
>> +
>> +out_free_report_zones:
>> +	kvfree(data.rz);
>> +out:
>> +	nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>> +{
>> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
>> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
>> +	enum req_opf op = REQ_OP_LAST;
>> +	u16 status = NVME_SC_SUCCESS;
>> +	sector_t sect;
>> +	int ret;
>> +
>> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
>> +
>> +	switch (c->zsa) {
>> +	case NVME_ZONE_OPEN:
>> +		op = REQ_OP_ZONE_OPEN;
>> +		break;
>> +	case NVME_ZONE_CLOSE:
>> +		op = REQ_OP_ZONE_CLOSE;
>> +		break;
>> +	case NVME_ZONE_FINISH:
>> +		op = REQ_OP_ZONE_FINISH;
>> +		break;
>> +	case NVME_ZONE_RESET:
>> +		if (c->select_all)
>> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
>> +		op = REQ_OP_ZONE_RESET;
>> +		break;
>> +	default:
>> +		status = NVME_SC_INVALID_FIELD;
>> +		goto out;
>> +	}
>> +
>> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
>> +	if (ret)
>> +		status = NVME_SC_INTERNAL;
>> +
>> +out:
>> +	nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>> +{
>> +	unsigned long bv_cnt = req->sg_cnt;
>> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
>> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
>> +	u16 status = NVME_SC_SUCCESS;
>> +	size_t mapped_data_len = 0;
>> +	int sg_cnt = req->sg_cnt;
>> +	struct scatterlist *sg;
>> +	struct iov_iter from;
>> +	struct bio_vec *bvec;
>> +	size_t mapped_cnt;
>> +	struct bio *bio;
>> +	int ret;
>> +
>> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>> +		return;
>> +
>> +	/*
>> +	 * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
>> +	 * don't have to split the bio, i.e. we shouldn't get
>> +	 * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
>> +	 * with the size that considers the BIO_MAX_PAGES.
>> +	 */
> What ? Unclear... You advertize max zone append as ctrl->zasl = min(all ns max
> zone append sectors). What does BIO_MAX_PAGES has to do with anything ?
That is not true, ctrl->zasl is advertised as
min(min(all ns max zone append sectors), bio_max_zasl) to avoid splitting
the bio on the target side:-

See nvmet_zns_update_zasl()

+	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);

Without considering BIO_MAX_PAGES we may have to set a ctrl->zasl value
that is > bio_max_zasl, and then we'll get a request that is greater than
what one bio can handle, which will lead to splitting the bios, which we
want to avoid as per the comment in V1.

Can you please explain what is wrong with this approach, which regulates the
zasl with all the possible namespaces' zasl and bio_zasl?

Maybe there is a better way?
>> +	if (!req->sg_cnt)
>> +		goto out;
>> +
>> +	if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>> +		goto out;
>> +	}
>> +
>> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
>> +	if (!bvec) {
>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>> +		goto out;
>> +	}
>> +
>> +	for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
>> +		nvmet_file_init_bvec(bvec, sg);
>> +		mapped_data_len += bvec[mapped_cnt].bv_len;
>> +		sg_cnt--;
>> +		if (mapped_cnt == bv_cnt)
>> +			break;
>> +	}
>> +
>> +	if (WARN_ON(sg_cnt)) {
>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>> +		goto out;
>> +	}
>> +
>> +	iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
>> +
>> +	bio = bio_alloc(GFP_KERNEL, bv_cnt);
>> +	bio_set_dev(bio, nvmet_bdev(req));
>> +	bio->bi_iter.bi_sector = sect;
>> +	bio->bi_opf = op;
> The op variable is used only here. It is not necessary.
Yes.
>> +
>> +	ret =  __bio_iov_append_get_pages(bio, &from);
> I still do not see why bio_iov_iter_get_pages() does not work here. Would you
> care to explain please ?
>
The same reason pointed out by Johannes. Instead of calling the wrapper,
call the underlying core API, since we don't care about the rest of the
generic code. This also avoids an extra function call in the zone-append
fast path.
>> +	if (unlikely(ret)) {
>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>> +		bio_io_error(bio);
>> +		goto bvec_free;
>> +	}
>> +
>> +	ret = submit_bio_wait(bio);
>> +	status = ret < 0 ? NVME_SC_INTERNAL : status;
>> +	bio_put(bio);
>> +
>> +	sect += (mapped_data_len >> 9);
>> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
>> +
>> +bvec_free:
>> +	kfree(bvec);
>> +
>> +out:
>> +	nvmet_req_complete(req, status);
>> +}
>> +
>> +#else  /* CONFIG_BLK_DEV_ZONED */
>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>> +{
>> +}
>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>> +{
>> +}
>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>> +{
>> +	return 0;
>> +}
>> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
>> +{
>> +	return false;
>> +}
>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>> +{
>> +}
>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>> +{
>> +}
>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>> +{
>> +}
>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>> +{
>> +}
>> +#endif /* CONFIG_BLK_DEV_ZONED */
>>
>
Chaitanya Kulkarni Dec. 1, 2020, 3:49 a.m. UTC | #4
On 11/30/20 04:29, Johannes Thumshirn wrote:
> On 30/11/2020 04:32, Chaitanya Kulkarni wrote:
>> +	ret =  __bio_iov_append_get_pages(bio, &from);
> Can't you just use bio_iov_iter_get_pages() here?
>
> It does have a 
>
> if (WARN_ON_ONCE(is_bvec))
> 	return -EINVAL;
>
> in it but I think that can be deleted.
>
That was my initial patch but it adds an extra function call to the
fast path for NVMeOF. We don't need any of the generic functionality from
bio_iov_iter_get_pages() anyway.

Why add an extra function call overhead in the hot path for each I/O ?
Damien Le Moal Dec. 1, 2020, 5:44 a.m. UTC | #5
On 2020/12/01 12:38, Chaitanya Kulkarni wrote:
[...]
>>> +	 * namespaces.
>>> +	 */
>>> +	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
>>> +
>>> +	xa_for_each(&(ns->subsys->namespaces), idx, ins) {
>>> +		struct request_queue *iq = ins->bdev->bd_disk->queue;
>>> +		unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
>>> +		u8 izasl = nvmet_zasl(imax_za_sects);
>>> +
>>> +		if (!bdev_is_zoned(ins->bdev))
>>> +			continue;
>>> +
>>> +		min_zasl = min_zasl > izasl ? izasl : min_zasl;
>>> +	}
>>> +
>>> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>> I do not understand what bio_max_zasl is used for here, as it does not represent
>> anything related to the zoned namespaces backends.
> If we don't consider the bio max zasl then we can get a zone append request
> larger than one bio can handle, which will lead to a bio split that needs to
> be avoided.

Why ? zasl comes from the target backend max zone append sectors, which we
already know is OK and does not lead to BIO splitting for zone append. So why do
you need an extra verification against that bio_max_zasl ?

>>> +}
>>> +
>>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>>> +{
>>> +	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
>>> +		pr_err("block devices with conventional zones are not supported.");
>>> +		return false;
>>> +	}
>> It may be better to move this test in nvmet_bdev_validate_zns_zones(), so that
>> all backend zone configuration checks are together.
> Okay.
>>> +
>>> +	if (!nvmet_bdev_validate_zns_zones(ns))
>>> +		return false;
>>> +
>>> +	/*
>>> +	 * For ZBC and ZAC devices, writes into sequential zones must be aligned
>>> +	 * to the device physical block size. So use this value as the logical
>>> +	 * block size to avoid errors.
>>> +	 */
>>> +	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
>>> +
>>> +	nvmet_zns_update_zasl(ns);
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +/*
>>> + * ZNS related Admin and I/O command handlers.
>>> + */
>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>> +{
>>> +	struct nvmet_ctrl *ctrl = req->sq->ctrl;
>>> +	struct nvme_id_ctrl_zns *id;
>>> +	u16 status = 0;
>>> +	u8 mdts;
>>> +
>>> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
>>> +	if (!id) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	/*
>>> +	 * Even though this function sets Zone Append Size Limit to 0,
>>> +	 * the 0 value here indicates that the maximum data transfer size for
>>> +	 * the Zone Append command is indicated by the ctrl
>>> +	 * Maximum Data Transfer Size (MDTS).
>>> +	 */
>> I do not understand this comment. It does not really exactly match what the code
>> below is doing.
> Let me fix the code and the comment in next version.
>>> +
>>> +	mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
>>> +
>>> +	id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
>>> +
>>> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>>> +
>>> +	kfree(id);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>> +{
>>> +	struct nvme_id_ns_zns *id_zns;
>>> +	u16 status = 0;
>>> +	u64 zsze;
>>> +
>>> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>>> +	if (!id_zns) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>>> +	if (!req->ns) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto done;
>>> +	}
>>> +
>>> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto done;
>>> +	}
>>> +
>>> +	nvmet_ns_revalidate(req->ns);
>>> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
>>> +					req->ns->blksize_shift;
>>> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>>> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
>>> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
>>> +
>>> +done:
>>> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>>> +	kfree(id_zns);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +struct nvmet_report_zone_data {
>>> +	struct nvmet_ns *ns;
>>> +	struct nvme_zone_report *rz;
>>> +};
>>> +
>>> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
>>> +				     void *data)
>>> +{
>>> +	struct nvmet_report_zone_data *report_zone_data = data;
>>> +	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
>>> +	struct nvmet_ns *ns = report_zone_data->ns;
>>> +
>>> +	entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
>>> +	entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
>>> +	entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
>>> +	entries[idx].za = z->reset ? 1 << 2 : 0;
>>> +	entries[idx].zt = z->type;
>>> +	entries[idx].zs = z->cond << 4;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>> +{
>>> +	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
>> size_t ?
> Yes.
>>> +	struct nvmet_report_zone_data data = { .ns = req->ns };
>>> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
>>> +	unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);
>> I think this is wrong since nvmet_zones_to_desc_size includes sizeof(struct
>> nvme_zone_report). I think what you want here is:
>>
>> nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
>> 	sizeof(struct nvme_zone_descriptor);
>>
>> And then you can get rid of nvmet_zones_to_desc_size();
> Yes, it doesn't need nvmet_zones_to_desc_size() anymore from v1.
>>> +	int reported_zones;
>>> +	u16 status;
>>> +
>>> +	status = nvmet_bdev_zns_checks(req);
>>> +	if (status)
>>> +		goto out;
>>> +
>>> +	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>>> +	if (!data.rz) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
>>> +					     nvmet_bdev_report_zone_cb,
>>> +					     &data);
>>> +	if (reported_zones < 0) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out_free_report_zones;
>>> +	}
>>> +
>>> +	data.rz->nr_zones = cpu_to_le64(reported_zones);
>>> +
>>> +	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
>>> +
>>> +out_free_report_zones:
>>> +	kvfree(data.rz);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>> +{
>>> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
>>> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
>>> +	enum req_opf op = REQ_OP_LAST;
>>> +	u16 status = NVME_SC_SUCCESS;
>>> +	sector_t sect;
>>> +	int ret;
>>> +
>>> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
>>> +
>>> +	switch (c->zsa) {
>>> +	case NVME_ZONE_OPEN:
>>> +		op = REQ_OP_ZONE_OPEN;
>>> +		break;
>>> +	case NVME_ZONE_CLOSE:
>>> +		op = REQ_OP_ZONE_CLOSE;
>>> +		break;
>>> +	case NVME_ZONE_FINISH:
>>> +		op = REQ_OP_ZONE_FINISH;
>>> +		break;
>>> +	case NVME_ZONE_RESET:
>>> +		if (c->select_all)
>>> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
>>> +		op = REQ_OP_ZONE_RESET;
>>> +		break;
>>> +	default:
>>> +		status = NVME_SC_INVALID_FIELD;
>>> +		goto out;
>>> +	}
>>> +
>>> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
>>> +	if (ret)
>>> +		status = NVME_SC_INTERNAL;
>>> +
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>> +{
>>> +	unsigned long bv_cnt = req->sg_cnt;
>>> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>>> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
>>> +	u16 status = NVME_SC_SUCCESS;
>>> +	size_t mapped_data_len = 0;
>>> +	int sg_cnt = req->sg_cnt;
>>> +	struct scatterlist *sg;
>>> +	struct iov_iter from;
>>> +	struct bio_vec *bvec;
>>> +	size_t mapped_cnt;
>>> +	struct bio *bio;
>>> +	int ret;
>>> +
>>> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>>> +		return;
>>> +
>>> +	/*
>>> +	 * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
>>> +	 * don't have to split the bio, i.e. we shouldn't get
>>> +	 * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
>>> +	 * with the size that considers the BIO_MAX_PAGES.
>>> +	 */
>> What ? Unclear... You advertize max zone append as ctrl->zasl = min(all ns max
>> zone append sectors). What does BIO_MAX_PAGES has to do with anything ?
> That is not true, ctrl->zasl is advertised as
> min(min(all ns max zone append sectors), bio_max_zasl) to avoid splitting
> the bio on the target side:-

We already know that the zasl for each NS, given by the max zone append
sectors of the backend, is OK, regardless of BIO_MAX_PAGES (see below).

> 
> See nvmet_zns_update_zasl()
> 
> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
> 
> Without considering BIO_MAX_PAGES we may have to set a ctrl->zasl value
> that is > bio_max_zasl, and then we'll get a request that is greater than
> what one bio can handle, which will lead to splitting the bios, which we
> want to avoid as per the comment in V1.
> 
> Can you please explain what is wrong with this approach, which regulates the
> zasl with all the possible namespaces' zasl and bio_zasl?

For starters, with multi-page bvecs now, I am not sure BIO_MAX_PAGES makes any
sense anymore. Next, on the target side, the max zone append sectors limit for
each NS guarantees that a single BIO can be used without any splitting needed.
Taking the min of all of them will NOT remove that guarantee. Hence I think that
BIO_MAX_PAGES has nothing to do in the middle of all this.
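
IOW, something like this should be all that is needed (untested sketch,
simply dropping bio_max_zasl from your nvmet_zns_update_zasl()):

static inline void nvmet_zns_update_zasl(struct nvmet_ns *ns)
{
	struct request_queue *q = ns->bdev->bd_disk->queue;
	struct nvmet_ns *ins;
	unsigned long idx;
	u8 min_zasl;

	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));

	xa_for_each(&ns->subsys->namespaces, idx, ins) {
		struct request_queue *iq = ins->bdev->bd_disk->queue;

		if (!bdev_is_zoned(ins->bdev))
			continue;

		min_zasl = min(min_zasl,
			       nvmet_zasl(queue_max_zone_append_sectors(iq)));
	}

	ns->subsys->id_ctrl_zns.zasl = min_zasl;
}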

> 
> Maybe there is a better way?
>>> +	if (!req->sg_cnt)
>>> +		goto out;
>>> +
>>> +	if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
>>> +	if (!bvec) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
>>> +		nvmet_file_init_bvec(bvec, sg);
>>> +		mapped_data_len += bvec[mapped_cnt].bv_len;
>>> +		sg_cnt--;
>>> +		if (mapped_cnt == bv_cnt)
>>> +			break;
>>> +	}
>>> +
>>> +	if (WARN_ON(sg_cnt)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
>>> +
>>> +	bio = bio_alloc(GFP_KERNEL, bv_cnt);
>>> +	bio_set_dev(bio, nvmet_bdev(req));
>>> +	bio->bi_iter.bi_sector = sect;
>>> +	bio->bi_opf = op;
>> The op variable is used only here. It is not necessary.
> Yes.
>>> +
>>> +	ret =  __bio_iov_append_get_pages(bio, &from);
>> I still do not see why bio_iov_iter_get_pages() does not work here. Would you
>> care to explain please ?
>>
> The same reason pointed out by Johannes. Instead of calling the wrapper,
> call the underlying core API, since we don't care about the rest of the
> generic code. This also avoids an extra function call in the zone-append
> fast path.

I am still not convinced that __bio_iov_append_get_pages() will do the right
thing as it lacks the loop that bio_iov_iter_get_pages() has.

>>> +	if (unlikely(ret)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		bio_io_error(bio);
>>> +		goto bvec_free;
>>> +	}
>>> +
>>> +	ret = submit_bio_wait(bio);
>>> +	status = ret < 0 ? NVME_SC_INTERNAL : status;
>>> +	bio_put(bio);
>>> +
>>> +	sect += (mapped_data_len >> 9);
>>> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
>>> +
>>> +bvec_free:
>>> +	kfree(bvec);
>>> +
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +#else  /* CONFIG_BLK_DEV_ZONED */
>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>> +{
>>> +}
>>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>>> +{
>>> +	return 0;
>>> +}
>>> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
>>> +{
>>> +	return false;
>>> +}
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>>> +{
>>> +}
>>> +#endif /* CONFIG_BLK_DEV_ZONED */
>>>
>>
> 
>
Johannes Thumshirn Dec. 1, 2020, 7:51 a.m. UTC | #6
On 01/12/2020 04:49, Chaitanya Kulkarni wrote:
> On 11/30/20 04:29, Johannes Thumshirn wrote:
>> On 30/11/2020 04:32, Chaitanya Kulkarni wrote:
>>> +	ret =  __bio_iov_append_get_pages(bio, &from);
>> Can't you just use bio_iov_iter_get_pages() here?
>>
>> It does have a 
>>
>> if (WARN_ON_ONCE(is_bvec))
>> 	return -EINVAL;
>>
>> in it but I think that can be deleted.
>>
> That was my initial patch but it adds an extra function call to the
> fast path for NVMeOF. We don't need any of the generic functionality from
> bio_iov_iter_get_pages() anyway.
> 
> Why add an extra function call overhead in the hot path for each I/O ?

At least in my compilation (gcc 10.1) there's no extra function call overhead.
__bio_iov_append_get_pages() gets fully inlined into bio_iov_iter_get_pages().

$ make block/bio.s
  CALL    scripts/checksyscalls.sh
  CALL    scripts/atomic/check-atomics.sh
  DESCEND  objtool
  CC      block/bio.s
$ grep __bio_iov_append_get_pages block/bio.s
$ grep bio_iov_iter_get_pages block/bio.s
__kstrtab_bio_iov_iter_get_pages:
        .asciz  "bio_iov_iter_get_pages"
__kstrtabns_bio_iov_iter_get_pages:
        .section "___ksymtab_gpl+bio_iov_iter_get_pages", "a"
__ksymtab_bio_iov_iter_get_pages:
        .long   bio_iov_iter_get_pages- .
        .long   __kstrtab_bio_iov_iter_get_pages- .
        .long   __kstrtabns_bio_iov_iter_get_pages- .
        .globl  bio_iov_iter_get_pages
        .type   bio_iov_iter_get_pages, @function
bio_iov_iter_get_pages:
        .type   bio_iov_iter_get_pages.cold, @function
bio_iov_iter_get_pages.cold:
        .size   bio_iov_iter_get_pages, .-bio_iov_iter_get_pages
        .size   bio_iov_iter_get_pages.cold, .-bio_iov_iter_get_pages.cold
        .type   __UNIQUE_ID___addressable_bio_iov_iter_get_pages499, @object
        .size   __UNIQUE_ID___addressable_bio_iov_iter_get_pages499, 8
__UNIQUE_ID___addressable_bio_iov_iter_get_pages499:
        .quad   bio_iov_iter_get_pages
Chaitanya Kulkarni Dec. 1, 2020, 10:36 p.m. UTC | #7
On 11/30/20 23:51, Johannes Thumshirn wrote:
>> Why add an extra function call overhead in the hot path for each I/O ?
> At least in my compilation (gcc 10.1) there's no extra function call overhead.
> __bio_iov_append_get_pages() gets fully inlined into bio_iov_iter_get_pages().
>
> $ make block/bio.s
>   CALL    scripts/checksyscalls.sh
>   CALL    scripts/atomic/check-atomics.sh
>   DESCEND  objtool
>   CC      block/bio.s
> $ grep __bio_iov_append_get_pages block/bio.s
> $ grep bio_iov_iter_get_pages block/bio.s
> __kstrtab_bio_iov_iter_get_pages:
>         .asciz  "bio_iov_iter_get_pages"
> __kstrtabns_bio_iov_iter_get_pages:
>         .section "___ksymtab_gpl+bio_iov_iter_get_pages", "a"
> __ksymtab_bio_iov_iter_get_pages:
>         .long   bio_iov_iter_get_pages- .
>         .long   __kstrtab_bio_iov_iter_get_pages- .
>         .long   __kstrtabns_bio_iov_iter_get_pages- .
>         .globl  bio_iov_iter_get_pages
>         .type   bio_iov_iter_get_pages, @function
> bio_iov_iter_get_pages:
>         .type   bio_iov_iter_get_pages.cold, @function
> bio_iov_iter_get_pages.cold:
>         .size   bio_iov_iter_get_pages, .-bio_iov_iter_get_pages
>         .size   bio_iov_iter_get_pages.cold, .-bio_iov_iter_get_pages.cold
>         .type   __UNIQUE_ID___addressable_bio_iov_iter_get_pages499, @object
>         .size   __UNIQUE_ID___addressable_bio_iov_iter_get_pages499, 8
> __UNIQUE_ID___addressable_bio_iov_iter_get_pages499:
>         .quad   bio_iov_iter_get_pages
>
>
So it does on mine too, but that doesn't guarantee it on all platforms.
Chaitanya Kulkarni Dec. 1, 2020, 10:37 p.m. UTC | #8
On 11/30/20 21:44, Damien Le Moal wrote:
> On 2020/12/01 12:38, Chaitanya Kulkarni wrote:
> [...]
>>>> +	 * namespaces.
>>>> +	 */
>>>> +	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
>>>> +
>>>> +	xa_for_each(&(ns->subsys->namespaces), idx, ins) {
>>>> +		struct request_queue *iq = ins->bdev->bd_disk->queue;
>>>> +		unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
>>>> +		u8 izasl = nvmet_zasl(imax_za_sects);
>>>> +
>>>> +		if (!bdev_is_zoned(ins->bdev))
>>>> +			continue;
>>>> +
>>>> +		min_zasl = min_zasl > izasl ? izasl : min_zasl;
>>>> +	}
>>>> +
>>>> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>>> I do not understand what bio_max_zasl is used for here, as it does not represent
>>> anything related to the zoned namespace backends.
>> If we don't consider the bio max zasl then we may get a request larger than
>> one bio can handle for zone append, which will lead to a bio split that needs
>> to be avoided.
> Why ? zasl comes from the target backend max zone append sectors, which we
> already know is OK and does not lead to BIO splitting for zone append. So why do
> you need an extra verification against that bio_max_zasl ?
>
>>>> +}
>>>> +
>>>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>>>> +{
>>>> +	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
>>>> +		pr_err("block devices with conventional zones are not supported.");
>>>> +		return false;
>>>> +	}
>>> It may be better to move this test in nvmet_bdev_validate_zns_zones(), so that
>>> all backend zone configuration checks are together.
>> Okay.
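For illustration, a sketch of the consolidated helper with the conventional-zone check folded into the validation path (only a sketch of the suggestion above, not the posted code):

static bool nvmet_bdev_validate_zns_zones(struct nvmet_ns *ns)
{
	sector_t last_sect = get_capacity(ns->bdev->bd_disk) - 1;
	struct blk_zone last_zone, first_zone;
	int reported_zones;

	/* keep all backend zone configuration checks in one place */
	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
		pr_err("block devices with conventional zones are not supported.");
		return false;
	}

	reported_zones = blkdev_report_zones(ns->bdev, 0, 1,
					     nvmet_bdev_validate_zns_zones_cb,
					     &first_zone);
	if (reported_zones != 1)
		return false;

	reported_zones = blkdev_report_zones(ns->bdev, last_sect, 1,
					     nvmet_bdev_validate_zns_zones_cb,
					     &last_zone);
	if (reported_zones != 1)
		return false;

	return first_zone.capacity == last_zone.capacity;
}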
>>>> +
>>>> +	if (!nvmet_bdev_validate_zns_zones(ns))
>>>> +		return false;
>>>> +
>>>> +	/*
>>>> +	 * For ZBC and ZAC devices, writes into sequential zones must be aligned
>>>> +	 * to the device physical block size. So use this value as the logical
>>>> +	 * block size to avoid errors.
>>>> +	 */
>>>> +	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
>>>> +
>>>> +	nvmet_zns_update_zasl(ns);
>>>> +
>>>> +	return true;
>>>> +}
>>>> +
>>>> +/*
>>>> + * ZNS related Admin and I/O command handlers.
>>>> + */
>>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>>> +{
>>>> +	struct nvmet_ctrl *ctrl = req->sq->ctrl;
>>>> +	struct nvme_id_ctrl_zns *id;
>>>> +	u16 status = 0;
>>>> +	u8 mdts;
>>>> +
>>>> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +	if (!id) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Even though this function sets Zone Append Size Limit to 0,
>>>> +	 * the 0 value here indicates that the maximum data transfer size for
>>>> +	 * the Zone Append command is indicated by the ctrl
>>>> +	 * Maximum Data Transfer Size (MDTS).
>>>> +	 */
>>> I do not understand this comment. It does not really match what the code
>>> below is doing.
>> Let me fix the code and the comment in next version.
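One possible shape of that fix, shown only as a hedged sketch (the actual v4 may differ): clamp the subsystem zasl to MDTS when the transport reports one, and otherwise advertise the subsystem zasl directly instead of letting it collapse to 0.

	if (ctrl->ops->get_mdts)
		id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl),
				 ctrl->subsys->id_ctrl_zns.zasl);
	else
		id->zasl = ctrl->subsys->id_ctrl_zns.zasl;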
>>>> +
>>>> +	mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
>>>> +
>>>> +	id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
>>>> +
>>>> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>>>> +
>>>> +	kfree(id);
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>>> +{
>>>> +	struct nvme_id_ns_zns *id_zns;
>>>> +	u16 status = 0;
>>>> +	u64 zsze;
>>>> +
>>>> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>>>> +	if (!id_zns) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>>>> +	if (!req->ns) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto done;
>>>> +	}
>>>> +
>>>> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
>>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>>> +		goto done;
>>>> +	}
>>>> +
>>>> +	nvmet_ns_revalidate(req->ns);
>>>> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
>>>> +					req->ns->blksize_shift;
>>>> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>>>> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
>>>> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
>>>> +
>>>> +done:
>>>> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>>>> +	kfree(id_zns);
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +struct nvmet_report_zone_data {
>>>> +	struct nvmet_ns *ns;
>>>> +	struct nvme_zone_report *rz;
>>>> +};
>>>> +
>>>> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
>>>> +				     void *data)
>>>> +{
>>>> +	struct nvmet_report_zone_data *report_zone_data = data;
>>>> +	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
>>>> +	struct nvmet_ns *ns = report_zone_data->ns;
>>>> +
>>>> +	entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
>>>> +	entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
>>>> +	entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
>>>> +	entries[idx].za = z->reset ? 1 << 2 : 0;
>>>> +	entries[idx].zt = z->type;
>>>> +	entries[idx].zs = z->cond << 4;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>>> +{
>>>> +	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
>>> size_t ?
>> Yes.
>>>> +	struct nvmet_report_zone_data data = { .ns = req->ns };
>>>> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
>>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
>>>> +	unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);
>>> I think this is wrong since nvmet_zones_to_desc_size includes sizeof(struct
>>> nvme_zone_report). I think what you want here is:
>>>
>>> nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
>>> 	sizeof(struct nvme_zone_descriptor);
>>>
>>> And then you can get rid of nvmet_zones_to_desc_size();
>> Yes, it doesn't need nvmet_zones_to_desc_size() anymore from v1.
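A minimal sketch of the corrected sizing along those lines, assuming the payload is a struct nvme_zone_report header followed by nvme_zone_descriptor entries:

	/* descriptors follow the nvme_zone_report header in the payload */
	unsigned int nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
				sizeof(struct nvme_zone_descriptor);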
>>>> +	int reported_zones;
>>>> +	u16 status;
>>>> +
>>>> +	status = nvmet_bdev_zns_checks(req);
>>>> +	if (status)
>>>> +		goto out;
>>>> +
>>>> +	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>>>> +	if (!data.rz) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
>>>> +					     nvmet_bdev_report_zone_cb,
>>>> +					     &data);
>>>> +	if (reported_zones < 0) {
>>>> +		status = NVME_SC_INTERNAL;
>>>> +		goto out_free_report_zones;
>>>> +	}
>>>> +
>>>> +	data.rz->nr_zones = cpu_to_le64(reported_zones);
>>>> +
>>>> +	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
>>>> +
>>>> +out_free_report_zones:
>>>> +	kvfree(data.rz);
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>>> +{
>>>> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
>>>> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
>>>> +	enum req_opf op = REQ_OP_LAST;
>>>> +	u16 status = NVME_SC_SUCCESS;
>>>> +	sector_t sect;
>>>> +	int ret;
>>>> +
>>>> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
>>>> +
>>>> +	switch (c->zsa) {
>>>> +	case NVME_ZONE_OPEN:
>>>> +		op = REQ_OP_ZONE_OPEN;
>>>> +		break;
>>>> +	case NVME_ZONE_CLOSE:
>>>> +		op = REQ_OP_ZONE_CLOSE;
>>>> +		break;
>>>> +	case NVME_ZONE_FINISH:
>>>> +		op = REQ_OP_ZONE_FINISH;
>>>> +		break;
>>>> +	case NVME_ZONE_RESET:
>>>> +		if (c->select_all)
>>>> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
>>>> +		op = REQ_OP_ZONE_RESET;
>>>> +		break;
>>>> +	default:
>>>> +		status = NVME_SC_INVALID_FIELD;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
>>>> +	if (ret)
>>>> +		status = NVME_SC_INTERNAL;
>>>> +
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>>> +{
>>>> +	unsigned long bv_cnt = req->sg_cnt;
>>>> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>>>> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
>>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
>>>> +	u16 status = NVME_SC_SUCCESS;
>>>> +	size_t mapped_data_len = 0;
>>>> +	int sg_cnt = req->sg_cnt;
>>>> +	struct scatterlist *sg;
>>>> +	struct iov_iter from;
>>>> +	struct bio_vec *bvec;
>>>> +	size_t mapped_cnt;
>>>> +	struct bio *bio;
>>>> +	int ret;
>>>> +
>>>> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>>>> +		return;
>>>> +
>>>> +	/*
>>>> +	 * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
>>>> +	 * don't have to split the bio, i.e. we shouldn't get
>>>> +	 * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
>>>> +	 * with the size that considers the BIO_MAX_PAGES.
>>>> +	 */
>>> What ? Unclear... You advertise max zone append as ctrl->zasl = min(all ns max
>>> zone append sectors). What does BIO_MAX_PAGES have to do with anything ?
>> That is not true, ctrl->zasl is advertised as
>> min(min(all ns max zone append sectors), bio_max_zasl) to avoid
>> splitting the bio on the target side:-
> We already know that the zasl for each NS, given by the max zone append sectors
> of the backend, is already OK, regardless of BIO_MAX_PAGES (see below).
>
>> See nvmet_zns_update_zasl()
>>
>> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>>
>> Without considering BIO_MAX_PAGES we may have to set the ctrl->zasl value
>> to something greater than bio_max_zasl, and then we'll get requests larger
>> than what one bio can handle, which will lead to splitting the bios, which
>> we want to avoid as per the comment in V1.
>>
>> Can you please explain what is wrong with this approach, which caps the
>> zasl using every namespace's zasl and the bio zasl?
> For starters, with multi-page bvecs now, I am not sure BIO_MAX_PAGES makes any
> sense anymore. Next, on the target side, the max zone append sectors limit for
> each NS guarantees that a single BIO can be used without splitting. Taking
> the min of all of them will NOT remove that guarantee. Hence I think that
> BIO_MAX_PAGES has no place in the middle of all this.
>
Okay, let me remove the bio size check and send v4.
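A rough sketch of what dropping the BIO_MAX_PAGES clamp could look like, purely as an illustration of the agreement above (v4 may well differ):

static void nvmet_zns_update_zasl(struct nvmet_ns *ns)
{
	struct request_queue *q = ns->bdev->bd_disk->queue;
	struct nvmet_ns *ins;
	unsigned long idx;
	u8 min_zasl;

	/*
	 * zasl is simply the minimum of the max zone append limits of all
	 * zoned namespaces in the subsystem; no BIO_MAX_PAGES clamp.
	 */
	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));

	xa_for_each(&ns->subsys->namespaces, idx, ins) {
		struct request_queue *iq = ins->bdev->bd_disk->queue;
		u8 izasl = nvmet_zasl(queue_max_zone_append_sectors(iq));

		if (!bdev_is_zoned(ins->bdev))
			continue;

		min_zasl = min(min_zasl, izasl);
	}

	ns->subsys->id_ctrl_zns.zasl = min_zasl;
}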
>> Maybe there is a better way?
>>>> +	if (!req->sg_cnt)
>>>> +		goto out;
>>>> +
>>>> +	if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
>>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
>>>> +	if (!bvec) {
>>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
>>>> +		nvmet_file_init_bvec(bvec, sg);
>>>> +		mapped_data_len += bvec[mapped_cnt].bv_len;
>>>> +		sg_cnt--;
>>>> +		if (mapped_cnt == bv_cnt)
>>>> +			break;
>>>> +	}
>>>> +
>>>> +	if (WARN_ON(sg_cnt)) {
>>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
>>>> +
>>>> +	bio = bio_alloc(GFP_KERNEL, bv_cnt);
>>>> +	bio_set_dev(bio, nvmet_bdev(req));
>>>> +	bio->bi_iter.bi_sector = sect;
>>>> +	bio->bi_opf = op;
>>> The op variable is used only here. It is not necessary.
>> Yes.
>>>> +
>>>> +	ret =  __bio_iov_append_get_pages(bio, &from);
>>> I still do not see why bio_iov_iter_get_pages() does not work here. Would you
>>> care to explain please ?
>>>
>> The same reason pointed out by Johannes. Instead of calling the wrapper,
>> call the underlying core API, since we don't care about the rest of the
>> generic code. This also avoids an extra function call for the zone-append
>> fast path.
> I am still not convinced that __bio_iov_append_get_pages() will do the right
> thing as it lacks the loop that bio_iov_iter_get_pages() has.
>
>>>> +	if (unlikely(ret)) {
>>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +		bio_io_error(bio);
>>>> +		goto bvec_free;
>>>> +	}
>>>> +
>>>> +	ret = submit_bio_wait(bio);
>>>> +	status = ret < 0 ? NVME_SC_INTERNAL : status;
>>>> +	bio_put(bio);
>>>> +
>>>> +	sect += (mapped_data_len >> 9);
>>>> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
>>>> +
>>>> +bvec_free:
>>>> +	kfree(bvec);
>>>> +
>>>> +out:
>>>> +	nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +#else  /* CONFIG_BLK_DEV_ZONED */
>>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>>>> +{
>>>> +	return 0;
>>>> +}
>>>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>>>> +{
>>>> +	return false;
>>>> +}
>>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>>>> +{
>>>> +}
>>>> +#endif /* CONFIG_BLK_DEV_ZONED */
>>>>
>>
>
diff mbox series

Patch

diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index ebf91fc4c72e..d050f829b43a 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -10,7 +10,7 @@  obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
 obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
-			discovery.o io-cmd-file.o io-cmd-bdev.o
+		   zns.o discovery.o io-cmd-file.o io-cmd-bdev.o
 nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index dca34489a1dc..509fd8dcca0c 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -579,8 +579,8 @@  static void nvmet_execute_identify_nslist(struct nvmet_req *req)
 	nvmet_req_complete(req, status);
 }
 
-static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
-				    void *id, off_t *off)
+u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
+			     void *id, off_t *off)
 {
 	struct nvme_ns_id_desc desc = {
 		.nidt = type,
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 0abbefd9925e..2bd10960fa50 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -89,7 +89,7 @@  int nvmet_file_ns_enable(struct nvmet_ns *ns)
 	return ret;
 }
 
-static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
+void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg)
 {
 	bv->bv_page = sg_page(sg);
 	bv->bv_offset = sg->offset;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 592763732065..eee7866ae512 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -81,6 +81,10 @@  struct nvmet_ns {
 	struct pci_dev		*p2p_dev;
 	int			pi_type;
 	int			metadata_size;
+#ifdef CONFIG_BLK_DEV_ZONED
+	struct nvme_id_ns_zns	id_zns;
+	unsigned int		zasl;
+#endif
 };
 
 static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
@@ -251,6 +255,10 @@  struct nvmet_subsys {
 	unsigned int		admin_timeout;
 	unsigned int		io_timeout;
 #endif /* CONFIG_NVME_TARGET_PASSTHRU */
+
+#ifdef CONFIG_BLK_DEV_ZONED
+	struct nvme_id_ctrl_zns	id_ctrl_zns;
+#endif
 };
 
 static inline struct nvmet_subsys *to_subsys(struct config_item *item)
@@ -603,4 +611,15 @@  static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns)
 	return ns->pi_type && ns->metadata_size == sizeof(struct t10_pi_tuple);
 }
 
+void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req);
+void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req);
+u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off);
+bool nvmet_bdev_zns_enable(struct nvmet_ns *ns);
+void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req);
+void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req);
+void nvmet_bdev_execute_zone_append(struct nvmet_req *req);
+void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log);
+u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
+			     void *id, off_t *off);
+void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg);
 #endif /* _NVMET_H */
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
new file mode 100644
index 000000000000..40dedfd51fd6
--- /dev/null
+++ b/drivers/nvme/target/zns.c
@@ -0,0 +1,463 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe ZNS-ZBD command implementation.
+ * Copyright (c) 2020-2021 HGST, a Western Digital Company.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/uio.h>
+#include <linux/nvme.h>
+#include <linux/xarray.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include "nvmet.h"
+
+#ifdef CONFIG_BLK_DEV_ZONED
+#define NVMET_MPSMIN_SHIFT	12
+
+static u16 nvmet_bdev_zns_checks(struct nvmet_req *req)
+{
+	u16 status = 0;
+
+	if (!bdev_is_zoned(req->ns->bdev)) {
+		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+		goto out;
+	}
+
+	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
+		status = NVME_SC_INVALID_FIELD;
+		goto out;
+	}
+
+	if (req->cmd->zmr.zrasf != NVME_ZRASF_ZONE_REPORT_ALL) {
+		status = NVME_SC_INVALID_FIELD;
+		goto out;
+	}
+
+	if (req->cmd->zmr.pr != NVME_REPORT_ZONE_PARTIAL)
+		status = NVME_SC_INVALID_FIELD;
+out:
+	return status;
+}
+
+static inline struct block_device *nvmet_bdev(struct nvmet_req *req)
+{
+	return req->ns->bdev;
+}
+
+static inline  u64 nvmet_zones_to_desc_size(unsigned int nr_zones)
+{
+	return sizeof(struct nvme_zone_report) +
+		(sizeof(struct nvme_zone_descriptor) * nr_zones);
+}
+
+static inline u64 nvmet_sect_to_lba(struct nvmet_ns *ns, sector_t sect)
+{
+	return sect >> (ns->blksize_shift - SECTOR_SHIFT);
+}
+
+static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba)
+{
+	return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT);
+}
+
+/*
+ *  ZNS related command implementation and helpers.
+ */
+
+u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
+{
+	u16 nvme_cis_zns = NVME_CSI_ZNS;
+
+	if (!bdev_is_zoned(nvmet_bdev(req)))
+		return NVME_SC_SUCCESS;
+
+	return nvmet_copy_ns_identifier(req, NVME_NIDT_CSI, NVME_NIDT_CSI_LEN,
+					&nvme_cis_zns, off);
+}
+
+void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
+{
+	log->iocs[nvme_cmd_zone_append]		= cpu_to_le32(1 << 0);
+	log->iocs[nvme_cmd_zone_mgmt_send]	= cpu_to_le32(1 << 0);
+	log->iocs[nvme_cmd_zone_mgmt_recv]	= cpu_to_le32(1 << 0);
+}
+
+static int nvmet_bdev_validate_zns_zones_cb(struct blk_zone *z,
+					    unsigned int idx, void *data)
+{
+	struct blk_zone *zone = data;
+
+	memcpy(zone, z, sizeof(struct blk_zone));
+
+	return 0;
+}
+
+static inline bool nvmet_bdev_validate_zns_zones(struct nvmet_ns *ns)
+{
+	sector_t last_sect = get_capacity(ns->bdev->bd_disk) - 1;
+	struct blk_zone last_zone, first_zone;
+	int reported_zones;
+
+	reported_zones = blkdev_report_zones(ns->bdev, 0, 1,
+					     nvmet_bdev_validate_zns_zones_cb,
+					     &first_zone);
+	if (reported_zones != 1)
+		return false;
+
+	reported_zones = blkdev_report_zones(ns->bdev, last_sect, 1,
+					     nvmet_bdev_validate_zns_zones_cb,
+					     &last_zone);
+	if (reported_zones != 1)
+		return false;
+
+	return first_zone.capacity == last_zone.capacity ? true : false;
+}
+
+static inline u8 nvmet_zasl(unsigned int zone_append_sects)
+{
+	unsigned int npages = (zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT;
+	u8 zasl = ilog2(npages);
+
+	/*
+	 * Zone Append Size Limit is expressed in units of the minimum memory
+	 * page size (i.e. 2^12 bytes) and is reported as a power of 2.
+	 */
+	return zasl;
+}
+
+static inline void nvmet_zns_update_zasl(struct nvmet_ns *ns)
+{
+	u8 bio_max_zasl = nvmet_zasl((BIO_MAX_PAGES * PAGE_SIZE) >> 9);
+	struct request_queue *q = ns->bdev->bd_disk->queue;
+	struct nvmet_ns *ins;
+	unsigned long idx;
+	u8 min_zasl;
+
+	/*
+	 * Calculate the new ctrl->zasl value when enabling a new ns. This
+	 * value has to be the minimum of the max zone append values from the
+	 * available namespaces.
+	 */
+	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
+
+	xa_for_each(&(ns->subsys->namespaces), idx, ins) {
+		struct request_queue *iq = ins->bdev->bd_disk->queue;
+		unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
+		u8 izasl = nvmet_zasl(imax_za_sects);
+
+		if (!bdev_is_zoned(ins->bdev))
+			continue;
+
+		min_zasl = min_zasl > izasl ? izasl : min_zasl;
+	}
+
+	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
+}
+
+bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
+{
+	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
+		pr_err("block devices with conventional zones are not supported.");
+		return false;
+	}
+
+	if (!nvmet_bdev_validate_zns_zones(ns))
+		return false;
+
+	/*
+	 * For ZBC and ZAC devices, writes into sequential zones must be aligned
+	 * to the device physical block size. So use this value as the logical
+	 * block size to avoid errors.
+	 */
+	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
+
+	nvmet_zns_update_zasl(ns);
+
+	return true;
+}
+
+/*
+ * ZNS related Admin and I/O command handlers.
+ */
+void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvme_id_ctrl_zns *id;
+	u16 status = 0;
+	u8 mdts;
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	/*
+	 * Even though this function sets Zone Append Size Limit to 0,
+	 * the 0 value here indicates that the maximum data transfer size for
+	 * the Zone Append command is indicated by the ctrl
+	 * Maximum Data Transfer Size (MDTS).
+	 */
+
+	mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
+
+	id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
+
+	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
+
+	kfree(id);
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
+{
+	struct nvme_id_ns_zns *id_zns;
+	u16 status = 0;
+	u64 zsze;
+
+	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
+		req->error_loc = offsetof(struct nvme_identify, nsid);
+		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+		goto out;
+	}
+
+	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
+	if (!id_zns) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
+	if (!req->ns) {
+		status = NVME_SC_INTERNAL;
+		goto done;
+	}
+
+	if (!bdev_is_zoned(nvmet_bdev(req))) {
+		req->error_loc = offsetof(struct nvme_identify, nsid);
+		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+		goto done;
+	}
+
+	nvmet_ns_revalidate(req->ns);
+	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
+					req->ns->blksize_shift;
+	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
+	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
+	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
+
+done:
+	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
+	kfree(id_zns);
+out:
+	nvmet_req_complete(req, status);
+}
+
+struct nvmet_report_zone_data {
+	struct nvmet_ns *ns;
+	struct nvme_zone_report *rz;
+};
+
+static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
+				     void *data)
+{
+	struct nvmet_report_zone_data *report_zone_data = data;
+	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
+	struct nvmet_ns *ns = report_zone_data->ns;
+
+	entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
+	entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
+	entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
+	entries[idx].za = z->reset ? 1 << 2 : 0;
+	entries[idx].zt = z->type;
+	entries[idx].zs = z->cond << 4;
+
+	return 0;
+}
+
+void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
+{
+	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
+	struct nvmet_report_zone_data data = { .ns = req->ns };
+	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
+	sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
+	unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);
+	int reported_zones;
+	u16 status;
+
+	status = nvmet_bdev_zns_checks(req);
+	if (status)
+		goto out;
+
+	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
+	if (!data.rz) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
+					     nvmet_bdev_report_zone_cb,
+					     &data);
+	if (reported_zones < 0) {
+		status = NVME_SC_INTERNAL;
+		goto out_free_report_zones;
+	}
+
+	data.rz->nr_zones = cpu_to_le64(reported_zones);
+
+	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
+
+out_free_report_zones:
+	kvfree(data.rz);
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
+{
+	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
+	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
+	enum req_opf op = REQ_OP_LAST;
+	u16 status = NVME_SC_SUCCESS;
+	sector_t sect;
+	int ret;
+
+	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
+
+	switch (c->zsa) {
+	case NVME_ZONE_OPEN:
+		op = REQ_OP_ZONE_OPEN;
+		break;
+	case NVME_ZONE_CLOSE:
+		op = REQ_OP_ZONE_CLOSE;
+		break;
+	case NVME_ZONE_FINISH:
+		op = REQ_OP_ZONE_FINISH;
+		break;
+	case NVME_ZONE_RESET:
+		if (c->select_all)
+			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
+		op = REQ_OP_ZONE_RESET;
+		break;
+	default:
+		status = NVME_SC_INVALID_FIELD;
+		goto out;
+	}
+
+	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
+	if (ret)
+		status = NVME_SC_INTERNAL;
+
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
+{
+	unsigned long bv_cnt = req->sg_cnt;
+	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
+	u64 slba = le64_to_cpu(req->cmd->rw.slba);
+	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
+	u16 status = NVME_SC_SUCCESS;
+	size_t mapped_data_len = 0;
+	int sg_cnt = req->sg_cnt;
+	struct scatterlist *sg;
+	struct iov_iter from;
+	struct bio_vec *bvec;
+	size_t mapped_cnt;
+	struct bio *bio;
+	int ret;
+
+	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
+		return;
+
+	/*
+	 * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
+	 * don't have to split the bio, i.e. we shouldn't get
+	 * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
+	 * with the size that considers the BIO_MAX_PAGES.
+	 */
+	if (!req->sg_cnt)
+		goto out;
+
+	if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		goto out;
+	}
+
+	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
+	if (!bvec) {
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		goto out;
+	}
+
+	for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
+		nvmet_file_init_bvec(bvec, sg);
+		mapped_data_len += bvec[mapped_cnt].bv_len;
+		sg_cnt--;
+		if (mapped_cnt == bv_cnt)
+			break;
+	}
+
+	if (WARN_ON(sg_cnt)) {
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		goto out;
+	}
+
+	iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
+
+	bio = bio_alloc(GFP_KERNEL, bv_cnt);
+	bio_set_dev(bio, nvmet_bdev(req));
+	bio->bi_iter.bi_sector = sect;
+	bio->bi_opf = op;
+
+	ret =  __bio_iov_append_get_pages(bio, &from);
+	if (unlikely(ret)) {
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		bio_io_error(bio);
+		goto bvec_free;
+	}
+
+	ret = submit_bio_wait(bio);
+	status = ret < 0 ? NVME_SC_INTERNAL : status;
+	bio_put(bio);
+
+	sect += (mapped_data_len >> 9);
+	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
+
+bvec_free:
+	kfree(bvec);
+
+out:
+	nvmet_req_complete(req, status);
+}
+
+#else  /* CONFIG_BLK_DEV_ZONED */
+void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
+{
+}
+void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
+{
+}
+u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
+{
+	return 0;
+}
+bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
+{
+	return false;
+}
+void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
+{
+}
+void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
+{
+}
+void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
+{
+}
+void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
+{
+}
+#endif /* CONFIG_BLK_DEV_ZONED */