diff mbox series

[PATCHv3,3/5] nvme: implement I/O Command Sets Command Set support

Message ID 20200622162530.1287650-4-kbusch@kernel.org (mailing list archive)
State New, archived
Headers show
Series nvme support for zoned namespace command set | expand

Commit Message

Keith Busch June 22, 2020, 4:25 p.m. UTC
From: Niklas Cassel <niklas.cassel@wdc.com>

Implements support for the I/O Command Sets command set. The command set
introduces a method to enumerate multiple command sets per namespace. If
the command set is exposed, this method for enumeration will be used
instead of the traditional method that uses the CC.CSS register field for
command set identification.

For namespaces where the Command Set Identifier is not supported or
recognized, the specific namespace will not be created.

Reviewed-by: Javier González <javier.gonz@samsung.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
---
 drivers/nvme/host/core.c | 48 +++++++++++++++++++++++++++++++++-------
 drivers/nvme/host/nvme.h |  1 +
 include/linux/nvme.h     | 19 ++++++++++++++--
 3 files changed, 58 insertions(+), 10 deletions(-)

Comments

Hannes Reinecke June 23, 2020, 6:20 a.m. UTC | #1
On 6/22/20 6:25 PM, Keith Busch wrote:
> From: Niklas Cassel <niklas.cassel@wdc.com>
> 
> Implements support for the I/O Command Sets command set. The command set
> introduces a method to enumerate multiple command sets per namespace. If
> the command set is exposed, this method for enumeration will be used
> instead of the traditional method that uses the CC.CSS register command
> set register for command set identification.
> 
> For namespaces where the Command Set Identifier is not supported or
> recognized, the specific namespace will not be created.
> 
> Reviewed-by: Javier González <javier.gonz@samsung.com>
> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
> Reviewed-by: Daniel Wagner <dwagner@suse.de>
> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> ---
>   drivers/nvme/host/core.c | 48 +++++++++++++++++++++++++++++++++-------
>   drivers/nvme/host/nvme.h |  1 +
>   include/linux/nvme.h     | 19 ++++++++++++++--
>   3 files changed, 58 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 9491dbcfe81a..45a3cb5a35bd 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1056,8 +1056,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
>   	return error;
>   }
>   
> +static bool nvme_multi_css(struct nvme_ctrl *ctrl)
> +{
> +	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
> +}
> +
>   static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
> -		struct nvme_ns_id_desc *cur)
> +		struct nvme_ns_id_desc *cur, bool *csi_seen)
>   {
>   	const char *warn_str = "ctrl returned bogus length:";
>   	void *data = cur;
> @@ -1087,6 +1092,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
>   		}
>   		uuid_copy(&ids->uuid, data + sizeof(*cur));
>   		return NVME_NIDT_UUID_LEN;
> +	case NVME_NIDT_CSI:
> +		if (cur->nidl != NVME_NIDT_CSI_LEN) {
> +			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
> +				 warn_str, cur->nidl);
> +			return -1;
> +		}
> +		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
> +		*csi_seen = true;
> +		return NVME_NIDT_CSI_LEN;
>   	default:
>   		/* Skip unknown types */
>   		return cur->nidl;
> @@ -1097,10 +1111,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
>   		struct nvme_ns_ids *ids)
>   {
>   	struct nvme_command c = { };
> -	int status;
> +	bool csi_seen = false;
> +	int status, pos, len;
>   	void *data;
> -	int pos;
> -	int len;
>   
>   	c.identify.opcode = nvme_admin_identify;
>   	c.identify.nsid = cpu_to_le32(nsid);
> @@ -1130,13 +1143,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
>   		if (cur->nidl == 0)
>   			break;
>   
> -		len = nvme_process_ns_desc(ctrl, ids, cur);
> +		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
>   		if (len < 0)
>   			goto free_data;
>   
>   		len += sizeof(*cur);
>   	}
>   free_data:
> +	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
> +		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
> +			 nsid);
> +		status = -EINVAL;
> +	}
> +
>   	kfree(data);
>   	return status;
>   }
> @@ -1792,7 +1811,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
>   		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
>   	if (ctrl->vs >= NVME_VS(1, 2, 0))
>   		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
> -	if (ctrl->vs >= NVME_VS(1, 3, 0))
> +	if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
>   		return nvme_identify_ns_descs(ctrl, nsid, ids);
>   	return 0;
>   }

Hmm? Are command sets even defined for something earlier than 1.3?

> @@ -1808,7 +1827,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
>   {
>   	return uuid_equal(&a->uuid, &b->uuid) &&
>   		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
> -		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
> +		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
> +		a->csi == b->csi;
>   }
>   
>   static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> @@ -1930,6 +1950,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
>   	if (ns->lba_shift == 0)
>   		ns->lba_shift = 9;
>   
> +	switch (ns->head->ids.csi) {
> +	case NVME_CSI_NVM:
> +		break;
> +	default:
> +		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
> +			ns->head->ids.csi, ns->head->ns_id);
> +		return -ENODEV;
> +	}
> +
>   	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
>   	    is_power_of_2(ctrl->max_hw_sectors))
>   		iob = ctrl->max_hw_sectors;
> @@ -2264,7 +2293,10 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
>   
>   	ctrl->page_size = 1 << page_shift;
>   
> -	ctrl->ctrl_config = NVME_CC_CSS_NVM;
> +	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
> +		ctrl->ctrl_config = NVME_CC_CSS_CSI;
> +	else
> +		ctrl->ctrl_config = NVME_CC_CSS_NVM;
>   	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
>   	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
>   	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index c0f4226d3299..a84f71459caa 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -339,6 +339,7 @@ struct nvme_ns_ids {
>   	u8	eui64[8];
>   	u8	nguid[16];
>   	uuid_t	uuid;
> +	u8	csi;
>   };
>   
>   /*
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 5ce51ab4c50e..81ffe5247505 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -132,6 +132,7 @@ enum {
>   #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
>   #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
>   #define NVME_CAP_NSSRC(cap)	(((cap) >> 36) & 0x1)
> +#define NVME_CAP_CSS(cap)	(((cap) >> 37) & 0xff)
>   #define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)
>   #define NVME_CAP_MPSMAX(cap)	(((cap) >> 52) & 0xf)
>   

Indentation?

> @@ -162,7 +163,6 @@ enum {
>   
>   enum {
>   	NVME_CC_ENABLE		= 1 << 0,
> -	NVME_CC_CSS_NVM		= 0 << 4,
>   	NVME_CC_EN_SHIFT	= 0,
>   	NVME_CC_CSS_SHIFT	= 4,
>   	NVME_CC_MPS_SHIFT	= 7,
> @@ -170,6 +170,9 @@ enum {
>   	NVME_CC_SHN_SHIFT	= 14,
>   	NVME_CC_IOSQES_SHIFT	= 16,
>   	NVME_CC_IOCQES_SHIFT	= 20,
> +	NVME_CC_CSS_NVM		= 0 << NVME_CC_CSS_SHIFT,
> +	NVME_CC_CSS_CSI		= 6 << NVME_CC_CSS_SHIFT,
> +	NVME_CC_CSS_MASK	= 7 << NVME_CC_CSS_SHIFT,
>   	NVME_CC_AMS_RR		= 0 << NVME_CC_AMS_SHIFT,
>   	NVME_CC_AMS_WRRU	= 1 << NVME_CC_AMS_SHIFT,
>   	NVME_CC_AMS_VS		= 7 << NVME_CC_AMS_SHIFT,
> @@ -179,6 +182,8 @@ enum {
>   	NVME_CC_SHN_MASK	= 3 << NVME_CC_SHN_SHIFT,
>   	NVME_CC_IOSQES		= NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT,
>   	NVME_CC_IOCQES		= NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT,
> +	NVME_CAP_CSS_NVM	= 1 << 0,
> +	NVME_CAP_CSS_CSI	= 1 << 6,
>   	NVME_CSTS_RDY		= 1 << 0,
>   	NVME_CSTS_CFS		= 1 << 1,
>   	NVME_CSTS_NSSRO		= 1 << 4,
> @@ -374,6 +379,8 @@ enum {
>   	NVME_ID_CNS_CTRL		= 0x01,
>   	NVME_ID_CNS_NS_ACTIVE_LIST	= 0x02,
>   	NVME_ID_CNS_NS_DESC_LIST	= 0x03,
> +	NVME_ID_CNS_CS_NS		= 0x05,
> +	NVME_ID_CNS_CS_CTRL		= 0x06,
>   	NVME_ID_CNS_NS_PRESENT_LIST	= 0x10,
>   	NVME_ID_CNS_NS_PRESENT		= 0x11,
>   	NVME_ID_CNS_CTRL_NS_LIST	= 0x12,
> @@ -383,6 +390,10 @@ enum {
>   	NVME_ID_CNS_UUID_LIST		= 0x17,
>   };
>   
> +enum {
> +	NVME_CSI_NVM			= 0,
> +};
> +
>   enum {
>   	NVME_DIR_IDENTIFY		= 0x00,
>   	NVME_DIR_STREAMS		= 0x01,
> @@ -435,11 +446,13 @@ struct nvme_ns_id_desc {
>   #define NVME_NIDT_EUI64_LEN	8
>   #define NVME_NIDT_NGUID_LEN	16
>   #define NVME_NIDT_UUID_LEN	16
> +#define NVME_NIDT_CSI_LEN	1
>   
>   enum {
>   	NVME_NIDT_EUI64		= 0x01,
>   	NVME_NIDT_NGUID		= 0x02,
>   	NVME_NIDT_UUID		= 0x03,
> +	NVME_NIDT_CSI		= 0x04,
>   };
>   
>   struct nvme_smart_log {
> @@ -972,7 +985,9 @@ struct nvme_identify {
>   	__u8			cns;
>   	__u8			rsvd3;
>   	__le16			ctrlid;
> -	__u32			rsvd11[5];
> +	__u8			rsvd11[3];
> +	__u8			csi;
> +	__u32			rsvd12[4];
>   };
>   
>   #define NVME_IDENTIFY_DATA_SIZE 4096
> 
Otherwise looks okay.

Cheers,

Hannes
Sagi Grimberg June 23, 2020, 8:53 a.m. UTC | #2
On 6/22/20 9:25 AM, Keith Busch wrote:
> From: Niklas Cassel <niklas.cassel@wdc.com>
> 
> Implements support for the I/O Command Sets command set. The command set
> introduces a method to enumerate multiple command sets per namespace. If
> the command set is exposed, this method for enumeration will be used
> instead of the traditional method that uses the CC.CSS register command
> set register for command set identification.
> 
> For namespaces where the Command Set Identifier is not supported or
> recognized, the specific namespace will not be created.
> 
> Reviewed-by: Javier González <javier.gonz@samsung.com>
> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
> Reviewed-by: Daniel Wagner <dwagner@suse.de>
> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> ---
>   drivers/nvme/host/core.c | 48 +++++++++++++++++++++++++++++++++-------
>   drivers/nvme/host/nvme.h |  1 +
>   include/linux/nvme.h     | 19 ++++++++++++++--
>   3 files changed, 58 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 9491dbcfe81a..45a3cb5a35bd 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1056,8 +1056,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
>   	return error;
>   }
>   
> +static bool nvme_multi_css(struct nvme_ctrl *ctrl)
> +{
> +	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
> +}
> +
>   static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
> -		struct nvme_ns_id_desc *cur)
> +		struct nvme_ns_id_desc *cur, bool *csi_seen)
>   {
>   	const char *warn_str = "ctrl returned bogus length:";
>   	void *data = cur;
> @@ -1087,6 +1092,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
>   		}
>   		uuid_copy(&ids->uuid, data + sizeof(*cur));
>   		return NVME_NIDT_UUID_LEN;
> +	case NVME_NIDT_CSI:
> +		if (cur->nidl != NVME_NIDT_CSI_LEN) {
> +			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
> +				 warn_str, cur->nidl);
> +			return -1;
> +		}
> +		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
> +		*csi_seen = true;
> +		return NVME_NIDT_CSI_LEN;
>   	default:
>   		/* Skip unknown types */
>   		return cur->nidl;
> @@ -1097,10 +1111,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
>   		struct nvme_ns_ids *ids)
>   {
>   	struct nvme_command c = { };
> -	int status;
> +	bool csi_seen = false;
> +	int status, pos, len;
>   	void *data;
> -	int pos;
> -	int len;
>   
>   	c.identify.opcode = nvme_admin_identify;
>   	c.identify.nsid = cpu_to_le32(nsid);
> @@ -1130,13 +1143,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
>   		if (cur->nidl == 0)
>   			break;
>   
> -		len = nvme_process_ns_desc(ctrl, ids, cur);
> +		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
>   		if (len < 0)
>   			goto free_data;
>   
>   		len += sizeof(*cur);
>   	}
>   free_data:
> +	if (!status && nvme_multi_css(ctrl) && !csi_seen) {

We will clear the status if we detect a path error, that is to
avoid needlessly removing the ns for path failures, so you should
check at the goto site.

> +		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
> +			 nsid);
> +		status = -EINVAL;
> +	}
> +
>   	kfree(data);
>   	return status;
>   }
> @@ -1792,7 +1811,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
>   		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
>   	if (ctrl->vs >= NVME_VS(1, 2, 0))
>   		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
> -	if (ctrl->vs >= NVME_VS(1, 3, 0))
> +	if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
>   		return nvme_identify_ns_descs(ctrl, nsid, ids);
>   	return 0;
>   }
> @@ -1808,7 +1827,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
>   {
>   	return uuid_equal(&a->uuid, &b->uuid) &&
>   		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
> -		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
> +		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
> +		a->csi == b->csi;
>   }
>   
>   static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> @@ -1930,6 +1950,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
>   	if (ns->lba_shift == 0)
>   		ns->lba_shift = 9;
>   
> +	switch (ns->head->ids.csi) {
> +	case NVME_CSI_NVM:
> +		break;
> +	default:
> +		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
> +			ns->head->ids.csi, ns->head->ns_id);
> +		return -ENODEV;
> +	}

Not sure we need a switch-case statement for a single case target...
Niklas Cassel June 23, 2020, 9:20 a.m. UTC | #3
On Tue, Jun 23, 2020 at 08:20:23AM +0200, Hannes Reinecke wrote:
> On 6/22/20 6:25 PM, Keith Busch wrote:
> > From: Niklas Cassel <niklas.cassel@wdc.com>
> > 
> > Implements support for the I/O Command Sets command set. The command set
> > introduces a method to enumerate multiple command sets per namespace. If
> > the command set is exposed, this method for enumeration will be used
> > instead of the traditional method that uses the CC.CSS register command
> > set register for command set identification.
> > 
> > For namespaces where the Command Set Identifier is not supported or
> > recognized, the specific namespace will not be created.
> > 
> > Reviewed-by: Javier González <javier.gonz@samsung.com>
> > Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
> > Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> > Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
> > Reviewed-by: Daniel Wagner <dwagner@suse.de>
> > Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> > ---
> > @@ -1792,7 +1811,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
> >   		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
> >   	if (ctrl->vs >= NVME_VS(1, 2, 0))
> >   		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
> > -	if (ctrl->vs >= NVME_VS(1, 3, 0))
> > +	if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
> >   		return nvme_identify_ns_descs(ctrl, nsid, ids);
> >   	return 0;
> >   }
> 
> Hmm? Are command sets even defined for something earlier than 1.3?

According to Keith, usually new features are not really tied to a
specific NVMe version.

So if someone implements/enables multiple command sets feature on
an older base spec of NVMe, we still want to support/allow that.


Kind regards,
Niklas
Niklas Cassel June 23, 2020, 11:25 a.m. UTC | #4
On Tue, Jun 23, 2020 at 01:53:47AM -0700, Sagi Grimberg wrote:
> 
> 
> On 6/22/20 9:25 AM, Keith Busch wrote:
> > From: Niklas Cassel <niklas.cassel@wdc.com>
> > 
> > Implements support for the I/O Command Sets command set. The command set
> > introduces a method to enumerate multiple command sets per namespace. If
> > the command set is exposed, this method for enumeration will be used
> > instead of the traditional method that uses the CC.CSS register command
> > set register for command set identification.
> > 
> > For namespaces where the Command Set Identifier is not supported or
> > recognized, the specific namespace will not be created.
> > 
> > Reviewed-by: Javier González <javier.gonz@samsung.com>
> > Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
> > Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> > Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
> > Reviewed-by: Daniel Wagner <dwagner@suse.de>
> > Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> > ---
> >   drivers/nvme/host/core.c | 48 +++++++++++++++++++++++++++++++++-------
> >   drivers/nvme/host/nvme.h |  1 +
> >   include/linux/nvme.h     | 19 ++++++++++++++--
> >   3 files changed, 58 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 9491dbcfe81a..45a3cb5a35bd 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -1056,8 +1056,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
> >   	return error;
> >   }
> > +static bool nvme_multi_css(struct nvme_ctrl *ctrl)
> > +{
> > +	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
> > +}
> > +
> >   static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
> > -		struct nvme_ns_id_desc *cur)
> > +		struct nvme_ns_id_desc *cur, bool *csi_seen)
> >   {
> >   	const char *warn_str = "ctrl returned bogus length:";
> >   	void *data = cur;
> > @@ -1087,6 +1092,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
> >   		}
> >   		uuid_copy(&ids->uuid, data + sizeof(*cur));
> >   		return NVME_NIDT_UUID_LEN;
> > +	case NVME_NIDT_CSI:
> > +		if (cur->nidl != NVME_NIDT_CSI_LEN) {
> > +			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
> > +				 warn_str, cur->nidl);
> > +			return -1;
> > +		}
> > +		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
> > +		*csi_seen = true;
> > +		return NVME_NIDT_CSI_LEN;
> >   	default:
> >   		/* Skip unknown types */
> >   		return cur->nidl;
> > @@ -1097,10 +1111,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
> >   		struct nvme_ns_ids *ids)
> >   {
> >   	struct nvme_command c = { };
> > -	int status;
> > +	bool csi_seen = false;
> > +	int status, pos, len;
> >   	void *data;
> > -	int pos;
> > -	int len;
> >   	c.identify.opcode = nvme_admin_identify;
> >   	c.identify.nsid = cpu_to_le32(nsid);
> > @@ -1130,13 +1143,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
> >   		if (cur->nidl == 0)
> >   			break;
> > -		len = nvme_process_ns_desc(ctrl, ids, cur);
> > +		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
> >   		if (len < 0)
> >   			goto free_data;
> >   		len += sizeof(*cur);
> >   	}
> >   free_data:
> > +	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
> 
> We will clear the status if we detect a path error, that is to
> avoid needlessly removing the ns for path failures, so you should
> check at the goto site.

The problem is that this check has to be done after checking all the ns descs,
so it has to be done as the final thing, at least after processing all the
ns descs, no matter whether nvme_process_ns_desc() returned an error or
NVME_NIDT_CSI simply wasn't part of the ns desc list, so that the loop reached
the end without error.

Even if the nvme command failed and the status was cleared:

                if (status > 0 && !(status & NVME_SC_DNR))
                        status = 0;

we still need to return an error, if (nvme_multi_css(ctrl) && !csi_seen).
(Not reporting a CSI when nvme_multi_css() is enabled, is fatal.)

That is why the code looks like it does.

I guess we could do something like this, which does the same thing,
but perhaps is a bit clearer:

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e95f0c498a6b..bef687b9a277 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1160,8 +1160,10 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
                  * Don't treat an error as fatal, as we potentially already
                  * have a NGUID or EUI-64.
                  */
-               if (status > 0 && !(status & NVME_SC_DNR))
+               if (status > 0 && !(status & NVME_SC_DNR)) {
                        status = 0;
+                       goto csi_check;
+               }
                goto free_data;
        }

@@ -1173,17 +1175,17 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,

                len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
                if (len < 0)
-                       goto free_data;
+                       goto csi_check;

                len += sizeof(*cur);
        }
-free_data:
-       if (!status && nvme_multi_css(ctrl) && !csi_seen) {
+csi_check:
+       if (nvme_multi_css(ctrl) && !csi_seen) {
                dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
                         nsid);
                status = -EINVAL;
        }
-
+free_data:
        kfree(data);
        return status;
 }


> 
> > +		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
> > +			 nsid);
> > +		status = -EINVAL;
> > +	}
> > +
> >   	kfree(data);
> >   	return status;
> >   }
> > @@ -1792,7 +1811,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
> >   		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
> >   	if (ctrl->vs >= NVME_VS(1, 2, 0))
> >   		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
> > -	if (ctrl->vs >= NVME_VS(1, 3, 0))
> > +	if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
> >   		return nvme_identify_ns_descs(ctrl, nsid, ids);
> >   	return 0;
> >   }
> > @@ -1808,7 +1827,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
> >   {
> >   	return uuid_equal(&a->uuid, &b->uuid) &&
> >   		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
> > -		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
> > +		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
> > +		a->csi == b->csi;
> >   }
> >   static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> > @@ -1930,6 +1950,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> >   	if (ns->lba_shift == 0)
> >   		ns->lba_shift = 9;
> > +	switch (ns->head->ids.csi) {
> > +	case NVME_CSI_NVM:
> > +		break;
> > +	default:
> > +		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
> > +			ns->head->ids.csi, ns->head->ns_id);
> > +		return -ENODEV;
> > +	}
> 
> Not sure we need a switch-case statement for a single case target...

I would consider it two cases. A supported CSI or a non-supported CSI
(which means any CSI value != NVME_CSI_NVM).

However, a follow up patch (patch 5/5 in this series) adds another case
to this switch-case statement (NVME_CSI_ZNS).

I guess this patch could have used an if-else statement, and patch 5/5
could have replaced the if-statement with a switch-case.
However, since a patch in the same series actually adds another case,
I think that it is more clear this way.
(A switch-case with only two cases added, in a patch that is not the last
one in the series, suggests (at least to me), that it will most likely be
extended in a following patch.)


Kind regards,
Niklas
Keith Busch June 23, 2020, 2:25 p.m. UTC | #5
On Tue, Jun 23, 2020 at 09:20:35AM +0000, Niklas Cassel wrote:
> On Tue, Jun 23, 2020 at 08:20:23AM +0200, Hannes Reinecke wrote:
> > On 6/22/20 6:25 PM, Keith Busch wrote:
> > > From: Niklas Cassel <niklas.cassel@wdc.com>
> > > -	if (ctrl->vs >= NVME_VS(1, 3, 0))
> > > +	if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
> > >   		return nvme_identify_ns_descs(ctrl, nsid, ids);
> > >   	return 0;
> > >   }
> > 
> > Hmm? Are command sets even defined for something earlier than 1.3?
> 
> According to Keith, usually new features are not really tied to a
> specific NVMe version.

Not really according to me; nvme just allows controllers to implement
features ratified after the initial version release.
 
> So if someone implements/enables multiple command sets feature on
> an older base spec of NVMe, we still want to support/allow that.

Right. We don't check the version to see if multi-command sets are
supported when we initially enable it, so we shouldn't need to check the
version later.
Keith Busch June 23, 2020, 2:59 p.m. UTC | #6
On Tue, Jun 23, 2020 at 11:25:53AM +0000, Niklas Cassel wrote:
> On Tue, Jun 23, 2020 at 01:53:47AM -0700, Sagi Grimberg wrote:
> > >   static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> > > @@ -1930,6 +1950,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> > >   	if (ns->lba_shift == 0)
> > >   		ns->lba_shift = 9;
> > > +	switch (ns->head->ids.csi) {
> > > +	case NVME_CSI_NVM:
> > > +		break;
> > > +	default:
> > > +		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
> > > +			ns->head->ids.csi, ns->head->ns_id);
> > > +		return -ENODEV;
> > > +	}
> > 
> > Not sure we need a switch-case statement for a single case target...
> 
> I would consider it two cases. A supported CSI or a non-supported CSI
> (which means any CSI value != NVME_CSI_NVM).
> 
> However, a follow up patch (patch 5/5 in this series) adds another case
> to this switch-case statement (NVME_CSI_ZNS).
> 
> I guess this patch could have used an if-else statement, and patch 5/5
> replaced the if-statement with a switch-case.
> However, since a patch in the same series actually adds another case,
> I think that it is more clear this way.
> (A switch-case with only two cases added, in a patch that is not the last
> one in the series, suggests (at least to me), that it will most likely be
> extended in a following patch.)

Yeah, this patch is laying the foundation for future command sets.
Keith Busch June 23, 2020, 10:10 p.m. UTC | #7
On Tue, Jun 23, 2020 at 11:25:53AM +0000, Niklas Cassel wrote:
> On Tue, Jun 23, 2020 at 01:53:47AM -0700, Sagi Grimberg wrote:
> > On 6/22/20 9:25 AM, Keith Busch wrote:
> > > -		len = nvme_process_ns_desc(ctrl, ids, cur);
> > > +		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
> > >   		if (len < 0)
> > >   			goto free_data;
> > >   		len += sizeof(*cur);
> > >   	}
> > >   free_data:
> > > +	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
> > 
> > We will clear the status if we detect a path error, that is to
> > avoid needlessly removing the ns for path failures, so you should
> > check at the goto site.
> 
> The problem is that this check has to be done after checking all the ns descs,
> so this check to be done as the final thing, at least after processing all the
> ns descs. No matter if nvme_process_ns_desc() returned an error, or if
> simply NVME_NIDT_CSI wasn't part of the ns desc list, so the loop reached the
> end without error.
> 
> Even if the nvme command failed and the status was cleared:
> 
>                 if (status > 0 && !(status & NVME_SC_DNR))
>                         status = 0;

This check is so weird. What has DNR got to do with whether or not we
want to continue with this namespace? The commit that adds this says
it's to check for a host failed IO, but a controller can just as validly
set DNR in its error status, in which case we'd still want to clear the
status.
Sagi Grimberg June 23, 2020, 11:17 p.m. UTC | #8
>>>> -		len = nvme_process_ns_desc(ctrl, ids, cur);
>>>> +		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
>>>>    		if (len < 0)
>>>>    			goto free_data;
>>>>    		len += sizeof(*cur);
>>>>    	}
>>>>    free_data:
>>>> +	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
>>>
>>> We will clear the status if we detect a path error, that is to
>>> avoid needlessly removing the ns for path failures, so you should
>>> check at the goto site.
>>
>> The problem is that this check has to be done after checking all the ns descs,
>> so this check to be done as the final thing, at least after processing all the
>> ns descs. No matter if nvme_process_ns_desc() returned an error, or if
>> simply NVME_NIDT_CSI wasn't part of the ns desc list, so the loop reached the
>> end without error.
>>
>> Even if the nvme command failed and the status was cleared:
>>
>>                  if (status > 0 && !(status & NVME_SC_DNR))
>>                          status = 0;
> 
> This check is so weird. What has DNR got to do with whether or not we
> want to continue with this namespace? The commit that adds this says
> it's to check for a host failed IO, but a controller can just as validly
> set DNR in its error status, in which case we'd still want clear the
> status.

The reason is that if this error is not a DNR error, it means that we
should retry and succeed, so we don't want to remove the namespace.

The problem that this solves is the fact that we have various error
recovery conditions (interconnect issues, failures, resets) and the
async works are designed to continue to run, issue I/O and fail. The
scan work will revalidate namespaces and remove them if it fails to
do so, which is inherently wrong if these are simply inaccessible by
the host at this time.
Sagi Grimberg June 23, 2020, 11:20 p.m. UTC | #9
On 6/23/20 4:25 AM, Niklas Cassel wrote:
> On Tue, Jun 23, 2020 at 01:53:47AM -0700, Sagi Grimberg wrote:
>>
>>
>> On 6/22/20 9:25 AM, Keith Busch wrote:
>>> From: Niklas Cassel <niklas.cassel@wdc.com>
>>>
>>> Implements support for the I/O Command Sets command set. The command set
>>> introduces a method to enumerate multiple command sets per namespace. If
>>> the command set is exposed, this method for enumeration will be used
>>> instead of the traditional method that uses the CC.CSS register command
>>> set register for command set identification.
>>>
>>> For namespaces where the Command Set Identifier is not supported or
>>> recognized, the specific namespace will not be created.
>>>
>>> Reviewed-by: Javier González <javier.gonz@samsung.com>
>>> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
>>> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>> Reviewed-by: Matias Bjørling <matias.bjorling@wdc.com>
>>> Reviewed-by: Daniel Wagner <dwagner@suse.de>
>>> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
>>> ---
>>>    drivers/nvme/host/core.c | 48 +++++++++++++++++++++++++++++++++-------
>>>    drivers/nvme/host/nvme.h |  1 +
>>>    include/linux/nvme.h     | 19 ++++++++++++++--
>>>    3 files changed, 58 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index 9491dbcfe81a..45a3cb5a35bd 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -1056,8 +1056,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
>>>    	return error;
>>>    }
>>> +static bool nvme_multi_css(struct nvme_ctrl *ctrl)
>>> +{
>>> +	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
>>> +}
>>> +
>>>    static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
>>> -		struct nvme_ns_id_desc *cur)
>>> +		struct nvme_ns_id_desc *cur, bool *csi_seen)
>>>    {
>>>    	const char *warn_str = "ctrl returned bogus length:";
>>>    	void *data = cur;
>>> @@ -1087,6 +1092,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
>>>    		}
>>>    		uuid_copy(&ids->uuid, data + sizeof(*cur));
>>>    		return NVME_NIDT_UUID_LEN;
>>> +	case NVME_NIDT_CSI:
>>> +		if (cur->nidl != NVME_NIDT_CSI_LEN) {
>>> +			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
>>> +				 warn_str, cur->nidl);
>>> +			return -1;
>>> +		}
>>> +		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
>>> +		*csi_seen = true;
>>> +		return NVME_NIDT_CSI_LEN;
>>>    	default:
>>>    		/* Skip unknown types */
>>>    		return cur->nidl;
>>> @@ -1097,10 +1111,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
>>>    		struct nvme_ns_ids *ids)
>>>    {
>>>    	struct nvme_command c = { };
>>> -	int status;
>>> +	bool csi_seen = false;
>>> +	int status, pos, len;
>>>    	void *data;
>>> -	int pos;
>>> -	int len;
>>>    	c.identify.opcode = nvme_admin_identify;
>>>    	c.identify.nsid = cpu_to_le32(nsid);
>>> @@ -1130,13 +1143,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
>>>    		if (cur->nidl == 0)
>>>    			break;
>>> -		len = nvme_process_ns_desc(ctrl, ids, cur);
>>> +		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
>>>    		if (len < 0)
>>>    			goto free_data;
>>>    		len += sizeof(*cur);
>>>    	}
>>>    free_data:
>>> +	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
>>
>> We will clear the status if we detect a path error, that is to
>> avoid needlessly removing the ns for path failures, so you should
>> check at the goto site.
> 
> The problem is that this check has to be done after checking all the ns descs,
> so this check to be done as the final thing, at least after processing all the
> ns descs. No matter if nvme_process_ns_desc() returned an error, or if
> simply NVME_NIDT_CSI wasn't part of the ns desc list, so the loop reached the
> end without error.
> 
> Even if the nvme command failed and the status was cleared:
> 
>                  if (status > 0 && !(status & NVME_SC_DNR))
>                          status = 0;
> 
> we still need to return an error, if (nvme_multi_css(ctrl) && !csi_seen).
> (Not reporting a CSI when nvme_multi_css() is enabled, is fatal.)
> 
> That is why the code looks like it does.
> 
> I guess we could do something like this, which does the same thing,
> but perhaps is a bit clearer:
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index e95f0c498a6b..bef687b9a277 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1160,8 +1160,10 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
>                    * Don't treat an error as fatal, as we potentially already
>                    * have a NGUID or EUI-64.
>                    */
> -               if (status > 0 && !(status & NVME_SC_DNR))
> +               if (status > 0 && !(status & NVME_SC_DNR)) {
>                          status = 0;
> +                       goto csi_check;
> +               }

I think it's the opposite. If we failed with DNR, you should assume
that either the controller wants the host to retry, or it's a
path/transport error. So we need to leave this namespace alone and
assume that when the host is connected back to the controller, the
scan_work will revalidate again.

So you should fail the csi_check only if you see a DNR error status.
Keith Busch June 24, 2020, 5:25 p.m. UTC | #10
On Tue, Jun 23, 2020 at 04:17:30PM -0700, Sagi Grimberg wrote:
> 
> > > > > -		len = nvme_process_ns_desc(ctrl, ids, cur);
> > > > > +		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
> > > > >    		if (len < 0)
> > > > >    			goto free_data;
> > > > >    		len += sizeof(*cur);
> > > > >    	}
> > > > >    free_data:
> > > > > +	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
> > > > 
> > > > We will clear the status if we detect a path error, that is to
> > > > avoid needlessly removing the ns for path failures, so you should
> > > > check at the goto site.
> > > 
> > > The problem is that this check has to be done after checking all the ns descs,
> > > so this check to be done as the final thing, at least after processing all the
> > > ns descs. No matter if nvme_process_ns_desc() returned an error, or if
> > > simply NVME_NIDT_CSI wasn't part of the ns desc list, so the loop reached the
> > > end without error.
> > > 
> > > Even if the nvme command failed and the status was cleared:
> > > 
> > >                  if (status > 0 && !(status & NVME_SC_DNR))
> > >                          status = 0;
> > 
> > This check is so weird. What has DNR got to do with whether or not we
> > want to continue with this namespace? The commit that adds this says
> > it's to check for a host failed IO, but a controller can just as validly
> > set DNR in its error status, in which case we'd still want clear the
> > status.
> 
> The reason is that if this error is not a DNR error, it means that we
> should retry and succeed, we don't want to remove the namespace.

And what if it is a DNR error? For example, the controller simply
doesn't support this CNS value. While the controller should support it,
we definitely don't need it for NVM command set namespaces.
 
> The problem that this solves is the fact that we have various error
> recovery conditions (interconnect issues, failures, resets) and the
> async works are designed to continue to run, issue I/O and fail. The
> scan work, will revalidate namespaces and remove them if it fails to
> do so, which is inherently wrong if these are simply inaccessible by
> the host at this time.

Relying on a specific bit in the status code seems a bit indirect for
this kind of condition.
Sagi Grimberg June 24, 2020, 5:46 p.m. UTC | #11
On 6/24/20 10:25 AM, Keith Busch wrote:
> On Tue, Jun 23, 2020 at 04:17:30PM -0700, Sagi Grimberg wrote:
>>
>>>>>> -		len = nvme_process_ns_desc(ctrl, ids, cur);
>>>>>> +		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
>>>>>>     		if (len < 0)
>>>>>>     			goto free_data;
>>>>>>     		len += sizeof(*cur);
>>>>>>     	}
>>>>>>     free_data:
>>>>>> +	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
>>>>>
>>>>> We will clear the status if we detect a path error, that is to
>>>>> avoid needlessly removing the ns for path failures, so you should
>>>>> check at the goto site.
>>>>
>>>> The problem is that this check has to be done after checking all the ns descs,
>>>> so this check to be done as the final thing, at least after processing all the
>>>> ns descs. No matter if nvme_process_ns_desc() returned an error, or if
>>>> simply NVME_NIDT_CSI wasn't part of the ns desc list, so the loop reached the
>>>> end without error.
>>>>
>>>> Even if the nvme command failed and the status was cleared:
>>>>
>>>>                   if (status > 0 && !(status & NVME_SC_DNR))
>>>>                           status = 0;
>>>
>>> This check is so weird. What has DNR got to do with whether or not we
>>> want to continue with this namespace? The commit that adds this says
>>> it's to check for a host failed IO, but a controller can just as validly
>>> set DNR in its error status, in which case we'd still want clear the
>>> status.
>>
>> The reason is that if this error is not a DNR error, it means that we
>> should retry and succeed, we don't want to remove the namespace.
> 
> And what if it is a DNR error? For example, the controller simply
> doesn't support this CNS value. While the controller should support it,
> we definitely don't need it for NVM command set namespaces.

Maybe I misunderstand the comment, but if you see a DNR error, it means
that the controller replied with an error and it's final, so then you can have
extra checks.

I think the point here is that if we got a non-dnr status, we should not
take any actions on this namespace because we need to retry first
(either because the controller is unavailable or it needs the host to
retry for another reason).

>> The problem that this solves is the fact that we have various error
>> recovery conditions (interconnect issues, failures, resets) and the
>> async works are designed to continue to run, issue I/O and fail. The
>> scan work, will revalidate namespaces and remove them if it fails to
>> do so, which is inherently wrong if these are simply inaccessible by
>> the host at this time.
> 
> Relying on a specific bit in the status code seems a bit indirect for
> this kind of condition.

I actually think this approach covers exactly what we are trying to
achieve. But I'll let others comment on this.
Keith Busch June 24, 2020, 6:03 p.m. UTC | #12
On Wed, Jun 24, 2020 at 10:46:03AM -0700, Sagi Grimberg wrote:
> On 6/24/20 10:25 AM, Keith Busch wrote:
> > On Tue, Jun 23, 2020 at 04:17:30PM -0700, Sagi Grimberg wrote:
> > And what if it is a DNR error? For example, the controller simply
> > doesn't support this CNS value. While the controller should support it,
> > we definitely don't need it for NVM command set namespaces.
> 
> Maybe I mis-undersatnd the comment, but if you see a DNR error, it means
> that the controller replied an error and its final, so then you can have
> extra checks.

If the controller does not support the CNS value, it may return Invalid
Field with DNR set. That error currently gets propagated back to
nvme_init_ns_head(), which then abandons the namespace. Just as the code
comments say, we had historically been clearing such errors because
we have other ways to identify the namespace, but now we're not clearing
that error.
Sagi Grimberg June 24, 2020, 6:28 p.m. UTC | #13
>>> And what if it is a DNR error? For example, the controller simply
>>> doesn't support this CNS value. While the controller should support it,
>>> we definitely don't need it for NVM command set namespaces.
>>
>> Maybe I mis-undersatnd the comment, but if you see a DNR error, it means
>> that the controller replied an error and its final, so then you can have
>> extra checks.
> 
> If the controller does not support the CNS value, it may return Invalid
> Field with DNR set. That error currently gets propogated back to
> nvme_init_ns_head(), which then abandons the namespace. Just as the code
> coments say, we had been historically been clearing such errors because
> we have other ways to identify the namespace, but now we're not clearing
> that error.

I don't understand what you are saying Keith.

My comment was for this block:
--
	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
			 nsid);
		status = -EINVAL;
	}
--

I was saying that !status doesn't necessarily mean success, but it can
also mean that it's a retry-able error status (due to transport or
controller). If we see a retry-able error we should still clear the
status so we don't abandon the namespace.

This for example would achieve that:
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ba512c45b40f..46d8a8379aff 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1126,12 +1126,21 @@ static int nvme_identify_ns_descs(struct 
nvme_ctrl *ctrl, unsigned nsid,
         if (status) {
                 dev_warn(ctrl->device,
                         "Identify Descriptors failed (%d)\n", status);
-                /*
-                 * Don't treat an error as fatal, as we potentially already
-                 * have a NGUID or EUI-64.
-                 */
-               if (status > 0 && !(status & NVME_SC_DNR))
-                       status = 0;
+
+               if (status > 0 && !(status & NVME_SC_DNR)) {
+                       if (nvme_multi_css(ctrl) && !csi_seen) {
+                               dev_warn(ctrl->device,
+                                       "Command set not reported for 
nsid:%d\n",
+                                       nsid);
+                               status = -EINVAL;
+                       } else {
+                               /*
+                                * Don't treat an error as fatal, as we
+                                * potentially already have a NGUID or 
EUI-64.
+                                */
+                               status = 0;
+                       }
+               }
                 goto free_data;
         }
--
Sagi Grimberg June 24, 2020, 6:33 p.m. UTC | #14
>> If the controller does not support the CNS value, it may return Invalid
>> Field with DNR set. That error currently gets propogated back to
>> nvme_init_ns_head(), which then abandons the namespace. Just as the code
>> coments say, we had been historically been clearing such errors because
>> we have other ways to identify the namespace, but now we're not clearing
>> that error.
> 
> I don't understand what you are saying Keith.
> 
> My comment was for this block:
> -- 
>      if (!status && nvme_multi_css(ctrl) && !csi_seen) {
>          dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
>               nsid);
>          status = -EINVAL;
>      }
> -- 
> 
> I was saying that !status doesn't necessarily mean success, but it can
> also mean that its an retry-able error status (due to transport or
> controller). If we see a retry-able error we should still clear the
> status so we don't abandon the namespace.
> 
> This for example would achieve that:

Sorry, meant this:
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ba512c45b40f..3187cf768d08 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1130,8 +1130,14 @@ static int nvme_identify_ns_descs(struct 
nvme_ctrl *ctrl, unsigned nsid,
                   * Don't treat an error as fatal, as we potentially 
already
                   * have a NGUID or EUI-64.
                   */
-               if (status > 0 && !(status & NVME_SC_DNR))
+               if (status > 0 && !(status & NVME_SC_DNR)) {
                         status = 0;
+               } else if (status == 0 && nvme_multi_css(ctrl) && 
!csi_seen) {
+                               dev_warn(ctrl->device,
+                                       "Command set not reported for 
nsid:%d\n",
+                                       nsid);
+                               status = -EINVAL;
+               }
                 goto free_data;
         }
--
Keith Busch June 24, 2020, 6:40 p.m. UTC | #15
On Wed, Jun 24, 2020 at 11:33:25AM -0700, Sagi Grimberg wrote:
> > > If the controller does not support the CNS value, it may return Invalid
> > > Field with DNR set. That error currently gets propogated back to
> > > nvme_init_ns_head(), which then abandons the namespace. Just as the code
> > > coments say, we had been historically been clearing such errors because
> > > we have other ways to identify the namespace, but now we're not clearing
> > > that error.
> > 
> > I don't understand what you are saying Keith.
> > 
> > My comment was for this block:
> > -- 
> >      if (!status && nvme_multi_css(ctrl) && !csi_seen) {
> >          dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
> >               nsid);
> >          status = -EINVAL;
> >      }
> > -- 
> > 
> > I was saying that !status doesn't necessarily mean success, but it can
> > also mean that its an retry-able error status (due to transport or
> > controller). If we see a retry-able error we should still clear the
> > status so we don't abandon the namespace.
> > 
> > This for example would achieve that:

We're not talking about the same thing. I am only talking about what
introduced the DNR check, from commit 59c7c3caaaf87.

I know you added it because you want to abort comparing identifiers on a
rescan when retrieving the identifiers failed. That's fine, but I am
saying this fails namespace creation in the first place for some types
of devices that used to succeed.
Sagi Grimberg June 24, 2020, 7:03 p.m. UTC | #16
>>>> If the controller does not support the CNS value, it may return Invalid
>>>> Field with DNR set. That error currently gets propogated back to
>>>> nvme_init_ns_head(), which then abandons the namespace. Just as the code
>>>> coments say, we had been historically been clearing such errors because
>>>> we have other ways to identify the namespace, but now we're not clearing
>>>> that error.
>>>
>>> I don't understand what you are saying Keith.
>>>
>>> My comment was for this block:
>>> -- 
>>>       if (!status && nvme_multi_css(ctrl) && !csi_seen) {
>>>           dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
>>>                nsid);
>>>           status = -EINVAL;
>>>       }
>>> -- 
>>>
>>> I was saying that !status doesn't necessarily mean success, but it can
>>> also mean that its an retry-able error status (due to transport or
>>> controller). If we see a retry-able error we should still clear the
>>> status so we don't abandon the namespace.
>>>
>>> This for example would achieve that:
> 
> We're not talking about the same thing. I am only talking about what
> introduced the DNR check, from commit 59c7c3caaaf87.
> 
> I know you added it because you want to abort comparing identifiers on a
> rescan when retrieving the identifiers failed. That's fine, but I am
> saying this fails namespace creation in the first place for some types
> of devices that used to succeed.

OK, now I think I understand (thanks for clarifying that the discussion
is not on patch 3/5, but rather on 59c7c3caaaf87).

So the original proposal was to check NVME_SC_HOST_PATH_ERROR (and now
we have NVME_SC_HOST_ABORTED_CMD) but with the review Christoph gave
it seemed to make more sense that we generalize and check the DNR bit.

We could restrict it back to checking the status is
NVME_SC_HOST_PATH_ERROR or NVME_SC_HOST_ABORTED_CMD if you think it
creates problems. However, if we keep the code as is, the original
comment still holds.
Keith Busch June 24, 2020, 9:49 p.m. UTC | #17
On Wed, Jun 24, 2020 at 12:03:41PM -0700, Sagi Grimberg wrote:
> 
> > > > > If the controller does not support the CNS value, it may return Invalid
> > > > > Field with DNR set. That error currently gets propogated back to
> > > > > nvme_init_ns_head(), which then abandons the namespace. Just as the code
> > > > > coments say, we had been historically been clearing such errors because
> > > > > we have other ways to identify the namespace, but now we're not clearing
> > > > > that error.
> > > > 
> > > > I don't understand what you are saying Keith.
> > > > 
> > > > My comment was for this block:
> > > > -- 
> > > >       if (!status && nvme_multi_css(ctrl) && !csi_seen) {
> > > >           dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
> > > >                nsid);
> > > >           status = -EINVAL;
> > > >       }
> > > > -- 
> > > > 
> > > > I was saying that !status doesn't necessarily mean success, but it can
> > > > also mean that its an retry-able error status (due to transport or
> > > > controller). If we see a retry-able error we should still clear the
> > > > status so we don't abandon the namespace.
> > > > 
> > > > This for example would achieve that:
> > 
> > We're not talking about the same thing. I am only talking about what
> > introduced the DNR check, from commit 59c7c3caaaf87.
> > 
> > I know you added it because you want to abort comparing identifiers on a
> > rescan when retrieving the identifiers failed. That's fine, but I am
> > saying this fails namespace creation in the first place for some types
> > of devices that used to succeed.
> 
> OK, now I think I understand (thanks for clarifying that the discussion
> is not on patch 3/5, but rather on 59c7c3caaaf87).
> 
> So the original proposal was to check NVME_SC_HOST_PATH_ERROR (and now
> we have NVME_SC_HOST_ABORTED_CMD) but with the review Christoph gave
> it seemed to make more sense that we generalize and check the DNR bit.

Okay, I didn't question this approach when it first went through, so
sorry about this digression, but I really don't get how this DNR check
helps at all.

The code currently returns an error if DNR is set. Based on the commit
messages, it sounds like you need that error to skip comparing
identifiers through nvme's scan_work calling revalidate_disk():
suppressing the error has revalidate_disk() fail with -ENODEV when
comparing identifiers fails.

Since we do return the error when DNR is set, we skip comparing
identifiers and return blk_status_to_errno(nvme_error_status(ret))
instead. How is that errno an improvement?

And then if DNR is not set, we suppress the error and proceed with
comparing identifiers??
Sagi Grimberg June 24, 2020, 10:54 p.m. UTC | #18
On 6/24/20 2:49 PM, Keith Busch wrote:
> On Wed, Jun 24, 2020 at 12:03:41PM -0700, Sagi Grimberg wrote:
>>
>>>>>> If the controller does not support the CNS value, it may return Invalid
>>>>>> Field with DNR set. That error currently gets propogated back to
>>>>>> nvme_init_ns_head(), which then abandons the namespace. Just as the code
>>>>>> coments say, we had been historically been clearing such errors because
>>>>>> we have other ways to identify the namespace, but now we're not clearing
>>>>>> that error.
>>>>>
>>>>> I don't understand what you are saying Keith.
>>>>>
>>>>> My comment was for this block:
>>>>> -- 
>>>>>        if (!status && nvme_multi_css(ctrl) && !csi_seen) {
>>>>>            dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
>>>>>                 nsid);
>>>>>            status = -EINVAL;
>>>>>        }
>>>>> -- 
>>>>>
>>>>> I was saying that !status doesn't necessarily mean success, but it can
>>>>> also mean that its an retry-able error status (due to transport or
>>>>> controller). If we see a retry-able error we should still clear the
>>>>> status so we don't abandon the namespace.
>>>>>
>>>>> This for example would achieve that:
>>>
>>> We're not talking about the same thing. I am only talking about what
>>> introduced the DNR check, from commit 59c7c3caaaf87.
>>>
>>> I know you added it because you want to abort comparing identifiers on a
>>> rescan when retrieving the identifiers failed. That's fine, but I am
>>> saying this fails namespace creation in the first place for some types
>>> of devices that used to succeed.
>>
>> OK, now I think I understand (thanks for clarifying that the discussion
>> is not on patch 3/5, but rather on 59c7c3caaaf87).
>>
>> So the original proposal was to check NVME_SC_HOST_PATH_ERROR (and now
>> we have NVME_SC_HOST_ABORTED_CMD) but with the review Christoph gave
>> it seemed to make more sense that we generalize and check the DNR bit.
> 
> Okay, I didn't question this approach when it first went through, so
> sorry about this digression, but I really don't get how this DNR check
> helps at all.
> 
> The code currently returns an error if DNR is set.

Right.

> Based on the commit
> messages, it sounds like you need that error to skip comparing
> identifiers through nvme's scan_work calling revalidate_disk():
> suppressing the error has revalidate_disk() fail with -ENODEV when
> comparing identifiers fails.

Your understanding is correct.

> Since we do return the error when DNR is set, we skip comparing
> identifiers and return blk_status_to_errno(nvme_error_status(ret))
> instead. How is that errno an improvement?

See nvme_revalidate_disk:
out:
         /*
          * Only fail the function if we got a fatal error back from the
          * device, otherwise ignore the error and just move on.
          */
         if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
                 ret = 0;
         else if (ret > 0)
                 ret = blk_status_to_errno(nvme_error_status(ret));
         return ret;

So we don't actually propagate the error back if its a non-DNR (hence
retry-able error). The errno was there before in order to not leak NVMe
errors to the block layer.

> And then if DNR is not set, we suppress the error and proceed with
> comparing identifiers??

Wait, now that I re-read it, I think it's backwards. The intent was indeed to
clear the error for the DNR case and propagate the error for the non-DNR
case!

The code needs to be:
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2afed32d3892..3e84ab6c2bd3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1130,7 +1130,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl 
*ctrl, unsigned nsid,
                   * Don't treat an error as fatal, as we potentially 
already
                   * have a NGUID or EUI-64.
                   */
-               if (status > 0 && !(status & NVME_SC_DNR))
+               if (status > 0 && (status & NVME_SC_DNR))
                         status = 0;
                 goto free_data;
         }
--

This way, if the controller failed the identify it will be with DNR
status and we will silently ignore it, and if the transport failed it's
a non-DNR status, and we propagate the status back, skip the id compare,
and then silently ignore the error in nvme_revalidate_disk (as above).

Looking into the original fix we had internally, this was the case, and
it got fat-fingered, I can only assume...

Will send a fix right away, good catch Keith!
Keith Busch June 24, 2020, 11:54 p.m. UTC | #19
On Wed, Jun 24, 2020 at 03:54:05PM -0700, Sagi Grimberg wrote:
> The code needs to be:
> --
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 2afed32d3892..3e84ab6c2bd3 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1130,7 +1130,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl
> *ctrl, unsigned nsid,
>                   * Don't treat an error as fatal, as we potentially already
>                   * have a NGUID or EUI-64.
>                   */
> -               if (status > 0 && !(status & NVME_SC_DNR))
> +               if (status > 0 && (status & NVME_SC_DNR))
>                         status = 0;
>                 goto free_data;
>         }
> --

Aha, I was assuming the code was the way you wanted it, hence my
confusion :)

The above makes sense walking through different scenarios. I needed
to reconcile this in order to understand how we'll address it with
Niklas's patch that started this conversation.
Christoph Hellwig June 26, 2020, 8:54 a.m. UTC | #20
On Tue, Jun 23, 2020 at 01:53:47AM -0700, Sagi Grimberg wrote:
>>   	if (ns->lba_shift == 0)
>>   		ns->lba_shift = 9;
>>   +	switch (ns->head->ids.csi) {
>> +	case NVME_CSI_NVM:
>> +		break;
>> +	default:
>> +		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
>> +			ns->head->ids.csi, ns->head->ns_id);
>> +		return -ENODEV;
>> +	}
>
> Not sure we need a switch-case statement for a single case target...

I think a switch makes inherent sense when there is an identifier that
can have multiple values, even if there only is one for now.
diff mbox series

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9491dbcfe81a..45a3cb5a35bd 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1056,8 +1056,13 @@  static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 	return error;
 }
 
+static bool nvme_multi_css(struct nvme_ctrl *ctrl)
+{
+	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
+}
+
 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
-		struct nvme_ns_id_desc *cur)
+		struct nvme_ns_id_desc *cur, bool *csi_seen)
 {
 	const char *warn_str = "ctrl returned bogus length:";
 	void *data = cur;
@@ -1087,6 +1092,15 @@  static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
 		}
 		uuid_copy(&ids->uuid, data + sizeof(*cur));
 		return NVME_NIDT_UUID_LEN;
+	case NVME_NIDT_CSI:
+		if (cur->nidl != NVME_NIDT_CSI_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
+		*csi_seen = true;
+		return NVME_NIDT_CSI_LEN;
 	default:
 		/* Skip unknown types */
 		return cur->nidl;
@@ -1097,10 +1111,9 @@  static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		struct nvme_ns_ids *ids)
 {
 	struct nvme_command c = { };
-	int status;
+	bool csi_seen = false;
+	int status, pos, len;
 	void *data;
-	int pos;
-	int len;
 
 	c.identify.opcode = nvme_admin_identify;
 	c.identify.nsid = cpu_to_le32(nsid);
@@ -1130,13 +1143,19 @@  static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		if (cur->nidl == 0)
 			break;
 
-		len = nvme_process_ns_desc(ctrl, ids, cur);
+		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
 		if (len < 0)
 			goto free_data;
 
 		len += sizeof(*cur);
 	}
 free_data:
+	if (!status && nvme_multi_css(ctrl) && !csi_seen) {
+		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
+			 nsid);
+		status = -EINVAL;
+	}
+
 	kfree(data);
 	return status;
 }
@@ -1792,7 +1811,7 @@  static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
 		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
 	if (ctrl->vs >= NVME_VS(1, 2, 0))
 		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
-	if (ctrl->vs >= NVME_VS(1, 3, 0))
+	if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
 		return nvme_identify_ns_descs(ctrl, nsid, ids);
 	return 0;
 }
@@ -1808,7 +1827,8 @@  static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
 {
 	return uuid_equal(&a->uuid, &b->uuid) &&
 		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
-		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
+		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
+		a->csi == b->csi;
 }
 
 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -1930,6 +1950,15 @@  static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	if (ns->lba_shift == 0)
 		ns->lba_shift = 9;
 
+	switch (ns->head->ids.csi) {
+	case NVME_CSI_NVM:
+		break;
+	default:
+		dev_warn(ctrl->device, "unknown csi:%d ns:%d\n",
+			ns->head->ids.csi, ns->head->ns_id);
+		return -ENODEV;
+	}
+
 	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
 	    is_power_of_2(ctrl->max_hw_sectors))
 		iob = ctrl->max_hw_sectors;
@@ -2264,7 +2293,10 @@  int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 
 	ctrl->page_size = 1 << page_shift;
 
-	ctrl->ctrl_config = NVME_CC_CSS_NVM;
+	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
+		ctrl->ctrl_config = NVME_CC_CSS_CSI;
+	else
+		ctrl->ctrl_config = NVME_CC_CSS_NVM;
 	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
 	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c0f4226d3299..a84f71459caa 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -339,6 +339,7 @@  struct nvme_ns_ids {
 	u8	eui64[8];
 	u8	nguid[16];
 	uuid_t	uuid;
+	u8	csi;
 };
 
 /*
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 5ce51ab4c50e..81ffe5247505 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -132,6 +132,7 @@  enum {
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
 #define NVME_CAP_NSSRC(cap)	(((cap) >> 36) & 0x1)
+#define NVME_CAP_CSS(cap)	(((cap) >> 37) & 0xff)
 #define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)
 #define NVME_CAP_MPSMAX(cap)	(((cap) >> 52) & 0xf)
 
@@ -162,7 +163,6 @@  enum {
 
 enum {
 	NVME_CC_ENABLE		= 1 << 0,
-	NVME_CC_CSS_NVM		= 0 << 4,
 	NVME_CC_EN_SHIFT	= 0,
 	NVME_CC_CSS_SHIFT	= 4,
 	NVME_CC_MPS_SHIFT	= 7,
@@ -170,6 +170,9 @@  enum {
 	NVME_CC_SHN_SHIFT	= 14,
 	NVME_CC_IOSQES_SHIFT	= 16,
 	NVME_CC_IOCQES_SHIFT	= 20,
+	NVME_CC_CSS_NVM		= 0 << NVME_CC_CSS_SHIFT,
+	NVME_CC_CSS_CSI		= 6 << NVME_CC_CSS_SHIFT,
+	NVME_CC_CSS_MASK	= 7 << NVME_CC_CSS_SHIFT,
 	NVME_CC_AMS_RR		= 0 << NVME_CC_AMS_SHIFT,
 	NVME_CC_AMS_WRRU	= 1 << NVME_CC_AMS_SHIFT,
 	NVME_CC_AMS_VS		= 7 << NVME_CC_AMS_SHIFT,
@@ -179,6 +182,8 @@  enum {
 	NVME_CC_SHN_MASK	= 3 << NVME_CC_SHN_SHIFT,
 	NVME_CC_IOSQES		= NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT,
 	NVME_CC_IOCQES		= NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT,
+	NVME_CAP_CSS_NVM	= 1 << 0,
+	NVME_CAP_CSS_CSI	= 1 << 6,
 	NVME_CSTS_RDY		= 1 << 0,
 	NVME_CSTS_CFS		= 1 << 1,
 	NVME_CSTS_NSSRO		= 1 << 4,
@@ -374,6 +379,8 @@  enum {
 	NVME_ID_CNS_CTRL		= 0x01,
 	NVME_ID_CNS_NS_ACTIVE_LIST	= 0x02,
 	NVME_ID_CNS_NS_DESC_LIST	= 0x03,
+	NVME_ID_CNS_CS_NS		= 0x05,
+	NVME_ID_CNS_CS_CTRL		= 0x06,
 	NVME_ID_CNS_NS_PRESENT_LIST	= 0x10,
 	NVME_ID_CNS_NS_PRESENT		= 0x11,
 	NVME_ID_CNS_CTRL_NS_LIST	= 0x12,
@@ -383,6 +390,10 @@  enum {
 	NVME_ID_CNS_UUID_LIST		= 0x17,
 };
 
+enum {					/* Command Set Identifier (CSI) values */
+	NVME_CSI_NVM			= 0,
+};
+
 enum {
 	NVME_DIR_IDENTIFY		= 0x00,
 	NVME_DIR_STREAMS		= 0x01,
@@ -435,11 +446,13 @@  struct nvme_ns_id_desc {
 #define NVME_NIDT_EUI64_LEN	8
 #define NVME_NIDT_NGUID_LEN	16
 #define NVME_NIDT_UUID_LEN	16
+#define NVME_NIDT_CSI_LEN	1
 
 enum {
 	NVME_NIDT_EUI64		= 0x01,
 	NVME_NIDT_NGUID		= 0x02,
 	NVME_NIDT_UUID		= 0x03,
+	NVME_NIDT_CSI		= 0x04,	/* payload is NVME_NIDT_CSI_LEN bytes */
 };
 
 struct nvme_smart_log {
@@ -972,7 +985,9 @@  struct nvme_identify {
 	__u8			cns;
 	__u8			rsvd3;
 	__le16			ctrlid;
-	__u32			rsvd11[5];
+	__u8			rsvd11[3];
+	__u8			csi;
+	__u32			rsvd12[4];
 };
 
 #define NVME_IDENTIFY_DATA_SIZE 4096