[3/6] block: add support for zone offline transition
diff mbox series

Message ID 20200625122152.17359-4-javier@javigon.com
State New
Headers show
Series
  • ZNS: Extra features for current patches
Related show

Commit Message

Javier González June 25, 2020, 12:21 p.m. UTC
From: Javier González <javier.gonz@samsung.com>

Add support for offline transition on the zoned block device using the
new zone management IOCTL

Signed-off-by: Javier González <javier.gonz@samsung.com>
Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
 block/blk-core.c              | 2 ++
 block/blk-zoned.c             | 3 +++
 drivers/nvme/host/core.c      | 3 +++
 include/linux/blk_types.h     | 3 +++
 include/linux/blkdev.h        | 1 -
 include/uapi/linux/blkzoned.h | 1 +
 6 files changed, 12 insertions(+), 1 deletion(-)

Comments

Matias Bjørling June 25, 2020, 2:12 p.m. UTC | #1
On 25/06/2020 14.21, Javier González wrote:
> From: Javier González <javier.gonz@samsung.com>
>
> Add support for offline transition on the zoned block device using the
> new zone management IOCTL
>
> Signed-off-by: Javier González <javier.gonz@samsung.com>
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> ---
>   block/blk-core.c              | 2 ++
>   block/blk-zoned.c             | 3 +++
>   drivers/nvme/host/core.c      | 3 +++
>   include/linux/blk_types.h     | 3 +++
>   include/linux/blkdev.h        | 1 -
>   include/uapi/linux/blkzoned.h | 1 +
>   6 files changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 03252af8c82c..589cbdacc5ec 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>   	REQ_OP_NAME(ZONE_CLOSE),
>   	REQ_OP_NAME(ZONE_FINISH),
>   	REQ_OP_NAME(ZONE_APPEND),
> +	REQ_OP_NAME(ZONE_OFFLINE),
>   	REQ_OP_NAME(WRITE_SAME),
>   	REQ_OP_NAME(WRITE_ZEROES),
>   	REQ_OP_NAME(SCSI_IN),
> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>   	case REQ_OP_ZONE_OPEN:
>   	case REQ_OP_ZONE_CLOSE:
>   	case REQ_OP_ZONE_FINISH:
> +	case REQ_OP_ZONE_OFFLINE:
>   		if (!blk_queue_is_zoned(q))
>   			goto not_supported;
>   		break;
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index 29194388a1bb..704fc15813d1 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>   	case BLK_ZONE_MGMT_RESET:
>   		op = REQ_OP_ZONE_RESET;
>   		break;
> +	case BLK_ZONE_MGMT_OFFLINE:
> +		op = REQ_OP_ZONE_OFFLINE;
> +		break;
>   	default:
>   		return -ENOTTY;
>   	}
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index f1215523792b..5b95c81d2a2d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>   	case REQ_OP_ZONE_FINISH:
>   		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>   		break;
> +	case REQ_OP_ZONE_OFFLINE:
> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
> +		break;
>   	case REQ_OP_WRITE_ZEROES:
>   		ret = nvme_setup_write_zeroes(ns, req, cmd);
>   		break;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 16b57fb2b99c..b3921263c3dd 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -316,6 +316,8 @@ enum req_opf {
>   	REQ_OP_ZONE_FINISH	= 12,
>   	/* write data at the current zone write pointer */
>   	REQ_OP_ZONE_APPEND	= 13,
> +	/* Transition a zone to offline */
> +	REQ_OP_ZONE_OFFLINE	= 14,
>   
>   	/* SCSI passthrough using struct scsi_request */
>   	REQ_OP_SCSI_IN		= 32,
> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>   	case REQ_OP_ZONE_OPEN:
>   	case REQ_OP_ZONE_CLOSE:
>   	case REQ_OP_ZONE_FINISH:
> +	case REQ_OP_ZONE_OFFLINE:
>   		return true;
>   	default:
>   		return false;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index bd8521f94dc4..8308d8a3720b 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>   				  unsigned int cmd, unsigned long arg);
>   extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>   				  unsigned int cmd, unsigned long arg);
> -
>   #else /* CONFIG_BLK_DEV_ZONED */
>   
>   static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
> index a8c89fe58f97..d0978ee10fc7 100644
> --- a/include/uapi/linux/blkzoned.h
> +++ b/include/uapi/linux/blkzoned.h
> @@ -155,6 +155,7 @@ enum blk_zone_action {
>   	BLK_ZONE_MGMT_FINISH	= 0x2,
>   	BLK_ZONE_MGMT_OPEN	= 0x3,
>   	BLK_ZONE_MGMT_RESET	= 0x4,
> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>   };
>   
>   /**

I am not sure this makes sense to expose through the kernel zone api. 
One of the goals of the kernel zone API is to be a layer that provides 
an unified zone model across SMR HDDs and ZNS SSDs. The offline zone 
operation, as defined in the ZNS specification, does not have an 
equivalent in SMR HDDs (ZAC/ZBC).

This is different from the Zone Capacity change, where the zone capacity 
was simply the zone size for SMR HDDs, making it easy to support. That is 
not the same for ZAC/ZBC, which does not offer the offline operation to 
transition zones in a read-only state to the offline state.
Javier González June 25, 2020, 7:48 p.m. UTC | #2
On 25.06.2020 16:12, Matias Bjørling wrote:
>On 25/06/2020 14.21, Javier González wrote:
>>From: Javier González <javier.gonz@samsung.com>
>>
>>Add support for offline transition on the zoned block device using the
>>new zone management IOCTL
>>
>>Signed-off-by: Javier González <javier.gonz@samsung.com>
>>Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>---
>>  block/blk-core.c              | 2 ++
>>  block/blk-zoned.c             | 3 +++
>>  drivers/nvme/host/core.c      | 3 +++
>>  include/linux/blk_types.h     | 3 +++
>>  include/linux/blkdev.h        | 1 -
>>  include/uapi/linux/blkzoned.h | 1 +
>>  6 files changed, 12 insertions(+), 1 deletion(-)
>>
>>diff --git a/block/blk-core.c b/block/blk-core.c
>>index 03252af8c82c..589cbdacc5ec 100644
>>--- a/block/blk-core.c
>>+++ b/block/blk-core.c
>>@@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>>  	REQ_OP_NAME(ZONE_CLOSE),
>>  	REQ_OP_NAME(ZONE_FINISH),
>>  	REQ_OP_NAME(ZONE_APPEND),
>>+	REQ_OP_NAME(ZONE_OFFLINE),
>>  	REQ_OP_NAME(WRITE_SAME),
>>  	REQ_OP_NAME(WRITE_ZEROES),
>>  	REQ_OP_NAME(SCSI_IN),
>>@@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>>  	case REQ_OP_ZONE_OPEN:
>>  	case REQ_OP_ZONE_CLOSE:
>>  	case REQ_OP_ZONE_FINISH:
>>+	case REQ_OP_ZONE_OFFLINE:
>>  		if (!blk_queue_is_zoned(q))
>>  			goto not_supported;
>>  		break;
>>diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>>index 29194388a1bb..704fc15813d1 100644
>>--- a/block/blk-zoned.c
>>+++ b/block/blk-zoned.c
>>@@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>  	case BLK_ZONE_MGMT_RESET:
>>  		op = REQ_OP_ZONE_RESET;
>>  		break;
>>+	case BLK_ZONE_MGMT_OFFLINE:
>>+		op = REQ_OP_ZONE_OFFLINE;
>>+		break;
>>  	default:
>>  		return -ENOTTY;
>>  	}
>>diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>index f1215523792b..5b95c81d2a2d 100644
>>--- a/drivers/nvme/host/core.c
>>+++ b/drivers/nvme/host/core.c
>>@@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>>  	case REQ_OP_ZONE_FINISH:
>>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>>  		break;
>>+	case REQ_OP_ZONE_OFFLINE:
>>+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
>>+		break;
>>  	case REQ_OP_WRITE_ZEROES:
>>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>>  		break;
>>diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>index 16b57fb2b99c..b3921263c3dd 100644
>>--- a/include/linux/blk_types.h
>>+++ b/include/linux/blk_types.h
>>@@ -316,6 +316,8 @@ enum req_opf {
>>  	REQ_OP_ZONE_FINISH	= 12,
>>  	/* write data at the current zone write pointer */
>>  	REQ_OP_ZONE_APPEND	= 13,
>>+	/* Transition a zone to offline */
>>+	REQ_OP_ZONE_OFFLINE	= 14,
>>  	/* SCSI passthrough using struct scsi_request */
>>  	REQ_OP_SCSI_IN		= 32,
>>@@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>>  	case REQ_OP_ZONE_OPEN:
>>  	case REQ_OP_ZONE_CLOSE:
>>  	case REQ_OP_ZONE_FINISH:
>>+	case REQ_OP_ZONE_OFFLINE:
>>  		return true;
>>  	default:
>>  		return false;
>>diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>index bd8521f94dc4..8308d8a3720b 100644
>>--- a/include/linux/blkdev.h
>>+++ b/include/linux/blkdev.h
>>@@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>>  				  unsigned int cmd, unsigned long arg);
>>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>  				  unsigned int cmd, unsigned long arg);
>>-
>>  #else /* CONFIG_BLK_DEV_ZONED */
>>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
>>diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>>index a8c89fe58f97..d0978ee10fc7 100644
>>--- a/include/uapi/linux/blkzoned.h
>>+++ b/include/uapi/linux/blkzoned.h
>>@@ -155,6 +155,7 @@ enum blk_zone_action {
>>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>>  	BLK_ZONE_MGMT_RESET	= 0x4,
>>+	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>>  };
>>  /**
>
>I am not sure this makes sense to expose through the kernel zone api. 
>One of the goals of the kernel zone API is to be a layer that provides 
>an unified zone model across SMR HDDs and ZNS SSDs. The offline zone 
>operation, as defined in the ZNS specification, does not have an 
>equivalent in SMR HDDs (ZAC/ZBC).
>
>This is different from the Zone Capacity change, where the zone 
>capacity simply was zone size for SMR HDDs. Making it easy to support. 
>That is not the same for ZAC/ZBC, that does not offer the offline 
>operation to transition zones in read only state to offline state.

I agree that a unified interface is desirable. However, the truth is
that ZAC/ZBC are different, and will differ more and more as time goes
by. We can deal with the differences at the driver level or with checks
at the API level, but limiting ZNS to ZAC/ZBC is a hard constraint.

Note too that I chose to only support this particular transition on the
new management IOCTL to avoid confusion for existing ZAC/ZBC users.

It would be good to clarify what is the plan for kernel APIs moving
forward, as I believe there is a general desire to support new ZNS
features, which will not necessarily be replicated in SMR drives.

Javier
Damien Le Moal June 26, 2020, 1:14 a.m. UTC | #3
On 2020/06/26 4:48, Javier González wrote:
> On 25.06.2020 16:12, Matias Bjørling wrote:
>> On 25/06/2020 14.21, Javier González wrote:
>>> From: Javier González <javier.gonz@samsung.com>
>>>
>>> Add support for offline transition on the zoned block device using the
>>> new zone management IOCTL
>>>
>>> Signed-off-by: Javier González <javier.gonz@samsung.com>
>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>> ---
>>>  block/blk-core.c              | 2 ++
>>>  block/blk-zoned.c             | 3 +++
>>>  drivers/nvme/host/core.c      | 3 +++
>>>  include/linux/blk_types.h     | 3 +++
>>>  include/linux/blkdev.h        | 1 -
>>>  include/uapi/linux/blkzoned.h | 1 +
>>>  6 files changed, 12 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 03252af8c82c..589cbdacc5ec 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>>>  	REQ_OP_NAME(ZONE_CLOSE),
>>>  	REQ_OP_NAME(ZONE_FINISH),
>>>  	REQ_OP_NAME(ZONE_APPEND),
>>> +	REQ_OP_NAME(ZONE_OFFLINE),
>>>  	REQ_OP_NAME(WRITE_SAME),
>>>  	REQ_OP_NAME(WRITE_ZEROES),
>>>  	REQ_OP_NAME(SCSI_IN),
>>> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>>>  	case REQ_OP_ZONE_OPEN:
>>>  	case REQ_OP_ZONE_CLOSE:
>>>  	case REQ_OP_ZONE_FINISH:
>>> +	case REQ_OP_ZONE_OFFLINE:
>>>  		if (!blk_queue_is_zoned(q))
>>>  			goto not_supported;
>>>  		break;
>>> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>>> index 29194388a1bb..704fc15813d1 100644
>>> --- a/block/blk-zoned.c
>>> +++ b/block/blk-zoned.c
>>> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>  	case BLK_ZONE_MGMT_RESET:
>>>  		op = REQ_OP_ZONE_RESET;
>>>  		break;
>>> +	case BLK_ZONE_MGMT_OFFLINE:
>>> +		op = REQ_OP_ZONE_OFFLINE;
>>> +		break;
>>>  	default:
>>>  		return -ENOTTY;
>>>  	}
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index f1215523792b..5b95c81d2a2d 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>>>  	case REQ_OP_ZONE_FINISH:
>>>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>>>  		break;
>>> +	case REQ_OP_ZONE_OFFLINE:
>>> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
>>> +		break;
>>>  	case REQ_OP_WRITE_ZEROES:
>>>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>  		break;
>>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>> index 16b57fb2b99c..b3921263c3dd 100644
>>> --- a/include/linux/blk_types.h
>>> +++ b/include/linux/blk_types.h
>>> @@ -316,6 +316,8 @@ enum req_opf {
>>>  	REQ_OP_ZONE_FINISH	= 12,
>>>  	/* write data at the current zone write pointer */
>>>  	REQ_OP_ZONE_APPEND	= 13,
>>> +	/* Transition a zone to offline */
>>> +	REQ_OP_ZONE_OFFLINE	= 14,
>>>  	/* SCSI passthrough using struct scsi_request */
>>>  	REQ_OP_SCSI_IN		= 32,
>>> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>>>  	case REQ_OP_ZONE_OPEN:
>>>  	case REQ_OP_ZONE_CLOSE:
>>>  	case REQ_OP_ZONE_FINISH:
>>> +	case REQ_OP_ZONE_OFFLINE:
>>>  		return true;
>>>  	default:
>>>  		return false;
>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>> index bd8521f94dc4..8308d8a3720b 100644
>>> --- a/include/linux/blkdev.h
>>> +++ b/include/linux/blkdev.h
>>> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>>>  				  unsigned int cmd, unsigned long arg);
>>>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>  				  unsigned int cmd, unsigned long arg);
>>> -
>>>  #else /* CONFIG_BLK_DEV_ZONED */
>>>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
>>> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>>> index a8c89fe58f97..d0978ee10fc7 100644
>>> --- a/include/uapi/linux/blkzoned.h
>>> +++ b/include/uapi/linux/blkzoned.h
>>> @@ -155,6 +155,7 @@ enum blk_zone_action {
>>>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>>>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>>>  	BLK_ZONE_MGMT_RESET	= 0x4,
>>> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>>>  };
>>>  /**
>>
>> I am not sure this makes sense to expose through the kernel zone api. 
>> One of the goals of the kernel zone API is to be a layer that provides 
>> an unified zone model across SMR HDDs and ZNS SSDs. The offline zone 
>> operation, as defined in the ZNS specification, does not have an 
>> equivalent in SMR HDDs (ZAC/ZBC).
>>
>> This is different from the Zone Capacity change, where the zone 
>> capacity simply was zone size for SMR HDDs. Making it easy to support. 
>> That is not the same for ZAC/ZBC, that does not offer the offline 
>> operation to transition zones in read only state to offline state.
> 
> I agree that an unified interface is desirable. However, the truth is
> that ZAC/ZBC are different, and will differ more and more and time goes
> by. We can deal with the differences at the driver level or with checks
> at the API level, but limiting ZNS with ZAC/ZBC is a hard constraint.

As long as you keep the ZNS namespace reporting itself as "host-managed" like
ZBC/ZAC disks, we need the consistency and common interface. If you break that,
the meaning of the zoned model queue attribute disappears and an application or
in-kernel user cannot rely on this model anymore to know how the drive will behave.

> Note too that I chose to only support this particular transition on the
> new management IOCTL to avoid confusion for existing ZAC/ZBC users.
> 
> It would be good to clarify what is the plan for kernel APIs moving
> forward, as I believe there is a general desire to support new ZNS
> features, which will not necessarily be replicated in SMR drives.

What the drive is supposed to support and its behavior is determined by the
zoned model. ZNS standard was written so that most things have an equivalent
with ZBC/ZAC, e.g. the zone state machine is nearly identical. Differences are
either emulated (e.g. zone append scsi emulation), or not supported (e.g. zone
capacity change) so that the kernel follows the same pattern and maintains a
coherent behavior between device protocols for the host-managed model.

Think of a file system, or any other in-kernel user. If they have to change
their code based on the device type (NVMe vs SCSI), then the zoned block device
interface is broken. Right now, that is not the case, everything works equally
well on ZNS and SCSI, modulo the need by a user for conventional zones that ZNS
do not define. But that is still consistent with the host-managed model since
conventional zones are optional.

For this particular patch, there is currently no in-kernel user, and it is not
clear how this will be useful to applications. At least please clarify this. And
most likely, similarly to discard etc operations that are optional, having a
sysfs attribute and in-kernel API indicating if the drive supports offlining
zones will be needed. Otherwise, the caller will have to play with error codes
to understand if the drive does not support the command or if it is supported
but the command failed. Not nice. Better to know before issuing the command.


> 
> Javier
>
Damien Le Moal June 26, 2020, 1:34 a.m. UTC | #4
On 2020/06/25 21:22, Javier González wrote:
> From: Javier González <javier.gonz@samsung.com>
> 
> Add support for offline transition on the zoned block device using the
> new zone management IOCTL
> 
> Signed-off-by: Javier González <javier.gonz@samsung.com>
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> ---
>  block/blk-core.c              | 2 ++
>  block/blk-zoned.c             | 3 +++
>  drivers/nvme/host/core.c      | 3 +++
>  include/linux/blk_types.h     | 3 +++
>  include/linux/blkdev.h        | 1 -
>  include/uapi/linux/blkzoned.h | 1 +
>  6 files changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 03252af8c82c..589cbdacc5ec 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>  	REQ_OP_NAME(ZONE_CLOSE),
>  	REQ_OP_NAME(ZONE_FINISH),
>  	REQ_OP_NAME(ZONE_APPEND),
> +	REQ_OP_NAME(ZONE_OFFLINE),
>  	REQ_OP_NAME(WRITE_SAME),
>  	REQ_OP_NAME(WRITE_ZEROES),
>  	REQ_OP_NAME(SCSI_IN),
> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>  	case REQ_OP_ZONE_OPEN:
>  	case REQ_OP_ZONE_CLOSE:
>  	case REQ_OP_ZONE_FINISH:
> +	case REQ_OP_ZONE_OFFLINE:
>  		if (!blk_queue_is_zoned(q))
>  			goto not_supported;
>  		break;
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index 29194388a1bb..704fc15813d1 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>  	case BLK_ZONE_MGMT_RESET:
>  		op = REQ_OP_ZONE_RESET;
>  		break;
> +	case BLK_ZONE_MGMT_OFFLINE:
> +		op = REQ_OP_ZONE_OFFLINE;
> +		break;
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index f1215523792b..5b95c81d2a2d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>  	case REQ_OP_ZONE_FINISH:
>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>  		break;
> +	case REQ_OP_ZONE_OFFLINE:
> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
> +		break;
>  	case REQ_OP_WRITE_ZEROES:
>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>  		break;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 16b57fb2b99c..b3921263c3dd 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -316,6 +316,8 @@ enum req_opf {
>  	REQ_OP_ZONE_FINISH	= 12,
>  	/* write data at the current zone write pointer */
>  	REQ_OP_ZONE_APPEND	= 13,
> +	/* Transition a zone to offline */
> +	REQ_OP_ZONE_OFFLINE	= 14,
>  
>  	/* SCSI passthrough using struct scsi_request */
>  	REQ_OP_SCSI_IN		= 32,
> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>  	case REQ_OP_ZONE_OPEN:
>  	case REQ_OP_ZONE_CLOSE:
>  	case REQ_OP_ZONE_FINISH:
> +	case REQ_OP_ZONE_OFFLINE:
>  		return true;
>  	default:
>  		return false;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index bd8521f94dc4..8308d8a3720b 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>  				  unsigned int cmd, unsigned long arg);
>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>  				  unsigned int cmd, unsigned long arg);
> -
>  #else /* CONFIG_BLK_DEV_ZONED */
>  
>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
> index a8c89fe58f97..d0978ee10fc7 100644
> --- a/include/uapi/linux/blkzoned.h
> +++ b/include/uapi/linux/blkzoned.h
> @@ -155,6 +155,7 @@ enum blk_zone_action {
>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>  	BLK_ZONE_MGMT_RESET	= 0x4,
> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>  };
>  
>  /**
> 

As mentioned in previous email, the usefulness of this is dubious. Please
elaborate in the commit message. Sure NVMe ZNS defines this and we can support
it. But without a good use case, what is the point ?

scsi SD driver will return BLK_STS_NOTSUPP if this offlining is sent to a
ZBC/ZAC drive. Not nice. Having a sysfs attribute "max_offline_zone_sectors" or
the like to indicate support by the device or not would be nicer.

Does offlining ALL zones make any sense ? Because this patch does not prevent the
use of the REQ_ZONE_ALL flags introduced in patch 2. Probably not a good idea to
allow offlining all zones, no ?
Javier González June 26, 2020, 6:08 a.m. UTC | #5
On 26.06.2020 01:34, Damien Le Moal wrote:
>On 2020/06/25 21:22, Javier González wrote:
>> From: Javier González <javier.gonz@samsung.com>
>>
>> Add support for offline transition on the zoned block device using the
>> new zone management IOCTL
>>
>> Signed-off-by: Javier González <javier.gonz@samsung.com>
>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>> ---
>>  block/blk-core.c              | 2 ++
>>  block/blk-zoned.c             | 3 +++
>>  drivers/nvme/host/core.c      | 3 +++
>>  include/linux/blk_types.h     | 3 +++
>>  include/linux/blkdev.h        | 1 -
>>  include/uapi/linux/blkzoned.h | 1 +
>>  6 files changed, 12 insertions(+), 1 deletion(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 03252af8c82c..589cbdacc5ec 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>>  	REQ_OP_NAME(ZONE_CLOSE),
>>  	REQ_OP_NAME(ZONE_FINISH),
>>  	REQ_OP_NAME(ZONE_APPEND),
>> +	REQ_OP_NAME(ZONE_OFFLINE),
>>  	REQ_OP_NAME(WRITE_SAME),
>>  	REQ_OP_NAME(WRITE_ZEROES),
>>  	REQ_OP_NAME(SCSI_IN),
>> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>>  	case REQ_OP_ZONE_OPEN:
>>  	case REQ_OP_ZONE_CLOSE:
>>  	case REQ_OP_ZONE_FINISH:
>> +	case REQ_OP_ZONE_OFFLINE:
>>  		if (!blk_queue_is_zoned(q))
>>  			goto not_supported;
>>  		break;
>> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>> index 29194388a1bb..704fc15813d1 100644
>> --- a/block/blk-zoned.c
>> +++ b/block/blk-zoned.c
>> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>  	case BLK_ZONE_MGMT_RESET:
>>  		op = REQ_OP_ZONE_RESET;
>>  		break;
>> +	case BLK_ZONE_MGMT_OFFLINE:
>> +		op = REQ_OP_ZONE_OFFLINE;
>> +		break;
>>  	default:
>>  		return -ENOTTY;
>>  	}
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index f1215523792b..5b95c81d2a2d 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>>  	case REQ_OP_ZONE_FINISH:
>>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>>  		break;
>> +	case REQ_OP_ZONE_OFFLINE:
>> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
>> +		break;
>>  	case REQ_OP_WRITE_ZEROES:
>>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>>  		break;
>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>> index 16b57fb2b99c..b3921263c3dd 100644
>> --- a/include/linux/blk_types.h
>> +++ b/include/linux/blk_types.h
>> @@ -316,6 +316,8 @@ enum req_opf {
>>  	REQ_OP_ZONE_FINISH	= 12,
>>  	/* write data at the current zone write pointer */
>>  	REQ_OP_ZONE_APPEND	= 13,
>> +	/* Transition a zone to offline */
>> +	REQ_OP_ZONE_OFFLINE	= 14,
>>
>>  	/* SCSI passthrough using struct scsi_request */
>>  	REQ_OP_SCSI_IN		= 32,
>> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>>  	case REQ_OP_ZONE_OPEN:
>>  	case REQ_OP_ZONE_CLOSE:
>>  	case REQ_OP_ZONE_FINISH:
>> +	case REQ_OP_ZONE_OFFLINE:
>>  		return true;
>>  	default:
>>  		return false;
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index bd8521f94dc4..8308d8a3720b 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>>  				  unsigned int cmd, unsigned long arg);
>>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>  				  unsigned int cmd, unsigned long arg);
>> -
>>  #else /* CONFIG_BLK_DEV_ZONED */
>>
>>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
>> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>> index a8c89fe58f97..d0978ee10fc7 100644
>> --- a/include/uapi/linux/blkzoned.h
>> +++ b/include/uapi/linux/blkzoned.h
>> @@ -155,6 +155,7 @@ enum blk_zone_action {
>>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>>  	BLK_ZONE_MGMT_RESET	= 0x4,
>> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>>  };
>>
>>  /**
>>
>
>As mentioned in previous email, the usefulness of this is dubious. Please
>elaborate in the commit message. Sure NVMe ZNS defines this and we can support
>it. But without a good use case, what is the point ?

The use case is to transition zones in a read-only state to offline when we
are done moving valid data. It is easier to explicitly manage zones
that are not usable by having them all under the offline state.

>
>scsi SD driver will return BLK_STS_NOTSUPP if this offlining is sent to a
>ZBC/ZAC drive. Not nice. Having a sysfs attribute "max_offline_zone_sectors" or
>the like to indicate support by the device or not would be nicer.

We can do that.

>
>Does offling ALL zones make any sense ? Because this patch does not prevent the
>use of the REQ_ZONE_ALL flags introduced in patch 2. Probably not a good idea to
>allow offlining all zones, no ?

AFAIK the transition to offline is only valid when coming from a
read-only state. I did think of adding a check, but I can see that other
transitions go directly to the driver and then the device, so I decided
to follow the same model. If you think it is better, we can add the
check.

Javier
Javier González June 26, 2020, 6:18 a.m. UTC | #6
On 26.06.2020 01:14, Damien Le Moal wrote:
>On 2020/06/26 4:48, Javier González wrote:
>> On 25.06.2020 16:12, Matias Bjørling wrote:
>>> On 25/06/2020 14.21, Javier González wrote:
>>>> From: Javier González <javier.gonz@samsung.com>
>>>>
>>>> Add support for offline transition on the zoned block device using the
>>>> new zone management IOCTL
>>>>
>>>> Signed-off-by: Javier González <javier.gonz@samsung.com>
>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>> ---
>>>>  block/blk-core.c              | 2 ++
>>>>  block/blk-zoned.c             | 3 +++
>>>>  drivers/nvme/host/core.c      | 3 +++
>>>>  include/linux/blk_types.h     | 3 +++
>>>>  include/linux/blkdev.h        | 1 -
>>>>  include/uapi/linux/blkzoned.h | 1 +
>>>>  6 files changed, 12 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>> index 03252af8c82c..589cbdacc5ec 100644
>>>> --- a/block/blk-core.c
>>>> +++ b/block/blk-core.c
>>>> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>>>>  	REQ_OP_NAME(ZONE_CLOSE),
>>>>  	REQ_OP_NAME(ZONE_FINISH),
>>>>  	REQ_OP_NAME(ZONE_APPEND),
>>>> +	REQ_OP_NAME(ZONE_OFFLINE),
>>>>  	REQ_OP_NAME(WRITE_SAME),
>>>>  	REQ_OP_NAME(WRITE_ZEROES),
>>>>  	REQ_OP_NAME(SCSI_IN),
>>>> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>>>>  	case REQ_OP_ZONE_OPEN:
>>>>  	case REQ_OP_ZONE_CLOSE:
>>>>  	case REQ_OP_ZONE_FINISH:
>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>  		if (!blk_queue_is_zoned(q))
>>>>  			goto not_supported;
>>>>  		break;
>>>> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>>>> index 29194388a1bb..704fc15813d1 100644
>>>> --- a/block/blk-zoned.c
>>>> +++ b/block/blk-zoned.c
>>>> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>>  	case BLK_ZONE_MGMT_RESET:
>>>>  		op = REQ_OP_ZONE_RESET;
>>>>  		break;
>>>> +	case BLK_ZONE_MGMT_OFFLINE:
>>>> +		op = REQ_OP_ZONE_OFFLINE;
>>>> +		break;
>>>>  	default:
>>>>  		return -ENOTTY;
>>>>  	}
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index f1215523792b..5b95c81d2a2d 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>>>>  	case REQ_OP_ZONE_FINISH:
>>>>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>>>>  		break;
>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
>>>> +		break;
>>>>  	case REQ_OP_WRITE_ZEROES:
>>>>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>  		break;
>>>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>>> index 16b57fb2b99c..b3921263c3dd 100644
>>>> --- a/include/linux/blk_types.h
>>>> +++ b/include/linux/blk_types.h
>>>> @@ -316,6 +316,8 @@ enum req_opf {
>>>>  	REQ_OP_ZONE_FINISH	= 12,
>>>>  	/* write data at the current zone write pointer */
>>>>  	REQ_OP_ZONE_APPEND	= 13,
>>>> +	/* Transition a zone to offline */
>>>> +	REQ_OP_ZONE_OFFLINE	= 14,
>>>>  	/* SCSI passthrough using struct scsi_request */
>>>>  	REQ_OP_SCSI_IN		= 32,
>>>> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>>>>  	case REQ_OP_ZONE_OPEN:
>>>>  	case REQ_OP_ZONE_CLOSE:
>>>>  	case REQ_OP_ZONE_FINISH:
>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>  		return true;
>>>>  	default:
>>>>  		return false;
>>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>>> index bd8521f94dc4..8308d8a3720b 100644
>>>> --- a/include/linux/blkdev.h
>>>> +++ b/include/linux/blkdev.h
>>>> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>>>>  				  unsigned int cmd, unsigned long arg);
>>>>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>>  				  unsigned int cmd, unsigned long arg);
>>>> -
>>>>  #else /* CONFIG_BLK_DEV_ZONED */
>>>>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
>>>> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>>>> index a8c89fe58f97..d0978ee10fc7 100644
>>>> --- a/include/uapi/linux/blkzoned.h
>>>> +++ b/include/uapi/linux/blkzoned.h
>>>> @@ -155,6 +155,7 @@ enum blk_zone_action {
>>>>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>>>>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>>>>  	BLK_ZONE_MGMT_RESET	= 0x4,
>>>> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>>>>  };
>>>>  /**
>>>
>>> I am not sure this makes sense to expose through the kernel zone api.
>>> One of the goals of the kernel zone API is to be a layer that provides
>>> an unified zone model across SMR HDDs and ZNS SSDs. The offline zone
>>> operation, as defined in the ZNS specification, does not have an
>>> equivalent in SMR HDDs (ZAC/ZBC).
>>>
>>> This is different from the Zone Capacity change, where the zone
>>> capacity simply was zone size for SMR HDDs. Making it easy to support.
>>> That is not the same for ZAC/ZBC, that does not offer the offline
>>> operation to transition zones in read only state to offline state.
>>
>> I agree that an unified interface is desirable. However, the truth is
that ZAC/ZBC are different, and will differ more and more as time goes
>> by. We can deal with the differences at the driver level or with checks
>> at the API level, but limiting ZNS with ZAC/ZBC is a hard constraint.
>
>As long as you keep ZNS namespace report itself as being "host-managed" like
>ZBC/ZAC disks, we need the consistency and common interface. If you break that,
>the meaning of the zoned model queue attribute disappears and an application or
>in-kernel user cannot rely on this model anymore to know how the drive will behave.

I agree. The API should be clean and common, but that should not prevent
extensions to ZAC/ZBC or ZNS specifics. The suggestions you propose in
the other patches make sense to do this properly.

>
>> Note too that I chose to only support this particular transition on the
>> new management IOCTL to avoid confusion for existing ZAC/ZBC users.
>>
>> It would be good to clarify what is the plan for kernel APIs moving
>> forward, as I believe there is a general desire to support new ZNS
>> features, which will not necessarily be replicated in SMR drives.
>
>What the drive is supposed to support and its behavior is determined by the
>zoned model. ZNS standard was written so that most things have an equivalent
>with ZBC/ZAC, e.g. the zone state machine is nearly identical. Differences are
>either emulated (e.g. zone append scsi emulation), or not supported (e.g. zone
>capacity change) so that the kernel follows the same pattern and maintains a
>coherent behavior between device protocols for the host-managed model.

Yes.

>
>Think of a file system, or any other in-kernel user. If they have to change
>their code based on the device type (NVMe vs SCSI), then the zoned block device
>interface is broken. Right now, that is not the case, everything works equally
>well on ZNS and SCSI, modulo the need by a user for conventional zones that ZNS
>does not define. But that is still consistent with the host-managed model since
>conventional zones are optional.

I think this is a very nice goal, but I do believe we will not be able
to keep a 100% consistent behavior. We will have new features on either
of the specs that do not make sense on the other and we will have to
deal with them. We can deal with this as generic optional features, but
at the end of the day, applications will need to check whether the
feature is selected or not.

This said, I agree that we need a good way to communicate this, and the
suggestions you made with sysfs parameters and flags make sense to me.

>
>For this particular patch, there is currently no in-kernel user, and it is not
>clear how this will be useful to applications. At least please clarify this. And
>most likely, similarly to discard etc operations that are optional, having a
>sysfs attribute and in-kernel API indicating if the drive supports offlining
>zones will be needed. Otherwise, the caller will have to play with error codes
>to understand if the drive does not support the command or if it is supported
>but the command failed. Not nice. Better to know before issuing the command.

Makes sense. See the reply on the patch itself.

Javier
Damien Le Moal June 26, 2020, 6:42 a.m. UTC | #7
On 2020/06/26 15:09, Javier González wrote:
> On 26.06.2020 01:34, Damien Le Moal wrote:
>> On 2020/06/25 21:22, Javier González wrote:
>>> From: Javier González <javier.gonz@samsung.com>
>>>
>>> Add support for offline transition on the zoned block device using the
>>> new zone management IOCTL
>>>
>>> Signed-off-by: Javier González <javier.gonz@samsung.com>
>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>> ---
>>>  block/blk-core.c              | 2 ++
>>>  block/blk-zoned.c             | 3 +++
>>>  drivers/nvme/host/core.c      | 3 +++
>>>  include/linux/blk_types.h     | 3 +++
>>>  include/linux/blkdev.h        | 1 -
>>>  include/uapi/linux/blkzoned.h | 1 +
>>>  6 files changed, 12 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 03252af8c82c..589cbdacc5ec 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>>>  	REQ_OP_NAME(ZONE_CLOSE),
>>>  	REQ_OP_NAME(ZONE_FINISH),
>>>  	REQ_OP_NAME(ZONE_APPEND),
>>> +	REQ_OP_NAME(ZONE_OFFLINE),
>>>  	REQ_OP_NAME(WRITE_SAME),
>>>  	REQ_OP_NAME(WRITE_ZEROES),
>>>  	REQ_OP_NAME(SCSI_IN),
>>> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>>>  	case REQ_OP_ZONE_OPEN:
>>>  	case REQ_OP_ZONE_CLOSE:
>>>  	case REQ_OP_ZONE_FINISH:
>>> +	case REQ_OP_ZONE_OFFLINE:
>>>  		if (!blk_queue_is_zoned(q))
>>>  			goto not_supported;
>>>  		break;
>>> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>>> index 29194388a1bb..704fc15813d1 100644
>>> --- a/block/blk-zoned.c
>>> +++ b/block/blk-zoned.c
>>> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>  	case BLK_ZONE_MGMT_RESET:
>>>  		op = REQ_OP_ZONE_RESET;
>>>  		break;
>>> +	case BLK_ZONE_MGMT_OFFLINE:
>>> +		op = REQ_OP_ZONE_OFFLINE;
>>> +		break;
>>>  	default:
>>>  		return -ENOTTY;
>>>  	}
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index f1215523792b..5b95c81d2a2d 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>>>  	case REQ_OP_ZONE_FINISH:
>>>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>>>  		break;
>>> +	case REQ_OP_ZONE_OFFLINE:
>>> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
>>> +		break;
>>>  	case REQ_OP_WRITE_ZEROES:
>>>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>  		break;
>>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>> index 16b57fb2b99c..b3921263c3dd 100644
>>> --- a/include/linux/blk_types.h
>>> +++ b/include/linux/blk_types.h
>>> @@ -316,6 +316,8 @@ enum req_opf {
>>>  	REQ_OP_ZONE_FINISH	= 12,
>>>  	/* write data at the current zone write pointer */
>>>  	REQ_OP_ZONE_APPEND	= 13,
>>> +	/* Transition a zone to offline */
>>> +	REQ_OP_ZONE_OFFLINE	= 14,
>>>
>>>  	/* SCSI passthrough using struct scsi_request */
>>>  	REQ_OP_SCSI_IN		= 32,
>>> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>>>  	case REQ_OP_ZONE_OPEN:
>>>  	case REQ_OP_ZONE_CLOSE:
>>>  	case REQ_OP_ZONE_FINISH:
>>> +	case REQ_OP_ZONE_OFFLINE:
>>>  		return true;
>>>  	default:
>>>  		return false;
>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>> index bd8521f94dc4..8308d8a3720b 100644
>>> --- a/include/linux/blkdev.h
>>> +++ b/include/linux/blkdev.h
>>> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>>>  				  unsigned int cmd, unsigned long arg);
>>>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>  				  unsigned int cmd, unsigned long arg);
>>> -
>>>  #else /* CONFIG_BLK_DEV_ZONED */
>>>
>>>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
>>> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>>> index a8c89fe58f97..d0978ee10fc7 100644
>>> --- a/include/uapi/linux/blkzoned.h
>>> +++ b/include/uapi/linux/blkzoned.h
>>> @@ -155,6 +155,7 @@ enum blk_zone_action {
>>>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>>>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>>>  	BLK_ZONE_MGMT_RESET	= 0x4,
>>> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>>>  };
>>>
>>>  /**
>>>
>>
>> As mentioned in previous email, the usefulness of this is dubious. Please
>> elaborate in the commit message. Sure NVMe ZNS defines this and we can support
>> it. But without a good use case, what is the point ?
> 
> Use case is to transition zones in read-only state to offline when we
> are done moving valid data. It is easier to explicitly managing zones
> that are not usable by having all under the offline state.

Then adding a simple BLKZONEOFFLINE ioctl, similar to open, close, finish and
reset, would be enough. No need for all the new zone management ioctl with flags
plumbing.

> 
>>
>> scsi SD driver will return BLK_STS_NOTSUPP if this offlining is sent to a
>> ZBC/ZAC drive. Not nice. Having a sysfs attribute "max_offline_zone_sectors" or
>> the like to indicate support by the device or not would be nicer.
> 
> We can do that.
> 
>>
>> Does offlining ALL zones make any sense ? Because this patch does not prevent the
>> use of the REQ_ZONE_ALL flags introduced in patch 2. Probably not a good idea to
>> allow offlining all zones, no ?
> 
> AFAIK the transition to offline is only valid when coming from a
> read-only state. I did think of adding a check, but I can see that other
> transitions go directly to the driver and then the device, so I decided
> to follow the same model. If you think it is better, we can add the
> check.

My point was that the REQ_ZONE_ALL flag would make no sense for offlining zones
but this patch does not have anything checking that. There is no point in
sending a command that is known to be incorrect to the drive...

> 
> Javier
>
Javier González June 26, 2020, 6:58 a.m. UTC | #8
On 26.06.2020 06:42, Damien Le Moal wrote:
>On 2020/06/26 15:09, Javier González wrote:
>> On 26.06.2020 01:34, Damien Le Moal wrote:
>>> On 2020/06/25 21:22, Javier González wrote:
>>>> From: Javier González <javier.gonz@samsung.com>
>>>>
>>>> Add support for offline transition on the zoned block device using the
>>>> new zone management IOCTL
>>>>
>>>> Signed-off-by: Javier González <javier.gonz@samsung.com>
>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>> ---
>>>>  block/blk-core.c              | 2 ++
>>>>  block/blk-zoned.c             | 3 +++
>>>>  drivers/nvme/host/core.c      | 3 +++
>>>>  include/linux/blk_types.h     | 3 +++
>>>>  include/linux/blkdev.h        | 1 -
>>>>  include/uapi/linux/blkzoned.h | 1 +
>>>>  6 files changed, 12 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>> index 03252af8c82c..589cbdacc5ec 100644
>>>> --- a/block/blk-core.c
>>>> +++ b/block/blk-core.c
>>>> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>>>>  	REQ_OP_NAME(ZONE_CLOSE),
>>>>  	REQ_OP_NAME(ZONE_FINISH),
>>>>  	REQ_OP_NAME(ZONE_APPEND),
>>>> +	REQ_OP_NAME(ZONE_OFFLINE),
>>>>  	REQ_OP_NAME(WRITE_SAME),
>>>>  	REQ_OP_NAME(WRITE_ZEROES),
>>>>  	REQ_OP_NAME(SCSI_IN),
>>>> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>>>>  	case REQ_OP_ZONE_OPEN:
>>>>  	case REQ_OP_ZONE_CLOSE:
>>>>  	case REQ_OP_ZONE_FINISH:
>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>  		if (!blk_queue_is_zoned(q))
>>>>  			goto not_supported;
>>>>  		break;
>>>> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>>>> index 29194388a1bb..704fc15813d1 100644
>>>> --- a/block/blk-zoned.c
>>>> +++ b/block/blk-zoned.c
>>>> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>>  	case BLK_ZONE_MGMT_RESET:
>>>>  		op = REQ_OP_ZONE_RESET;
>>>>  		break;
>>>> +	case BLK_ZONE_MGMT_OFFLINE:
>>>> +		op = REQ_OP_ZONE_OFFLINE;
>>>> +		break;
>>>>  	default:
>>>>  		return -ENOTTY;
>>>>  	}
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index f1215523792b..5b95c81d2a2d 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>>>>  	case REQ_OP_ZONE_FINISH:
>>>>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>>>>  		break;
>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
>>>> +		break;
>>>>  	case REQ_OP_WRITE_ZEROES:
>>>>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>  		break;
>>>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>>> index 16b57fb2b99c..b3921263c3dd 100644
>>>> --- a/include/linux/blk_types.h
>>>> +++ b/include/linux/blk_types.h
>>>> @@ -316,6 +316,8 @@ enum req_opf {
>>>>  	REQ_OP_ZONE_FINISH	= 12,
>>>>  	/* write data at the current zone write pointer */
>>>>  	REQ_OP_ZONE_APPEND	= 13,
>>>> +	/* Transition a zone to offline */
>>>> +	REQ_OP_ZONE_OFFLINE	= 14,
>>>>
>>>>  	/* SCSI passthrough using struct scsi_request */
>>>>  	REQ_OP_SCSI_IN		= 32,
>>>> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>>>>  	case REQ_OP_ZONE_OPEN:
>>>>  	case REQ_OP_ZONE_CLOSE:
>>>>  	case REQ_OP_ZONE_FINISH:
>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>  		return true;
>>>>  	default:
>>>>  		return false;
>>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>>> index bd8521f94dc4..8308d8a3720b 100644
>>>> --- a/include/linux/blkdev.h
>>>> +++ b/include/linux/blkdev.h
>>>> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>>>>  				  unsigned int cmd, unsigned long arg);
>>>>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>>  				  unsigned int cmd, unsigned long arg);
>>>> -
>>>>  #else /* CONFIG_BLK_DEV_ZONED */
>>>>
>>>>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
>>>> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>>>> index a8c89fe58f97..d0978ee10fc7 100644
>>>> --- a/include/uapi/linux/blkzoned.h
>>>> +++ b/include/uapi/linux/blkzoned.h
>>>> @@ -155,6 +155,7 @@ enum blk_zone_action {
>>>>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>>>>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>>>>  	BLK_ZONE_MGMT_RESET	= 0x4,
>>>> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>>>>  };
>>>>
>>>>  /**
>>>>
>>>
>>> As mentioned in previous email, the usefulness of this is dubious. Please
>>> elaborate in the commit message. Sure NVMe ZNS defines this and we can support
>>> it. But without a good use case, what is the point ?
>>
>> Use case is to transition zones in read-only state to offline when we
>> are done moving valid data. It is easier to explicitly managing zones
>> that are not usable by having all under the offline state.
>
>Then adding a simple BLKZONEOFFLINE ioctl, similar to open, close, finish and
>reset, would be enough. No need for all the new zone management ioctl with flags
>plumbing.

Ok. We can add that then.

Note that zone management is not motivated by this use case at all, but
it made sense to implement it here instead of as a new BLKZONEOFFLINE
IOCTL as ZAC/ZBC users will not be able to use it either way.

>
>>
>>>
>>> scsi SD driver will return BLK_STS_NOTSUPP if this offlining is sent to a
>>> ZBC/ZAC drive. Not nice. Having a sysfs attribute "max_offline_zone_sectors" or
>>> the like to indicate support by the device or not would be nicer.
>>
>> We can do that.
>>
>>>
>>>> Does offlining ALL zones make any sense ? Because this patch does not prevent the
>>> use of the REQ_ZONE_ALL flags introduced in patch 2. Probably not a good idea to
>>> allow offlining all zones, no ?
>>
>> AFAIK the transition to offline is only valid when coming from a
>> read-only state. I did think of adding a check, but I can see that other
>> transitions go directly to the driver and then the device, so I decided
>> to follow the same model. If you think it is better, we can add the
>> check.
>
>My point was that the REQ_ZONE_ALL flag would make no sense for offlining zones
>but this patch does not have anything checking that. There is no point in
>sending a command that is known to be incorrect to the drive...

I will add some extra checks then to fail early. I assume these should
be in the NVMe driver as it is NVMe-specific, right?

Javier
Damien Le Moal June 26, 2020, 7:17 a.m. UTC | #9
On 2020/06/26 15:58, Javier González wrote:
> On 26.06.2020 06:42, Damien Le Moal wrote:
>> On 2020/06/26 15:09, Javier González wrote:
>>> On 26.06.2020 01:34, Damien Le Moal wrote:
>>>> On 2020/06/25 21:22, Javier González wrote:
>>>>> From: Javier González <javier.gonz@samsung.com>
>>>>>
>>>>> Add support for offline transition on the zoned block device using the
>>>>> new zone management IOCTL
>>>>>
>>>>> Signed-off-by: Javier González <javier.gonz@samsung.com>
>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>> ---
>>>>>  block/blk-core.c              | 2 ++
>>>>>  block/blk-zoned.c             | 3 +++
>>>>>  drivers/nvme/host/core.c      | 3 +++
>>>>>  include/linux/blk_types.h     | 3 +++
>>>>>  include/linux/blkdev.h        | 1 -
>>>>>  include/uapi/linux/blkzoned.h | 1 +
>>>>>  6 files changed, 12 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>>> index 03252af8c82c..589cbdacc5ec 100644
>>>>> --- a/block/blk-core.c
>>>>> +++ b/block/blk-core.c
>>>>> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>>>>>  	REQ_OP_NAME(ZONE_CLOSE),
>>>>>  	REQ_OP_NAME(ZONE_FINISH),
>>>>>  	REQ_OP_NAME(ZONE_APPEND),
>>>>> +	REQ_OP_NAME(ZONE_OFFLINE),
>>>>>  	REQ_OP_NAME(WRITE_SAME),
>>>>>  	REQ_OP_NAME(WRITE_ZEROES),
>>>>>  	REQ_OP_NAME(SCSI_IN),
>>>>> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>>>>>  	case REQ_OP_ZONE_OPEN:
>>>>>  	case REQ_OP_ZONE_CLOSE:
>>>>>  	case REQ_OP_ZONE_FINISH:
>>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>>  		if (!blk_queue_is_zoned(q))
>>>>>  			goto not_supported;
>>>>>  		break;
>>>>> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>>>>> index 29194388a1bb..704fc15813d1 100644
>>>>> --- a/block/blk-zoned.c
>>>>> +++ b/block/blk-zoned.c
>>>>> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>>>  	case BLK_ZONE_MGMT_RESET:
>>>>>  		op = REQ_OP_ZONE_RESET;
>>>>>  		break;
>>>>> +	case BLK_ZONE_MGMT_OFFLINE:
>>>>> +		op = REQ_OP_ZONE_OFFLINE;
>>>>> +		break;
>>>>>  	default:
>>>>>  		return -ENOTTY;
>>>>>  	}
>>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>>> index f1215523792b..5b95c81d2a2d 100644
>>>>> --- a/drivers/nvme/host/core.c
>>>>> +++ b/drivers/nvme/host/core.c
>>>>> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>>>>>  	case REQ_OP_ZONE_FINISH:
>>>>>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>>>>>  		break;
>>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
>>>>> +		break;
>>>>>  	case REQ_OP_WRITE_ZEROES:
>>>>>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>>  		break;
>>>>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>>>> index 16b57fb2b99c..b3921263c3dd 100644
>>>>> --- a/include/linux/blk_types.h
>>>>> +++ b/include/linux/blk_types.h
>>>>> @@ -316,6 +316,8 @@ enum req_opf {
>>>>>  	REQ_OP_ZONE_FINISH	= 12,
>>>>>  	/* write data at the current zone write pointer */
>>>>>  	REQ_OP_ZONE_APPEND	= 13,
>>>>> +	/* Transition a zone to offline */
>>>>> +	REQ_OP_ZONE_OFFLINE	= 14,
>>>>>
>>>>>  	/* SCSI passthrough using struct scsi_request */
>>>>>  	REQ_OP_SCSI_IN		= 32,
>>>>> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>>>>>  	case REQ_OP_ZONE_OPEN:
>>>>>  	case REQ_OP_ZONE_CLOSE:
>>>>>  	case REQ_OP_ZONE_FINISH:
>>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>>  		return true;
>>>>>  	default:
>>>>>  		return false;
>>>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>>>> index bd8521f94dc4..8308d8a3720b 100644
>>>>> --- a/include/linux/blkdev.h
>>>>> +++ b/include/linux/blkdev.h
>>>>> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>>>>>  				  unsigned int cmd, unsigned long arg);
>>>>>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>>>  				  unsigned int cmd, unsigned long arg);
>>>>> -
>>>>>  #else /* CONFIG_BLK_DEV_ZONED */
>>>>>
>>>>>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
>>>>> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>>>>> index a8c89fe58f97..d0978ee10fc7 100644
>>>>> --- a/include/uapi/linux/blkzoned.h
>>>>> +++ b/include/uapi/linux/blkzoned.h
>>>>> @@ -155,6 +155,7 @@ enum blk_zone_action {
>>>>>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>>>>>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>>>>>  	BLK_ZONE_MGMT_RESET	= 0x4,
>>>>> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>>>>>  };
>>>>>
>>>>>  /**
>>>>>
>>>>
>>>> As mentioned in previous email, the usefulness of this is dubious. Please
>>>> elaborate in the commit message. Sure NVMe ZNS defines this and we can support
>>>> it. But without a good use case, what is the point ?
>>>
>>> Use case is to transition zones in read-only state to offline when we
>>> are done moving valid data. It is easier to explicitly managing zones
>>> that are not usable by having all under the offline state.
>>
>> Then adding a simple BLKZONEOFFLINE ioctl, similar to open, close, finish and
>> reset, would be enough. No need for all the new zone management ioctl with flags
>> plumbing.
> 
> Ok. We can add that then.
> 
> Note that zone management is not motivated by this use case at all, but
> it made sense to implement it here instead of as a new BLKZONEOFFLINE
> IOCTL as ZAC/ZBC users will not be able to use it either way.

Sure, that is fine. We could actually add that to sd_zbc.c since we have zone
tracking there. A read-only zone can be reported as offline to sync-up with zns.
The value of it is dubious though as most applications will treat read-only and
offline zones the same way: as unusable. That is what zonefs does.

> 
>>
>>>
>>>>
>>>> scsi SD driver will return BLK_STS_NOTSUPP if this offlining is sent to a
>>>> ZBC/ZAC drive. Not nice. Having a sysfs attribute "max_offline_zone_sectors" or
>>>> the like to indicate support by the device or not would be nicer.
>>>
>>> We can do that.
>>>
>>>>
>>>>> Does offlining ALL zones make any sense ? Because this patch does not prevent the
>>>> use of the REQ_ZONE_ALL flags introduced in patch 2. Probably not a good idea to
>>>> allow offlining all zones, no ?
>>>
>>> AFAIK the transition to offline is only valid when coming from a
>>> read-only state. I did think of adding a check, but I can see that other
>>> transitions go directly to the driver and then the device, so I decided
>>> to follow the same model. If you think it is better, we can add the
>>> check.
>>
>> My point was that the REQ_ZONE_ALL flag would make no sense for offlining zones
>> but this patch does not have anything checking that. There is no point in
>> sending a command that is known to be incorrect to the drive...
> 
> I will add some extra checks then to fail early. I assume these should
> be in the NVMe driver as it is NVMe-specific, right?

If it is a simple BLKZONEOFFLINE ioctl, it can be processed exactly like open,
close and finish, using blkdev_zone_mgmt(). Calling that one for a range of
sectors of more than one zone will likely not make any sense most of the time,
but that is allowed for all other ops, so I guess you can keep it as is for
offline too. blkdev_zone_mgmt() will actually not need any change. You will only
need to wire the ioctl path and update op_is_zone_mgmt(). That's it. Simple that
way.

> 
> Javier
>
Javier González June 26, 2020, 7:26 a.m. UTC | #10
On 26.06.2020 07:17, Damien Le Moal wrote:
>On 2020/06/26 15:58, Javier González wrote:
>> On 26.06.2020 06:42, Damien Le Moal wrote:
>>> On 2020/06/26 15:09, Javier González wrote:
>>>> On 26.06.2020 01:34, Damien Le Moal wrote:
>>>>> On 2020/06/25 21:22, Javier González wrote:
>>>>>> From: Javier González <javier.gonz@samsung.com>
>>>>>>
>>>>>> Add support for offline transition on the zoned block device using the
>>>>>> new zone management IOCTL
>>>>>>
>>>>>> Signed-off-by: Javier González <javier.gonz@samsung.com>
>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>> ---
>>>>>>  block/blk-core.c              | 2 ++
>>>>>>  block/blk-zoned.c             | 3 +++
>>>>>>  drivers/nvme/host/core.c      | 3 +++
>>>>>>  include/linux/blk_types.h     | 3 +++
>>>>>>  include/linux/blkdev.h        | 1 -
>>>>>>  include/uapi/linux/blkzoned.h | 1 +
>>>>>>  6 files changed, 12 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>>>> index 03252af8c82c..589cbdacc5ec 100644
>>>>>> --- a/block/blk-core.c
>>>>>> +++ b/block/blk-core.c
>>>>>> @@ -140,6 +140,7 @@ static const char *const blk_op_name[] = {
>>>>>>  	REQ_OP_NAME(ZONE_CLOSE),
>>>>>>  	REQ_OP_NAME(ZONE_FINISH),
>>>>>>  	REQ_OP_NAME(ZONE_APPEND),
>>>>>> +	REQ_OP_NAME(ZONE_OFFLINE),
>>>>>>  	REQ_OP_NAME(WRITE_SAME),
>>>>>>  	REQ_OP_NAME(WRITE_ZEROES),
>>>>>>  	REQ_OP_NAME(SCSI_IN),
>>>>>> @@ -1030,6 +1031,7 @@ generic_make_request_checks(struct bio *bio)
>>>>>>  	case REQ_OP_ZONE_OPEN:
>>>>>>  	case REQ_OP_ZONE_CLOSE:
>>>>>>  	case REQ_OP_ZONE_FINISH:
>>>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>>>  		if (!blk_queue_is_zoned(q))
>>>>>>  			goto not_supported;
>>>>>>  		break;
>>>>>> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>>>>>> index 29194388a1bb..704fc15813d1 100644
>>>>>> --- a/block/blk-zoned.c
>>>>>> +++ b/block/blk-zoned.c
>>>>>> @@ -416,6 +416,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>>>>  	case BLK_ZONE_MGMT_RESET:
>>>>>>  		op = REQ_OP_ZONE_RESET;
>>>>>>  		break;
>>>>>> +	case BLK_ZONE_MGMT_OFFLINE:
>>>>>> +		op = REQ_OP_ZONE_OFFLINE;
>>>>>> +		break;
>>>>>>  	default:
>>>>>>  		return -ENOTTY;
>>>>>>  	}
>>>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>>>> index f1215523792b..5b95c81d2a2d 100644
>>>>>> --- a/drivers/nvme/host/core.c
>>>>>> +++ b/drivers/nvme/host/core.c
>>>>>> @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
>>>>>>  	case REQ_OP_ZONE_FINISH:
>>>>>>  		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
>>>>>>  		break;
>>>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>>> +		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
>>>>>> +		break;
>>>>>>  	case REQ_OP_WRITE_ZEROES:
>>>>>>  		ret = nvme_setup_write_zeroes(ns, req, cmd);
>>>>>>  		break;
>>>>>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>>>>> index 16b57fb2b99c..b3921263c3dd 100644
>>>>>> --- a/include/linux/blk_types.h
>>>>>> +++ b/include/linux/blk_types.h
>>>>>> @@ -316,6 +316,8 @@ enum req_opf {
>>>>>>  	REQ_OP_ZONE_FINISH	= 12,
>>>>>>  	/* write data at the current zone write pointer */
>>>>>>  	REQ_OP_ZONE_APPEND	= 13,
>>>>>> +	/* Transition a zone to offline */
>>>>>> +	REQ_OP_ZONE_OFFLINE	= 14,
>>>>>>
>>>>>>  	/* SCSI passthrough using struct scsi_request */
>>>>>>  	REQ_OP_SCSI_IN		= 32,
>>>>>> @@ -456,6 +458,7 @@ static inline bool op_is_zone_mgmt(enum req_opf op)
>>>>>>  	case REQ_OP_ZONE_OPEN:
>>>>>>  	case REQ_OP_ZONE_CLOSE:
>>>>>>  	case REQ_OP_ZONE_FINISH:
>>>>>> +	case REQ_OP_ZONE_OFFLINE:
>>>>>>  		return true;
>>>>>>  	default:
>>>>>>  		return false;
>>>>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>>>>> index bd8521f94dc4..8308d8a3720b 100644
>>>>>> --- a/include/linux/blkdev.h
>>>>>> +++ b/include/linux/blkdev.h
>>>>>> @@ -372,7 +372,6 @@ extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
>>>>>>  				  unsigned int cmd, unsigned long arg);
>>>>>>  extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
>>>>>>  				  unsigned int cmd, unsigned long arg);
>>>>>> -
>>>>>>  #else /* CONFIG_BLK_DEV_ZONED */
>>>>>>
>>>>>>  static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
>>>>>> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>>>>>> index a8c89fe58f97..d0978ee10fc7 100644
>>>>>> --- a/include/uapi/linux/blkzoned.h
>>>>>> +++ b/include/uapi/linux/blkzoned.h
>>>>>> @@ -155,6 +155,7 @@ enum blk_zone_action {
>>>>>>  	BLK_ZONE_MGMT_FINISH	= 0x2,
>>>>>>  	BLK_ZONE_MGMT_OPEN	= 0x3,
>>>>>>  	BLK_ZONE_MGMT_RESET	= 0x4,
>>>>>> +	BLK_ZONE_MGMT_OFFLINE	= 0x5,
>>>>>>  };
>>>>>>
>>>>>>  /**
>>>>>>
>>>>>
>>>>> As mentioned in previous email, the usefulness of this is dubious. Please
>>>>> elaborate in the commit message. Sure NVMe ZNS defines this and we can support
>>>>> it. But without a good use case, what is the point ?
>>>>
>>>> Use case is to transition zones in read-only state to offline when we
>>>> are done moving valid data. It is easier to explicitly managing zones
>>>> that are not usable by having all under the offline state.
>>>
>>> Then adding a simple BLKZONEOFFLINE ioctl, similar to open, close, finish and
>>> reset, would be enough. No need for all the new zone management ioctl with flags
>>> plumbing.
>>
>> Ok. We can add that then.
>>
>> Note that zone management is not motivated by this use case at all, but
>> it made sense to implement it here instead of as a new BLKZONEOFFLINE
>> IOCTL as ZAC/ZBC users will not be able to use it either way.
>
>Sure, that is fine. We could actually add that to sd_zbc.c since we have zone
>tracking there. A read-only zone can be reported as offline to sync-up with zns.
>The value of it is dubious though as most applications will treat read-only and
>offline zones the same way: as unusable. That is what zonefs does.

Ok.

>
>>
>>>
>>>>
>>>>>
>>>>> scsi SD driver will return BLK_STS_NOTSUPP if this offlining is sent to a
>>>>> ZBC/ZAC drive. Not nice. Having a sysfs attribute "max_offline_zone_sectors" or
>>>>> the like to indicate support by the device or not would be nicer.
>>>>
>>>> We can do that.
>>>>
>>>>>
>>>>> Does offlining ALL zones make any sense ? Because this patch does not prevent the
>>>>> use of the REQ_ZONE_ALL flags introduced in patch 2. Probably not a good idea to
>>>>> allow offlining all zones, no ?
>>>>
>>>> AFAIK the transition to offline is only valid when coming from a
>>>> read-only state. I did think of adding a check, but I can see that other
>>>> transitions go directly to the driver and then the device, so I decided
>>>> to follow the same model. If you think it is better, we can add the
>>>> check.
>>>
>>> My point was that the REQ_ZONE_ALL flag would make no sense for offlining zones
>>> but this patch does not have anything checking that. There is no point in
>>> sending a command that is known to be incorrect to the drive...
>>
>> I will add some extra checks then to fail early. I assume these should
>> be in the NVMe driver as it is NVMe-specific, right?
>
>If it is a simple BLKZONEOFFLINE ioctl, it can be processed exactly like open,
>close and finish, using blkdev_zone_mgmt(). Calling that one for a range of
>sectors of more than one zone will likely not make any sense most of the time,
>but that is allowed for all other ops, so I guess you can keep it as is for
>offline too. blkdev_zone_mgmt() will actually not need any change. You will only
>need to wire the ioctl path and update op_is_zone_mgmt(). That's it. Simple that
>way.

Sounds good.

Javier
Christoph Hellwig June 26, 2020, 9:07 a.m. UTC | #11
On Thu, Jun 25, 2020 at 04:12:21PM +0200, Matias Bjørling wrote:
> I am not sure this makes sense to expose through the kernel zone api. One 
> of the goals of the kernel zone API is to be a layer that provides an 
> unified zone model across SMR HDDs and ZNS SSDs. The offline zone 
> operation, as defined in the ZNS specification, does not have an equivalent 
> in SMR HDDs (ZAC/ZBC).
>
> This is different from the Zone Capacity change, where the zone capacity 
> simply was zone size for SMR HDDs. Making it easy to support. That is not 
> the same for ZAC/ZBC, that does not offer the offline operation to 
> transition zones in read only state to offline state.

Bullshit.  It is exactly the same case of careful additions to the model,
which totally make sense.

The only major issue with the patch is that we need a flag to indicate
if a given device supports offlining zones before wiring it up.
Christoph Hellwig June 26, 2020, 9:11 a.m. UTC | #12
On Fri, Jun 26, 2020 at 01:14:30AM +0000, Damien Le Moal wrote:
> As long as you keep ZNS namespace report itself as being "host-managed" like
> ZBC/ZAC disks, we need the consistency and common interface. If you break that,
> the meaning of the zoned model queue attribute disappears and an application or
> in-kernel user cannot rely on this model anymore to know how the drive will behave.

We just need a way to expose to applications that new features are
supported.  Just like we did with zone capacity support.  That is why
we added the feature flags to uapi zone structure.

> Think of a file system, or any other in-kernel user. If they have to change
> their code based on the device type (NVMe vs SCSI), then the zoned block device
> interface is broken. Right now, that is not the case, everything works equally
> well on ZNS and SCSI, modulo the need by a user for conventional zones that ZNS
> do not define. But that is still consistent with the host-managed model since
> conventional zones are optional.

That is why we have the feature flag.  That user should not know the
underlying transport or spec.  But it can reliably discover "this thing
support zone capacity" or "this thing supports offline zones" or even
nasty thing like "this zone can time out when open" which are much
harder to deal with.

> For this particular patch, there is currently no in-kernel user, and it is not
> clear how this will be useful to applications. At least please clarify this. And

The main user is the ioctl.  And if you think about how offline zones are
(supposed to) be used, driving this from management tools in userspace
actually totally make sense.  Unlike for example open/close all which
just don't make sense as primitives to start with.
Damien Le Moal June 26, 2020, 9:15 a.m. UTC | #13
On 2020/06/26 18:11, hch@lst.de wrote:
> On Fri, Jun 26, 2020 at 01:14:30AM +0000, Damien Le Moal wrote:
>> As long as you keep ZNS namespace report itself as being "host-managed" like
>> ZBC/ZAC disks, we need the consistency and common interface. If you break that,
>> the meaning of the zoned model queue attribute disappears and an application or
>> in-kernel user cannot rely on this model anymore to know how the drive will behave.
> 
> We just need a way to expose to applications that new feature are
> supported.  Just like we did with zone capacity support.  That is why
> we added the feature flags to uapi zone structure.
> 
>> Think of a file system, or any other in-kernel user. If they have to change
>> their code based on the device type (NVMe vs SCSI), then the zoned block device
>> interface is broken. Right now, that is not the case, everything works equally
>> well on ZNS and SCSI, modulo the need by a user for conventional zones that ZNS
>> do not define. But that is still consistent with the host-managed model since
>> conventional zones are optional.
> 
> That is why we have the feature flag.  That user should not know the
> underlying transport or spec.  But it can reliably discover "this thing
> support zone capacity" or "this thing supports offline zones" or even
> nasty thing like "this zone can time out when open" which are much
> harder to deal with.
> 
>> For this particular patch, there is currently no in-kernel user, and it is not
>> clear how this will be useful to applications. At least please clarify this. And
> 
> The main user is the ioctl.  And if you think about how offline zones are
> (suppose to) be used driving this from management tools in userspace
> actually totally make sense.  Unlike for example open/close all which
> just don't make sense as primitives to start with.

OK. Adding a new BLKZONEOFFLINE ioctl is easy though and fits into the current
zone management plumbing well, I think. So the patch can be significantly
simplified (no need for the new zone management op function with flags).

>
Christoph Hellwig June 26, 2020, 9:17 a.m. UTC | #14
On Fri, Jun 26, 2020 at 09:15:14AM +0000, Damien Le Moal wrote:
> On 2020/06/26 18:11, hch@lst.de wrote:
> > On Fri, Jun 26, 2020 at 01:14:30AM +0000, Damien Le Moal wrote:
> >> As long as you keep ZNS namespace report itself as being "host-managed" like
> >> ZBC/ZAC disks, we need the consistency and common interface. If you break that,
> >> the meaning of the zoned model queue attribute disappears and an application or
> >> in-kernel user cannot rely on this model anymore to know how the drive will behave.
> > 
> > We just need a way to expose to applications that new feature are
> > supported.  Just like we did with zone capacity support.  That is why
> > we added the feature flags to uapi zone structure.
> > 
> >> Think of a file system, or any other in-kernel user. If they have to change
> >> their code based on the device type (NVMe vs SCSI), then the zoned block device
> >> interface is broken. Right now, that is not the case, everything works equally
> >> well on ZNS and SCSI, modulo the need by a user for conventional zones that ZNS
> >> do not define. But that is still consistent with the host-managed model since
> >> conventional zones are optional.
> > 
> > That is why we have the feature flag.  That user should not know the
> > underlying transport or spec.  But it can reliably discover "this thing
> > support zone capacity" or "this thing supports offline zones" or even
> > nasty thing like "this zone can time out when open" which are much
> > harder to deal with.
> > 
> >> For this particular patch, there is currently no in-kernel user, and it is not
> >> clear how this will be useful to applications. At least please clarify this. And
> > 
> > The main user is the ioctl.  And if you think about how offline zones are
> > (suppose to) be used driving this from management tools in userspace
> > actually totally make sense.  Unlike for example open/close all which
> > just don't make sense as primitives to start with.
> 
> OK. Adding a new BLKZONEOFFLINE ioctl is easy though and fits into the current
> zone management plumbing well, I think. So the patch can be significantly
> simplified (no need for the new zone management op function with flags).

Yes, I'm all for reusing the existing plumbing and style as much as
possible.
Javier González June 26, 2020, 10:02 a.m. UTC | #15
On 26.06.2020 11:17, hch@lst.de wrote:
>On Fri, Jun 26, 2020 at 09:15:14AM +0000, Damien Le Moal wrote:
>> On 2020/06/26 18:11, hch@lst.de wrote:
>> > On Fri, Jun 26, 2020 at 01:14:30AM +0000, Damien Le Moal wrote:
>> >> As long as you keep ZNS namespace report itself as being "host-managed" like
>> >> ZBC/ZAC disks, we need the consistency and common interface. If you break that,
>> >> the meaning of the zoned model queue attribute disappears and an application or
>> >> in-kernel user cannot rely on this model anymore to know how the drive will behave.
>> >
>> > We just need a way to expose to applications that new feature are
>> > supported.  Just like we did with zone capacity support.  That is why
>> > we added the feature flags to uapi zone structure.
>> >
>> >> Think of a file system, or any other in-kernel user. If they have to change
>> >> their code based on the device type (NVMe vs SCSI), then the zoned block device
>> >> interface is broken. Right now, that is not the case, everything works equally
>> >> well on ZNS and SCSI, modulo the need by a user for conventional zones that ZNS
>> >> do not define. But that is still consistent with the host-managed model since
>> >> conventional zones are optional.
>> >
>> > That is why we have the feature flag.  That user should not know the
>> > underlying transport or spec.  But it can reliably discover "this thing
>> > support zone capacity" or "this thing supports offline zones" or even
>> > nasty thing like "this zone can time out when open" which are much
>> > harder to deal with.
>> >
>> >> For this particular patch, there is currently no in-kernel user, and it is not
>> >> clear how this will be useful to applications. At least please clarify this. And
>> >
>> > The main user is the ioctl.  And if you think about how offline zones are
>> > (suppose to) be used driving this from management tools in userspace
>> > actually totally make sense.  Unlike for example open/close all which
>> > just don't make sense as primitives to start with.
>>
>> OK. Adding a new BLKZONEOFFLINE ioctl is easy though and fits into the current
>> zone management plumbing well, I think. So the patch can be significantly
>> simplified (no need for the new zone management op function with flags).
>
>Yes, I'm all for reusing the existing plumbing and style as much as
>possible.

OK. Will use the current path on V2.

Thanks!
Javier

Patch
diff mbox series

diff --git a/block/blk-core.c b/block/blk-core.c
index 03252af8c82c..589cbdacc5ec 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -140,6 +140,7 @@  static const char *const blk_op_name[] = {
 	REQ_OP_NAME(ZONE_CLOSE),
 	REQ_OP_NAME(ZONE_FINISH),
 	REQ_OP_NAME(ZONE_APPEND),
+	REQ_OP_NAME(ZONE_OFFLINE),
 	REQ_OP_NAME(WRITE_SAME),
 	REQ_OP_NAME(WRITE_ZEROES),
 	REQ_OP_NAME(SCSI_IN),
@@ -1030,6 +1031,7 @@  generic_make_request_checks(struct bio *bio)
 	case REQ_OP_ZONE_OPEN:
 	case REQ_OP_ZONE_CLOSE:
 	case REQ_OP_ZONE_FINISH:
+	case REQ_OP_ZONE_OFFLINE:
 		if (!blk_queue_is_zoned(q))
 			goto not_supported;
 		break;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 29194388a1bb..704fc15813d1 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -416,6 +416,9 @@  int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
 	case BLK_ZONE_MGMT_RESET:
 		op = REQ_OP_ZONE_RESET;
 		break;
+	case BLK_ZONE_MGMT_OFFLINE:
+		op = REQ_OP_ZONE_OFFLINE;
+		break;
 	default:
 		return -ENOTTY;
 	}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f1215523792b..5b95c81d2a2d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -776,6 +776,9 @@  blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 	case REQ_OP_ZONE_FINISH:
 		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
 		break;
+	case REQ_OP_ZONE_OFFLINE:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OFFLINE);
+		break;
 	case REQ_OP_WRITE_ZEROES:
 		ret = nvme_setup_write_zeroes(ns, req, cmd);
 		break;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 16b57fb2b99c..b3921263c3dd 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -316,6 +316,8 @@  enum req_opf {
 	REQ_OP_ZONE_FINISH	= 12,
 	/* write data at the current zone write pointer */
 	REQ_OP_ZONE_APPEND	= 13,
+	/* Transition a zone to offline */
+	REQ_OP_ZONE_OFFLINE	= 14,
 
 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
@@ -456,6 +458,7 @@  static inline bool op_is_zone_mgmt(enum req_opf op)
 	case REQ_OP_ZONE_OPEN:
 	case REQ_OP_ZONE_CLOSE:
 	case REQ_OP_ZONE_FINISH:
+	case REQ_OP_ZONE_OFFLINE:
 		return true;
 	default:
 		return false;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bd8521f94dc4..8308d8a3720b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -372,7 +372,6 @@  extern int blkdev_zone_ops_ioctl(struct block_device *bdev, fmode_t mode,
 				  unsigned int cmd, unsigned long arg);
 extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
 				  unsigned int cmd, unsigned long arg);
-
 #else /* CONFIG_BLK_DEV_ZONED */
 
 static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index a8c89fe58f97..d0978ee10fc7 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -155,6 +155,7 @@  enum blk_zone_action {
 	BLK_ZONE_MGMT_FINISH	= 0x2,
 	BLK_ZONE_MGMT_OPEN	= 0x3,
 	BLK_ZONE_MGMT_RESET	= 0x4,
+	BLK_ZONE_MGMT_OFFLINE	= 0x5,
 };
 
 /**