diff mbox series

[1/4] block: add zone open, close and finish support

Message ID 20190621130711.21986-2-mb@lightnvm.io (mailing list archive)
State New, archived
Headers show
Series open, close, finish zone support | expand

Commit Message

Matias Bjorling June 21, 2019, 1:07 p.m. UTC
From: Ajay Joshi <ajay.joshi@wdc.com>

Zoned block devices allows one to control zone transitions by using
explicit commands. The available transitions are:

  * Open zone: Transition a zone to open state.
  * Close zone: Transition a zone to closed state.
  * Finish zone: Transition a zone to full state.

Allow kernel to issue these transitions by introducing
blkdev_zones_mgmt_op() and add three new request opcodes:

  * REQ_IO_ZONE_OPEN, REQ_IO_ZONE_CLOSE, and REQ_OP_ZONE_FINISH

Allow user-space to issue the transitions through the following ioctls:

  * BLKOPENZONE, BLKCLOSEZONE, and BLKFINISHZONE.

Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
---
 block/blk-core.c              |  3 ++
 block/blk-zoned.c             | 51 ++++++++++++++++++++++---------
 block/ioctl.c                 |  5 ++-
 include/linux/blk_types.h     | 27 +++++++++++++++--
 include/linux/blkdev.h        | 57 ++++++++++++++++++++++++++++++-----
 include/uapi/linux/blkzoned.h | 17 +++++++++--
 6 files changed, 133 insertions(+), 27 deletions(-)

Comments

Damien Le Moal June 22, 2019, 12:51 a.m. UTC | #1
Matias,

Some comments inline below.

On 2019/06/21 22:07, Matias Bjørling wrote:
> From: Ajay Joshi <ajay.joshi@wdc.com>
> 
> Zoned block devices allows one to control zone transitions by using
> explicit commands. The available transitions are:
> 
>   * Open zone: Transition a zone to open state.
>   * Close zone: Transition a zone to closed state.
>   * Finish zone: Transition a zone to full state.
> 
> Allow kernel to issue these transitions by introducing
> blkdev_zones_mgmt_op() and add three new request opcodes:
> 
>   * REQ_IO_ZONE_OPEN, REQ_IO_ZONE_CLOSE, and REQ_OP_ZONE_FINISH
> 
> Allow user-space to issue the transitions through the following ioctls:
> 
>   * BLKOPENZONE, BLKCLOSEZONE, and BLKFINISHZONE.
> 
> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
> Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
> ---
>  block/blk-core.c              |  3 ++
>  block/blk-zoned.c             | 51 ++++++++++++++++++++++---------
>  block/ioctl.c                 |  5 ++-
>  include/linux/blk_types.h     | 27 +++++++++++++++--
>  include/linux/blkdev.h        | 57 ++++++++++++++++++++++++++++++-----
>  include/uapi/linux/blkzoned.h | 17 +++++++++--
>  6 files changed, 133 insertions(+), 27 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 8340f69670d8..c0f0dbad548d 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -897,6 +897,9 @@ generic_make_request_checks(struct bio *bio)
>  			goto not_supported;
>  		break;
>  	case REQ_OP_ZONE_RESET:
> +	case REQ_OP_ZONE_OPEN:
> +	case REQ_OP_ZONE_CLOSE:
> +	case REQ_OP_ZONE_FINISH:
>  		if (!blk_queue_is_zoned(q))
>  			goto not_supported;
>  		break;
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index ae7e91bd0618..d0c933593b93 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -201,20 +201,22 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
>  EXPORT_SYMBOL_GPL(blkdev_report_zones);
>  
>  /**
> - * blkdev_reset_zones - Reset zones write pointer
> + * blkdev_zones_mgmt_op - Perform the specified operation on the zone(s)
>   * @bdev:	Target block device
> - * @sector:	Start sector of the first zone to reset
> - * @nr_sectors:	Number of sectors, at least the length of one zone
> + * @op:		Operation to be performed on the zone(s)
> + * @sector:	Start sector of the first zone to operate on
> + * @nr_sectors:	Number of sectors, at least the length of one zone and
> + *              must be zone size aligned.
>   * @gfp_mask:	Memory allocation flags (for bio_alloc)
>   *
>   * Description:
> - *    Reset the write pointer of the zones contained in the range
> + *    Perform the specified operation contained in the range
	Perform the specified operation over the sector range
>   *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
>   *    is valid, but the specified range should not contain conventional zones.
>   */
> -int blkdev_reset_zones(struct block_device *bdev,
> -		       sector_t sector, sector_t nr_sectors,
> -		       gfp_t gfp_mask)
> +int blkdev_zones_mgmt_op(struct block_device *bdev, enum req_opf op,
> +			 sector_t sector, sector_t nr_sectors,
> +			 gfp_t gfp_mask)
>  {
>  	struct request_queue *q = bdev_get_queue(bdev);
>  	sector_t zone_sectors;
> @@ -226,6 +228,9 @@ int blkdev_reset_zones(struct block_device *bdev,
>  	if (!blk_queue_is_zoned(q))
>  		return -EOPNOTSUPP;
>  
> +	if (!op_is_zone_mgmt_op(op))
> +		return -EOPNOTSUPP;

EINVAL may be better here.

> +
>  	if (bdev_read_only(bdev))
>  		return -EPERM;
>  
> @@ -248,7 +253,7 @@ int blkdev_reset_zones(struct block_device *bdev,
>  		bio = blk_next_bio(bio, 0, gfp_mask);
>  		bio->bi_iter.bi_sector = sector;
>  		bio_set_dev(bio, bdev);
> -		bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);
> +		bio_set_op_attrs(bio, op, 0);
>  
>  		sector += zone_sectors;
>  
> @@ -264,7 +269,7 @@ int blkdev_reset_zones(struct block_device *bdev,
>  
>  	return ret;
>  }
> -EXPORT_SYMBOL_GPL(blkdev_reset_zones);
> +EXPORT_SYMBOL_GPL(blkdev_zones_mgmt_op);
>  
>  /*
>   * BLKREPORTZONE ioctl processing.
> @@ -329,15 +334,16 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
>  }
>  
>  /*
> - * BLKRESETZONE ioctl processing.
> + * Zone operation (open, close, finish or reset) ioctl processing.
>   * Called from blkdev_ioctl.
>   */
> -int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
> -			     unsigned int cmd, unsigned long arg)
> +int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev, fmode_t mode,
> +				unsigned int cmd, unsigned long arg)
>  {
>  	void __user *argp = (void __user *)arg;
>  	struct request_queue *q;
>  	struct blk_zone_range zrange;
> +	enum req_opf op;
>  
>  	if (!argp)
>  		return -EINVAL;
> @@ -358,8 +364,25 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
>  	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
>  		return -EFAULT;
>  
> -	return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
> -				  GFP_KERNEL);
> +	switch (cmd) {
> +	case BLKRESETZONE:
> +		op = REQ_OP_ZONE_RESET;
> +		break;
> +	case BLKOPENZONE:
> +		op = REQ_OP_ZONE_OPEN;
> +		break;
> +	case BLKCLOSEZONE:
> +		op = REQ_OP_ZONE_CLOSE;
> +		break;
> +	case BLKFINISHZONE:
> +		op = REQ_OP_ZONE_FINISH;
> +		break;
> +	default:
> +		return -ENOTTY;
> +	}
> +
> +	return blkdev_zones_mgmt_op(bdev, op, zrange.sector, zrange.nr_sectors,
> +				    GFP_KERNEL);
>  }
>  
>  static inline unsigned long *blk_alloc_zone_bitmap(int node,
> diff --git a/block/ioctl.c b/block/ioctl.c
> index 15a0eb80ada9..df7fe54db158 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -532,7 +532,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
>  	case BLKREPORTZONE:
>  		return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
>  	case BLKRESETZONE:
> -		return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg);
> +	case BLKOPENZONE:
> +	case BLKCLOSEZONE:
> +	case BLKFINISHZONE:
> +		return blkdev_zones_mgmt_op_ioctl(bdev, mode, cmd, arg);
>  	case BLKGETZONESZ:
>  		return put_uint(arg, bdev_zone_sectors(bdev));
>  	case BLKGETNRZONES:
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 95202f80676c..067ef9242275 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -284,13 +284,20 @@ enum req_opf {
>  	REQ_OP_DISCARD		= 3,
>  	/* securely erase sectors */
>  	REQ_OP_SECURE_ERASE	= 5,
> -	/* reset a zone write pointer */
> -	REQ_OP_ZONE_RESET	= 6,
>  	/* write the same sector many times */
>  	REQ_OP_WRITE_SAME	= 7,
>  	/* write the zero filled sector many times */
>  	REQ_OP_WRITE_ZEROES	= 9,
>  
> +	/* reset a zone write pointer */
> +	REQ_OP_ZONE_RESET	= 16,
> +	/* Open zone(s) */
> +	REQ_OP_ZONE_OPEN	= 17,
> +	/* Close zone(s) */
> +	REQ_OP_ZONE_CLOSE	= 18,
> +	/* Finish zone(s) */
> +	REQ_OP_ZONE_FINISH	= 19,
> +
>  	/* SCSI passthrough using struct scsi_request */
>  	REQ_OP_SCSI_IN		= 32,
>  	REQ_OP_SCSI_OUT		= 33,
> @@ -375,6 +382,22 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
>  	bio->bi_opf = op | op_flags;
>  }
>  
> +/*
> + * Check if the op is zoned operation.
      Check if op is a zone management operation.
> + */
> +static inline bool op_is_zone_mgmt_op(enum req_opf op)

Similarly to "op_is_write" pattern, "op_is_zone_mgmt" may be a better name.

> +{
> +	switch (op) {
> +	case REQ_OP_ZONE_RESET:
> +	case REQ_OP_ZONE_OPEN:
> +	case REQ_OP_ZONE_CLOSE:
> +	case REQ_OP_ZONE_FINISH:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
>  static inline bool op_is_write(unsigned int op)
>  {
>  	return (op & 1);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 592669bcc536..943084f9dc9c 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -348,14 +348,15 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev);
>  extern int blkdev_report_zones(struct block_device *bdev,
>  			       sector_t sector, struct blk_zone *zones,
>  			       unsigned int *nr_zones, gfp_t gfp_mask);
> -extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
> -			      sector_t nr_sectors, gfp_t gfp_mask);
>  extern int blk_revalidate_disk_zones(struct gendisk *disk);
>  
>  extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
>  				     unsigned int cmd, unsigned long arg);
> -extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
> -				    unsigned int cmd, unsigned long arg);
> +extern int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev, fmode_t mode,
> +					unsigned int cmd, unsigned long arg);
> +extern int blkdev_zones_mgmt_op(struct block_device *bdev, enum req_opf op,
> +				sector_t sector, sector_t nr_sectors,
> +				gfp_t gfp_mask);

To keep the grouping of declarations, may be declare this one where
blkdev_reset_zones() was ?

>  
>  #else /* CONFIG_BLK_DEV_ZONED */
>  
> @@ -376,15 +377,57 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
>  	return -ENOTTY;
>  }
>  
> -static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
> -					   fmode_t mode, unsigned int cmd,
> -					   unsigned long arg)
> +static inline int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev,
> +					     fmode_t mode, unsigned int cmd,
> +					     unsigned long arg)
> +{
> +	return -ENOTTY;
> +}
> +
> +static inline int blkdev_zones_mgmt_op(struct block_device *bdev,
> +				       enum req_opf op,
> +				       sector_t sector, sector_t nr_sectors,
> +				       gfp_t gfp_mask)
>  {
>  	return -ENOTTY;

That should be -ENOTSUPP. This is not an ioctl. The ioctl call is above this one.

>  }
>  
>  #endif /* CONFIG_BLK_DEV_ZONED */
>  
> +static inline int blkdev_reset_zones(struct block_device *bdev,
> +				     sector_t sector, sector_t nr_sectors,
> +				     gfp_t gfp_mask)
> +{
> +	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_RESET,
> +				    sector, nr_sectors, gfp_mask);
> +}
> +
> +static inline int blkdev_open_zones(struct block_device *bdev,
> +				    sector_t sector, sector_t nr_sectors,
> +				    gfp_t gfp_mask)
> +{
> +	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_OPEN,
> +				    sector, nr_sectors, gfp_mask);
> +}
> +
> +static inline int blkdev_close_zones(struct block_device *bdev,
> +				     sector_t sector, sector_t nr_sectors,
> +				     gfp_t gfp_mask)
> +{
> +	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_CLOSE,
> +				    sector, nr_sectors,
> +				    gfp_mask);
> +}
> +
> +static inline int blkdev_finish_zones(struct block_device *bdev,
> +				      sector_t sector, sector_t nr_sectors,
> +				      gfp_t gfp_mask)
> +{
> +	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_FINISH,
> +				    sector, nr_sectors,
> +				    gfp_mask);
> +}
> +
>  struct request_queue {
>  	/*
>  	 * Together with queue_head for cacheline sharing
> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
> index 498eec813494..701e0692b8d3 100644
> --- a/include/uapi/linux/blkzoned.h
> +++ b/include/uapi/linux/blkzoned.h
> @@ -120,9 +120,11 @@ struct blk_zone_report {
>  };
>  
>  /**
> - * struct blk_zone_range - BLKRESETZONE ioctl request
> - * @sector: starting sector of the first zone to issue reset write pointer
> - * @nr_sectors: Total number of sectors of 1 or more zones to reset
> + * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/
> + *			   BLKCLOSEZONE/BLKFINISHZONE ioctl
> + *			   request
> + * @sector: starting sector of the first zone to operate on
> + * @nr_sectors: Total number of sectors of all zones to operate on
>   */
>  struct blk_zone_range {
>  	__u64		sector;
> @@ -139,10 +141,19 @@ struct blk_zone_range {
>   *                sector range. The sector range must be zone aligned.
>   * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
>   * @BLKGETNRZONES: Get the total number of zones of the device.
> + * @BLKOPENZONE: Open the zones in the specified sector range. The
> + *               sector range must be zone aligned.
> + * @BLKCLOSEZONE: Close the zones in the specified sector range. The
> + *                sector range must be zone aligned.
> + * @BLKFINISHZONE: Finish the zones in the specified sector range. The
> + *                 sector range must be zone aligned.
>   */
>  #define BLKREPORTZONE	_IOWR(0x12, 130, struct blk_zone_report)
>  #define BLKRESETZONE	_IOW(0x12, 131, struct blk_zone_range)
>  #define BLKGETZONESZ	_IOR(0x12, 132, __u32)
>  #define BLKGETNRZONES	_IOR(0x12, 133, __u32)
> +#define BLKOPENZONE	_IOW(0x12, 134, struct blk_zone_range)
> +#define BLKCLOSEZONE	_IOW(0x12, 135, struct blk_zone_range)
> +#define BLKFINISHZONE	_IOW(0x12, 136, struct blk_zone_range)
>  
>  #endif /* _UAPI_BLKZONED_H */
>
Minwoo Im June 22, 2019, 6:04 a.m. UTC | #2
On 19-06-21 15:07:08, Matias Bjørling wrote:
> @@ -226,6 +228,9 @@ int blkdev_reset_zones(struct block_device *bdev,
>  	if (!blk_queue_is_zoned(q))
>  		return -EOPNOTSUPP;
>  
> +	if (!op_is_zone_mgmt_op(op))
> +		return -EOPNOTSUPP;
> +

nitpick: -EINVAL looks better to return as Damien pointed out.

Otherwise it looks good to me:

Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Matias Bjorling June 24, 2019, 10:36 a.m. UTC | #3
On 6/22/19 2:51 AM, Damien Le Moal wrote:
> Matias,
> 
> Some comments inline below.
> 
> On 2019/06/21 22:07, Matias Bjørling wrote:
>> From: Ajay Joshi <ajay.joshi@wdc.com>
>>
>> Zoned block devices allows one to control zone transitions by using
>> explicit commands. The available transitions are:
>>
>>    * Open zone: Transition a zone to open state.
>>    * Close zone: Transition a zone to closed state.
>>    * Finish zone: Transition a zone to full state.
>>
>> Allow kernel to issue these transitions by introducing
>> blkdev_zones_mgmt_op() and add three new request opcodes:
>>
>>    * REQ_IO_ZONE_OPEN, REQ_IO_ZONE_CLOSE, and REQ_OP_ZONE_FINISH
>>
>> Allow user-space to issue the transitions through the following ioctls:
>>
>>    * BLKOPENZONE, BLKCLOSEZONE, and BLKFINISHZONE.
>>
>> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
>> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
>> Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
>> ---
>>   block/blk-core.c              |  3 ++
>>   block/blk-zoned.c             | 51 ++++++++++++++++++++++---------
>>   block/ioctl.c                 |  5 ++-
>>   include/linux/blk_types.h     | 27 +++++++++++++++--
>>   include/linux/blkdev.h        | 57 ++++++++++++++++++++++++++++++-----
>>   include/uapi/linux/blkzoned.h | 17 +++++++++--
>>   6 files changed, 133 insertions(+), 27 deletions(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 8340f69670d8..c0f0dbad548d 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -897,6 +897,9 @@ generic_make_request_checks(struct bio *bio)
>>   			goto not_supported;
>>   		break;
>>   	case REQ_OP_ZONE_RESET:
>> +	case REQ_OP_ZONE_OPEN:
>> +	case REQ_OP_ZONE_CLOSE:
>> +	case REQ_OP_ZONE_FINISH:
>>   		if (!blk_queue_is_zoned(q))
>>   			goto not_supported;
>>   		break;
>> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>> index ae7e91bd0618..d0c933593b93 100644
>> --- a/block/blk-zoned.c
>> +++ b/block/blk-zoned.c
>> @@ -201,20 +201,22 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
>>   EXPORT_SYMBOL_GPL(blkdev_report_zones);
>>   
>>   /**
>> - * blkdev_reset_zones - Reset zones write pointer
>> + * blkdev_zones_mgmt_op - Perform the specified operation on the zone(s)
>>    * @bdev:	Target block device
>> - * @sector:	Start sector of the first zone to reset
>> - * @nr_sectors:	Number of sectors, at least the length of one zone
>> + * @op:		Operation to be performed on the zone(s)
>> + * @sector:	Start sector of the first zone to operate on
>> + * @nr_sectors:	Number of sectors, at least the length of one zone and
>> + *              must be zone size aligned.
>>    * @gfp_mask:	Memory allocation flags (for bio_alloc)
>>    *
>>    * Description:
>> - *    Reset the write pointer of the zones contained in the range
>> + *    Perform the specified operation contained in the range
> 	Perform the specified operation over the sector range
>>    *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
>>    *    is valid, but the specified range should not contain conventional zones.
>>    */
>> -int blkdev_reset_zones(struct block_device *bdev,
>> -		       sector_t sector, sector_t nr_sectors,
>> -		       gfp_t gfp_mask)
>> +int blkdev_zones_mgmt_op(struct block_device *bdev, enum req_opf op,
>> +			 sector_t sector, sector_t nr_sectors,
>> +			 gfp_t gfp_mask)
>>   {
>>   	struct request_queue *q = bdev_get_queue(bdev);
>>   	sector_t zone_sectors;
>> @@ -226,6 +228,9 @@ int blkdev_reset_zones(struct block_device *bdev,
>>   	if (!blk_queue_is_zoned(q))
>>   		return -EOPNOTSUPP;
>>   
>> +	if (!op_is_zone_mgmt_op(op))
>> +		return -EOPNOTSUPP;
> 
> EINVAL may be better here.
> 
>> +
>>   	if (bdev_read_only(bdev))
>>   		return -EPERM;
>>   
>> @@ -248,7 +253,7 @@ int blkdev_reset_zones(struct block_device *bdev,
>>   		bio = blk_next_bio(bio, 0, gfp_mask);
>>   		bio->bi_iter.bi_sector = sector;
>>   		bio_set_dev(bio, bdev);
>> -		bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);
>> +		bio_set_op_attrs(bio, op, 0);
>>   
>>   		sector += zone_sectors;
>>   
>> @@ -264,7 +269,7 @@ int blkdev_reset_zones(struct block_device *bdev,
>>   
>>   	return ret;
>>   }
>> -EXPORT_SYMBOL_GPL(blkdev_reset_zones);
>> +EXPORT_SYMBOL_GPL(blkdev_zones_mgmt_op);
>>   
>>   /*
>>    * BLKREPORTZONE ioctl processing.
>> @@ -329,15 +334,16 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
>>   }
>>   
>>   /*
>> - * BLKRESETZONE ioctl processing.
>> + * Zone operation (open, close, finish or reset) ioctl processing.
>>    * Called from blkdev_ioctl.
>>    */
>> -int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
>> -			     unsigned int cmd, unsigned long arg)
>> +int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev, fmode_t mode,
>> +				unsigned int cmd, unsigned long arg)
>>   {
>>   	void __user *argp = (void __user *)arg;
>>   	struct request_queue *q;
>>   	struct blk_zone_range zrange;
>> +	enum req_opf op;
>>   
>>   	if (!argp)
>>   		return -EINVAL;
>> @@ -358,8 +364,25 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
>>   	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
>>   		return -EFAULT;
>>   
>> -	return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
>> -				  GFP_KERNEL);
>> +	switch (cmd) {
>> +	case BLKRESETZONE:
>> +		op = REQ_OP_ZONE_RESET;
>> +		break;
>> +	case BLKOPENZONE:
>> +		op = REQ_OP_ZONE_OPEN;
>> +		break;
>> +	case BLKCLOSEZONE:
>> +		op = REQ_OP_ZONE_CLOSE;
>> +		break;
>> +	case BLKFINISHZONE:
>> +		op = REQ_OP_ZONE_FINISH;
>> +		break;
>> +	default:
>> +		return -ENOTTY;
>> +	}
>> +
>> +	return blkdev_zones_mgmt_op(bdev, op, zrange.sector, zrange.nr_sectors,
>> +				    GFP_KERNEL);
>>   }
>>   
>>   static inline unsigned long *blk_alloc_zone_bitmap(int node,
>> diff --git a/block/ioctl.c b/block/ioctl.c
>> index 15a0eb80ada9..df7fe54db158 100644
>> --- a/block/ioctl.c
>> +++ b/block/ioctl.c
>> @@ -532,7 +532,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
>>   	case BLKREPORTZONE:
>>   		return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
>>   	case BLKRESETZONE:
>> -		return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg);
>> +	case BLKOPENZONE:
>> +	case BLKCLOSEZONE:
>> +	case BLKFINISHZONE:
>> +		return blkdev_zones_mgmt_op_ioctl(bdev, mode, cmd, arg);
>>   	case BLKGETZONESZ:
>>   		return put_uint(arg, bdev_zone_sectors(bdev));
>>   	case BLKGETNRZONES:
>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>> index 95202f80676c..067ef9242275 100644
>> --- a/include/linux/blk_types.h
>> +++ b/include/linux/blk_types.h
>> @@ -284,13 +284,20 @@ enum req_opf {
>>   	REQ_OP_DISCARD		= 3,
>>   	/* securely erase sectors */
>>   	REQ_OP_SECURE_ERASE	= 5,
>> -	/* reset a zone write pointer */
>> -	REQ_OP_ZONE_RESET	= 6,
>>   	/* write the same sector many times */
>>   	REQ_OP_WRITE_SAME	= 7,
>>   	/* write the zero filled sector many times */
>>   	REQ_OP_WRITE_ZEROES	= 9,
>>   
>> +	/* reset a zone write pointer */
>> +	REQ_OP_ZONE_RESET	= 16,
>> +	/* Open zone(s) */
>> +	REQ_OP_ZONE_OPEN	= 17,
>> +	/* Close zone(s) */
>> +	REQ_OP_ZONE_CLOSE	= 18,
>> +	/* Finish zone(s) */
>> +	REQ_OP_ZONE_FINISH	= 19,
>> +
>>   	/* SCSI passthrough using struct scsi_request */
>>   	REQ_OP_SCSI_IN		= 32,
>>   	REQ_OP_SCSI_OUT		= 33,
>> @@ -375,6 +382,22 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
>>   	bio->bi_opf = op | op_flags;
>>   }
>>   
>> +/*
>> + * Check if the op is zoned operation.
>        Check if op is a zone management operation.
>> + */
>> +static inline bool op_is_zone_mgmt_op(enum req_opf op)
> 
> Similarly to "op_is_write" pattern, "op_is_zone_mgmt" may be a better name.
> 
>> +{
>> +	switch (op) {
>> +	case REQ_OP_ZONE_RESET:
>> +	case REQ_OP_ZONE_OPEN:
>> +	case REQ_OP_ZONE_CLOSE:
>> +	case REQ_OP_ZONE_FINISH:
>> +		return true;
>> +	default:
>> +		return false;
>> +	}
>> +}
>> +
>>   static inline bool op_is_write(unsigned int op)
>>   {
>>   	return (op & 1);
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 592669bcc536..943084f9dc9c 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -348,14 +348,15 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev);
>>   extern int blkdev_report_zones(struct block_device *bdev,
>>   			       sector_t sector, struct blk_zone *zones,
>>   			       unsigned int *nr_zones, gfp_t gfp_mask);
>> -extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
>> -			      sector_t nr_sectors, gfp_t gfp_mask);
>>   extern int blk_revalidate_disk_zones(struct gendisk *disk);
>>   
>>   extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
>>   				     unsigned int cmd, unsigned long arg);
>> -extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
>> -				    unsigned int cmd, unsigned long arg);
>> +extern int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev, fmode_t mode,
>> +					unsigned int cmd, unsigned long arg);
>> +extern int blkdev_zones_mgmt_op(struct block_device *bdev, enum req_opf op,
>> +				sector_t sector, sector_t nr_sectors,
>> +				gfp_t gfp_mask);
> 
> To keep the grouping of declarations, may be declare this one where
> blkdev_reset_zones() was ?
> 
>>   
>>   #else /* CONFIG_BLK_DEV_ZONED */
>>   
>> @@ -376,15 +377,57 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
>>   	return -ENOTTY;
>>   }
>>   
>> -static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
>> -					   fmode_t mode, unsigned int cmd,
>> -					   unsigned long arg)
>> +static inline int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev,
>> +					     fmode_t mode, unsigned int cmd,
>> +					     unsigned long arg)
>> +{
>> +	return -ENOTTY;
>> +}
>> +
>> +static inline int blkdev_zones_mgmt_op(struct block_device *bdev,
>> +				       enum req_opf op,
>> +				       sector_t sector, sector_t nr_sectors,
>> +				       gfp_t gfp_mask)
>>   {
>>   	return -ENOTTY;
> 
> That should be -ENOTSUPP. This is not an ioctl. The ioctl call is above this one.
> 
>>   }
>>   
>>   #endif /* CONFIG_BLK_DEV_ZONED */
>>   
>> +static inline int blkdev_reset_zones(struct block_device *bdev,
>> +				     sector_t sector, sector_t nr_sectors,
>> +				     gfp_t gfp_mask)
>> +{
>> +	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_RESET,
>> +				    sector, nr_sectors, gfp_mask);
>> +}
>> +
>> +static inline int blkdev_open_zones(struct block_device *bdev,
>> +				    sector_t sector, sector_t nr_sectors,
>> +				    gfp_t gfp_mask)
>> +{
>> +	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_OPEN,
>> +				    sector, nr_sectors, gfp_mask);
>> +}
>> +
>> +static inline int blkdev_close_zones(struct block_device *bdev,
>> +				     sector_t sector, sector_t nr_sectors,
>> +				     gfp_t gfp_mask)
>> +{
>> +	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_CLOSE,
>> +				    sector, nr_sectors,
>> +				    gfp_mask);
>> +}
>> +
>> +static inline int blkdev_finish_zones(struct block_device *bdev,
>> +				      sector_t sector, sector_t nr_sectors,
>> +				      gfp_t gfp_mask)
>> +{
>> +	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_FINISH,
>> +				    sector, nr_sectors,
>> +				    gfp_mask);
>> +}
>> +
>>   struct request_queue {
>>   	/*
>>   	 * Together with queue_head for cacheline sharing
>> diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
>> index 498eec813494..701e0692b8d3 100644
>> --- a/include/uapi/linux/blkzoned.h
>> +++ b/include/uapi/linux/blkzoned.h
>> @@ -120,9 +120,11 @@ struct blk_zone_report {
>>   };
>>   
>>   /**
>> - * struct blk_zone_range - BLKRESETZONE ioctl request
>> - * @sector: starting sector of the first zone to issue reset write pointer
>> - * @nr_sectors: Total number of sectors of 1 or more zones to reset
>> + * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/
>> + *			   BLKCLOSEZONE/BLKFINISHZONE ioctl
>> + *			   request
>> + * @sector: starting sector of the first zone to operate on
>> + * @nr_sectors: Total number of sectors of all zones to operate on
>>    */
>>   struct blk_zone_range {
>>   	__u64		sector;
>> @@ -139,10 +141,19 @@ struct blk_zone_range {
>>    *                sector range. The sector range must be zone aligned.
>>    * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
>>    * @BLKGETNRZONES: Get the total number of zones of the device.
>> + * @BLKOPENZONE: Open the zones in the specified sector range. The
>> + *               sector range must be zone aligned.
>> + * @BLKCLOSEZONE: Close the zones in the specified sector range. The
>> + *                sector range must be zone aligned.
>> + * @BLKFINISHZONE: Finish the zones in the specified sector range. The
>> + *                 sector range must be zone aligned.
>>    */
>>   #define BLKREPORTZONE	_IOWR(0x12, 130, struct blk_zone_report)
>>   #define BLKRESETZONE	_IOW(0x12, 131, struct blk_zone_range)
>>   #define BLKGETZONESZ	_IOR(0x12, 132, __u32)
>>   #define BLKGETNRZONES	_IOR(0x12, 133, __u32)
>> +#define BLKOPENZONE	_IOW(0x12, 134, struct blk_zone_range)
>> +#define BLKCLOSEZONE	_IOW(0x12, 135, struct blk_zone_range)
>> +#define BLKFINISHZONE	_IOW(0x12, 136, struct blk_zone_range)
>>   
>>   #endif /* _UAPI_BLKZONED_H */
>>
> 
> 

Thanks Damien.

I was wondering if we should, instead of introducing three new ioctls, 
make a v2 of the zone mgmt API?

Something like the following data structure being passed from user-space:

struct blk_zone_mgmt {
	__u8		opcode;
	__u8 		resv[3];
	__u32		flags;
	__u64		sector;
	__u64		nr_sectors;
	__u64		resv1; /* to make room if we where do pass a data 			 
data pointer through this API */
};

-Matias
Bart Van Assche June 24, 2019, 7:43 p.m. UTC | #4
On 6/21/19 6:07 AM, Matias Bjørling wrote:
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 95202f80676c..067ef9242275 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -284,13 +284,20 @@ enum req_opf {
>   	REQ_OP_DISCARD		= 3,
>   	/* securely erase sectors */
>   	REQ_OP_SECURE_ERASE	= 5,
> -	/* reset a zone write pointer */
> -	REQ_OP_ZONE_RESET	= 6,
>   	/* write the same sector many times */
>   	REQ_OP_WRITE_SAME	= 7,
>   	/* write the zero filled sector many times */
>   	REQ_OP_WRITE_ZEROES	= 9,
>   
> +	/* reset a zone write pointer */
> +	REQ_OP_ZONE_RESET	= 16,
> +	/* Open zone(s) */
> +	REQ_OP_ZONE_OPEN	= 17,
> +	/* Close zone(s) */
> +	REQ_OP_ZONE_CLOSE	= 18,
> +	/* Finish zone(s) */
> +	REQ_OP_ZONE_FINISH	= 19,
> +
>   	/* SCSI passthrough using struct scsi_request */
>   	REQ_OP_SCSI_IN		= 32,
>   	REQ_OP_SCSI_OUT		= 33,
> @@ -375,6 +382,22 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
>   	bio->bi_opf = op | op_flags;
>   }

Are the new operation types ever passed to op_is_write()? The definition 
of that function is as follows:

static inline bool op_is_write(unsigned int op)
{
	return (op & 1);
}
Chaitanya Kulkarni June 24, 2019, 10:27 p.m. UTC | #5
On 6/24/19 12:43 PM, Bart Van Assche wrote:
> On 6/21/19 6:07 AM, Matias Bjørling wrote:
>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>> index 95202f80676c..067ef9242275 100644
>> --- a/include/linux/blk_types.h
>> +++ b/include/linux/blk_types.h
>> @@ -284,13 +284,20 @@ enum req_opf {
>>    	REQ_OP_DISCARD		= 3,
>>    	/* securely erase sectors */
>>    	REQ_OP_SECURE_ERASE	= 5,
>> -	/* reset a zone write pointer */
>> -	REQ_OP_ZONE_RESET	= 6,
>>    	/* write the same sector many times */
>>    	REQ_OP_WRITE_SAME	= 7,
>>    	/* write the zero filled sector many times */
>>    	REQ_OP_WRITE_ZEROES	= 9,
>>    
>> +	/* reset a zone write pointer */
>> +	REQ_OP_ZONE_RESET	= 16,
>> +	/* Open zone(s) */
>> +	REQ_OP_ZONE_OPEN	= 17,
>> +	/* Close zone(s) */
>> +	REQ_OP_ZONE_CLOSE	= 18,
>> +	/* Finish zone(s) */
>> +	REQ_OP_ZONE_FINISH	= 19,
>> +
>>    	/* SCSI passthrough using struct scsi_request */
>>    	REQ_OP_SCSI_IN		= 32,
>>    	REQ_OP_SCSI_OUT		= 33,
>> @@ -375,6 +382,22 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
>>    	bio->bi_opf = op | op_flags;
>>    }
> 
> Are the new operation types ever passed to op_is_write()? The definition
> of that function is as follows:
> 
May be we should change numbering since blktrace also relies on the 
having on_is_write() without looking at the rq_ops().

197  * Data direction bit lookup
  198  */
  199 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
  200                                  BLK_TC_ACT(BLK_TC_WRITE) };  <----
  201
  202 #define BLK_TC_RAHEAD           BLK_TC_AHEAD
  203 #define BLK_TC_PREFLUSH         BLK_TC_FLUSH
  204
  205 /* The ilog2() calls fall out because they're constant */
  206 #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
  207           (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## 
__name))
  208
  209 /*
  210  * The worker for the various blk_add_trace*() types. Fills out a
  211  * blk_io_trace structure and places it in a per-cpu subbuffer.
  212  */
  213 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, 
int bytes,
  214                      int op, int op_flags, u32 what, int error, 
int pdu_len,
  215                      void *pdu_data, union kernfs_node_id *cgid)
  216 {
  217         struct task_struct *tsk = current;
  218         struct ring_buffer_event *event = NULL;
  219         struct ring_buffer *buffer = NULL;
  220         struct blk_io_trace *t;
  221         unsigned long flags = 0;
  222         unsigned long *sequence;
  223         pid_t pid;
  224         int cpu, pc = 0;
  225         bool blk_tracer = blk_tracer_enabled;
  226         ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
  227
  228         if (unlikely(bt->trace_state != Blktrace_running && 
!blk_tracer))
  229                 return;
  230
  231         what |= ddir_act[op_is_write(op) ? WRITE : READ];  <--- 


> static inline bool op_is_write(unsigned int op)
> {
> 	return (op & 1);
> }
>
Matias Bjorling June 25, 2019, 10:35 a.m. UTC | #6
On 6/25/19 12:27 AM, Chaitanya Kulkarni wrote:
> On 6/24/19 12:43 PM, Bart Van Assche wrote:
>> On 6/21/19 6:07 AM, Matias Bjørling wrote:
>>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>> index 95202f80676c..067ef9242275 100644
>>> --- a/include/linux/blk_types.h
>>> +++ b/include/linux/blk_types.h
>>> @@ -284,13 +284,20 @@ enum req_opf {
>>>     	REQ_OP_DISCARD		= 3,
>>>     	/* securely erase sectors */
>>>     	REQ_OP_SECURE_ERASE	= 5,
>>> -	/* reset a zone write pointer */
>>> -	REQ_OP_ZONE_RESET	= 6,
>>>     	/* write the same sector many times */
>>>     	REQ_OP_WRITE_SAME	= 7,
>>>     	/* write the zero filled sector many times */
>>>     	REQ_OP_WRITE_ZEROES	= 9,
>>>     
>>> +	/* reset a zone write pointer */
>>> +	REQ_OP_ZONE_RESET	= 16,
>>> +	/* Open zone(s) */
>>> +	REQ_OP_ZONE_OPEN	= 17,
>>> +	/* Close zone(s) */
>>> +	REQ_OP_ZONE_CLOSE	= 18,
>>> +	/* Finish zone(s) */
>>> +	REQ_OP_ZONE_FINISH	= 19,
>>> +
>>>     	/* SCSI passthrough using struct scsi_request */
>>>     	REQ_OP_SCSI_IN		= 32,
>>>     	REQ_OP_SCSI_OUT		= 33,
>>> @@ -375,6 +382,22 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
>>>     	bio->bi_opf = op | op_flags;
>>>     }
>>
>> Are the new operation types ever passed to op_is_write()? The definition
>> of that function is as follows:
>>
> May be we should change numbering since blktrace also relies on the
> having on_is_write() without looking at the rq_ops().
> 
> 197  * Data direction bit lookup
>    198  */
>    199 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
>    200                                  BLK_TC_ACT(BLK_TC_WRITE) };  <----
>    201
>    202 #define BLK_TC_RAHEAD           BLK_TC_AHEAD
>    203 #define BLK_TC_PREFLUSH         BLK_TC_FLUSH
>    204
>    205 /* The ilog2() calls fall out because they're constant */
>    206 #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
>    207           (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ##
> __name))
>    208
>    209 /*
>    210  * The worker for the various blk_add_trace*() types. Fills out a
>    211  * blk_io_trace structure and places it in a per-cpu subbuffer.
>    212  */
>    213 static void __blk_add_trace(struct blk_trace *bt, sector_t sector,
> int bytes,
>    214                      int op, int op_flags, u32 what, int error,
> int pdu_len,
>    215                      void *pdu_data, union kernfs_node_id *cgid)
>    216 {
>    217         struct task_struct *tsk = current;
>    218         struct ring_buffer_event *event = NULL;
>    219         struct ring_buffer *buffer = NULL;
>    220         struct blk_io_trace *t;
>    221         unsigned long flags = 0;
>    222         unsigned long *sequence;
>    223         pid_t pid;
>    224         int cpu, pc = 0;
>    225         bool blk_tracer = blk_tracer_enabled;
>    226         ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
>    227
>    228         if (unlikely(bt->trace_state != Blktrace_running &&
> !blk_tracer))
>    229                 return;
>    230
>    231         what |= ddir_act[op_is_write(op) ? WRITE : READ];  <---
> 
> 
>> static inline bool op_is_write(unsigned int op)
>> {
>> 	return (op & 1);
>> }
>>
> 

The zone mgmt commands are neither write nor reads commands. I guess, 
one could characterize them as write commands, but they don't write any 
data, they update a state of a zone on a drive. One should keep it as 
is? and make sure the zone mgmt commands don't get categorized as either 
read/write.
Bart Van Assche June 25, 2019, 3:55 p.m. UTC | #7
On 6/25/19 3:35 AM, Matias Bjørling wrote:
> On 6/25/19 12:27 AM, Chaitanya Kulkarni wrote:
>> On 6/24/19 12:43 PM, Bart Van Assche wrote:
>>> static inline bool op_is_write(unsigned int op)
>>> {
>>>     return (op & 1);
>>> }
>>>
>>
> 
> The zone mgmt commands are neither write nor reads commands. I guess, 
> one could characterize them as write commands, but they don't write any 
> data, they update a state of a zone on a drive. One should keep it as 
> is? and make sure the zone mgmt commands don't get categorized as either 
> read/write.

Since the open, close and finish operations support modifying zone data 
I propose to characterize these as write commands. How about the 
following additional changes:
- Make bio_check_ro() refuse open/close/flush/reset zone operations for 
read-only partitions (see also commit a32e236eb93e ("Partially revert 
"block: fail op_is_write() requests to read-only partitions"") # v4.18).
- In submit_bio(), change op_is_write(bio_op(bio)) ? "WRITE" : "READ" 
into something that uses blk_op_str().
- Add open/close/flush zone support be added in blk_partition_remap().

Thanks,

Bart.
Javier González June 25, 2019, 4:30 p.m. UTC | #8
> On 25 Jun 2019, at 17.55, Bart Van Assche <bvanassche@acm.org> wrote:
> 
> On 6/25/19 3:35 AM, Matias Bjørling wrote:
>> On 6/25/19 12:27 AM, Chaitanya Kulkarni wrote:
>>> On 6/24/19 12:43 PM, Bart Van Assche wrote:
>>>> static inline bool op_is_write(unsigned int op)
>>>> {
>>>>     return (op & 1);
>>>> }
>> The zone mgmt commands are neither write nor reads commands. I guess, one could characterize them as write commands, but they don't write any data, they update a state of a zone on a drive. One should keep it as is? and make sure the zone mgmt commands don't get categorized as either read/write.
> 
> Since the open, close and finish operations support modifying zone data I propose to characterize these as write commands. How about the following additional changes:
> - Make bio_check_ro() refuse open/close/flush/reset zone operations for read-only partitions (see also commit a32e236eb93e ("Partially revert "block: fail op_is_write() requests to read-only partitions"") # v4.18).
> - In submit_bio(), change op_is_write(bio_op(bio)) ? "WRITE" : "READ" into something that uses blk_op_str().
> - Add open/close/flush zone support be added in blk_partition_remap().
> 
> Thanks,
> 
> Bart.

Agreed. This is also what we do with REQ_OP_DISCARD and it makes things
easier in the driver.

Apart from the return comment from Damien and Minwoo, the patch looks good to me.

Reviewed-by: Javier González <javier@javigon.com>
Chaitanya Kulkarni June 25, 2019, 4:51 p.m. UTC | #9
On 06/25/2019 08:56 AM, Bart Van Assche wrote:
> On 6/25/19 3:35 AM, Matias Bjørling wrote:
>> On 6/25/19 12:27 AM, Chaitanya Kulkarni wrote:
>>> On 6/24/19 12:43 PM, Bart Van Assche wrote:
>>>> static inline bool op_is_write(unsigned int op)
>>>> {
>>>>      return (op & 1);
>>>> }
>>>>
>>>
>>
>> The zone mgmt commands are neither write nor reads commands. I guess,
>> one could characterize them as write commands, but they don't write any
>> data, they update a state of a zone on a drive. One should keep it as
>> is? and make sure the zone mgmt commands don't get categorized as either
>> read/write.
>
> Since the open, close and finish operations support modifying zone data
> I propose to characterize these as write commands. How about the
> following additional changes:
> - Make bio_check_ro() refuse open/close/flush/reset zone operations for
                                          ^
Since finish also listed above which supports modifying data do we need 
to add finish here with flush in above line ?

> read-only partitions (see also commit a32e236eb93e ("Partially revert
> "block: fail op_is_write() requests to read-only partitions"") # v4.18).
> - In submit_bio(), change op_is_write(bio_op(bio)) ? "WRITE" : "READ"
> into something that uses blk_op_str().
Good idea, I've a patch for blk_op_str() and debugfs just waiting for 
this to merge. Does it make sense to add that patch in this series ?
> - Add open/close/flush zone support be added in blk_partition_remap().
same here for finish ?
>
> Thanks,
>
> Bart.
>
Matias Bjorling June 25, 2019, 4:53 p.m. UTC | #10
On Tue, Jun 25, 2019 at 6:51 PM Chaitanya Kulkarni
<Chaitanya.Kulkarni@wdc.com> wrote:
>
> On 06/25/2019 08:56 AM, Bart Van Assche wrote:
> > On 6/25/19 3:35 AM, Matias Bjørling wrote:
> >> On 6/25/19 12:27 AM, Chaitanya Kulkarni wrote:
> >>> On 6/24/19 12:43 PM, Bart Van Assche wrote:
> >>>> static inline bool op_is_write(unsigned int op)
> >>>> {
> >>>>      return (op & 1);
> >>>> }
> >>>>
> >>>
> >>
> >> The zone mgmt commands are neither write nor reads commands. I guess,
> >> one could characterize them as write commands, but they don't write any
> >> data, they update a state of a zone on a drive. One should keep it as
> >> is? and make sure the zone mgmt commands don't get categorized as either
> >> read/write.
> >
> > Since the open, close and finish operations support modifying zone data
> > I propose to characterize these as write commands. How about the
> > following additional changes:
> > - Make bio_check_ro() refuse open/close/flush/reset zone operations for
>                                           ^
> Since finish also listed above which supports modifying data do we need
> to add finish here with flush in above line ?
>
> > read-only partitions (see also commit a32e236eb93e ("Partially revert
> > "block: fail op_is_write() requests to read-only partitions"") # v4.18).
> > - In submit_bio(), change op_is_write(bio_op(bio)) ? "WRITE" : "READ"
> > into something that uses blk_op_str().
> Good idea, I've a patch for blk_op_str() and debugfs just waiting for
> this to merge. Does it make sense to add that patch in this series ?

Ship it off separately. Your patches can go in first.

> > - Add open/close/flush zone support be added in blk_partition_remap().
> same here for finish ?
> >
> > Thanks,
> >
> > Bart.
> >
>
Damien Le Moal June 26, 2019, 12:42 a.m. UTC | #11
On 2019/06/26 0:56, Bart Van Assche wrote:
> On 6/25/19 3:35 AM, Matias Bjørling wrote:
>> On 6/25/19 12:27 AM, Chaitanya Kulkarni wrote:
>>> On 6/24/19 12:43 PM, Bart Van Assche wrote:
>>>> static inline bool op_is_write(unsigned int op)
>>>> {
>>>>     return (op & 1);
>>>> }
>>>>
>>>
>>
>> The zone mgmt commands are neither write nor reads commands. I guess, 
>> one could characterize them as write commands, but they don't write any 
>> data, they update a state of a zone on a drive. One should keep it as 
>> is? and make sure the zone mgmt commands don't get categorized as either 
>> read/write.
> 
> Since the open, close and finish operations support modifying zone data 
> I propose to characterize these as write commands. How about the 
> following additional changes:
> - Make bio_check_ro() refuse open/close/flush/reset zone operations for 
> read-only partitions (see also commit a32e236eb93e ("Partially revert 
> "block: fail op_is_write() requests to read-only partitions"") # v4.18).
> - In submit_bio(), change op_is_write(bio_op(bio)) ? "WRITE" : "READ" 
> into something that uses blk_op_str().
> - Add open/close/flush zone support be added in blk_partition_remap().

Sounds good to me. And indeed, all operations should be failed for RO
devices/partitions. Of note though is that only reset and finish operations have
an effect on the zone data. Open and close have no effect at all on the zone
data and only change the zone condition/state. But since they do change
something on the drive, we can still consider them as "write" operations.

Best regards.
Damien Le Moal June 26, 2019, 12:44 a.m. UTC | #12
On 2019/06/26 1:51, Chaitanya Kulkarni wrote:
> On 06/25/2019 08:56 AM, Bart Van Assche wrote:
>> On 6/25/19 3:35 AM, Matias Bjørling wrote:
>>> On 6/25/19 12:27 AM, Chaitanya Kulkarni wrote:
>>>> On 6/24/19 12:43 PM, Bart Van Assche wrote:
>>>>> static inline bool op_is_write(unsigned int op)
>>>>> {
>>>>>      return (op & 1);
>>>>> }
>>>>>
>>>>
>>>
>>> The zone mgmt commands are neither write nor reads commands. I guess,
>>> one could characterize them as write commands, but they don't write any
>>> data, they update a state of a zone on a drive. One should keep it as
>>> is? and make sure the zone mgmt commands don't get categorized as either
>>> read/write.
>>
>> Since the open, close and finish operations support modifying zone data
>> I propose to characterize these as write commands. How about the
>> following additional changes:
>> - Make bio_check_ro() refuse open/close/flush/reset zone operations for
>                                           ^
> Since finish also listed above which supports modifying data do we need 
> to add finish here with flush in above line ?

There is no "zone flush" operation. I guess Bart made a typo here and meant
"finish". "flush" on a zoned disk is not different from regular disks.

> 
>> read-only partitions (see also commit a32e236eb93e ("Partially revert
>> "block: fail op_is_write() requests to read-only partitions"") # v4.18).
>> - In submit_bio(), change op_is_write(bio_op(bio)) ? "WRITE" : "READ"
>> into something that uses blk_op_str().
> Good idea, I've a patch for blk_op_str() and debugfs just waiting for 
> this to merge. Does it make sense to add that patch in this series ?
>> - Add open/close/flush zone support be added in blk_partition_remap().
> same here for finish ?
>>
>> Thanks,
>>
>> Bart.
>>
> 
>
diff mbox series

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 8340f69670d8..c0f0dbad548d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -897,6 +897,9 @@  generic_make_request_checks(struct bio *bio)
 			goto not_supported;
 		break;
 	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_OPEN:
+	case REQ_OP_ZONE_CLOSE:
+	case REQ_OP_ZONE_FINISH:
 		if (!blk_queue_is_zoned(q))
 			goto not_supported;
 		break;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..d0c933593b93 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -201,20 +201,22 @@  int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 EXPORT_SYMBOL_GPL(blkdev_report_zones);
 
 /**
- * blkdev_reset_zones - Reset zones write pointer
+ * blkdev_zones_mgmt_op - Perform the specified operation on the zone(s)
  * @bdev:	Target block device
- * @sector:	Start sector of the first zone to reset
- * @nr_sectors:	Number of sectors, at least the length of one zone
+ * @op:		Operation to be performed on the zone(s)
+ * @sector:	Start sector of the first zone to operate on
+ * @nr_sectors:	Number of sectors, at least the length of one zone and
+ *              must be zone size aligned.
  * @gfp_mask:	Memory allocation flags (for bio_alloc)
  *
  * Description:
- *    Reset the write pointer of the zones contained in the range
+ *    Perform the specified operation contained in the range
  *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
  *    is valid, but the specified range should not contain conventional zones.
  */
-int blkdev_reset_zones(struct block_device *bdev,
-		       sector_t sector, sector_t nr_sectors,
-		       gfp_t gfp_mask)
+int blkdev_zones_mgmt_op(struct block_device *bdev, enum req_opf op,
+			 sector_t sector, sector_t nr_sectors,
+			 gfp_t gfp_mask)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	sector_t zone_sectors;
@@ -226,6 +228,9 @@  int blkdev_reset_zones(struct block_device *bdev,
 	if (!blk_queue_is_zoned(q))
 		return -EOPNOTSUPP;
 
+	if (!op_is_zone_mgmt_op(op))
+		return -EOPNOTSUPP;
+
 	if (bdev_read_only(bdev))
 		return -EPERM;
 
@@ -248,7 +253,7 @@  int blkdev_reset_zones(struct block_device *bdev,
 		bio = blk_next_bio(bio, 0, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio_set_dev(bio, bdev);
-		bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);
+		bio_set_op_attrs(bio, op, 0);
 
 		sector += zone_sectors;
 
@@ -264,7 +269,7 @@  int blkdev_reset_zones(struct block_device *bdev,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(blkdev_reset_zones);
+EXPORT_SYMBOL_GPL(blkdev_zones_mgmt_op);
 
 /*
  * BLKREPORTZONE ioctl processing.
@@ -329,15 +334,16 @@  int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 }
 
 /*
- * BLKRESETZONE ioctl processing.
+ * Zone operation (open, close, finish or reset) ioctl processing.
  * Called from blkdev_ioctl.
  */
-int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
-			     unsigned int cmd, unsigned long arg)
+int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev, fmode_t mode,
+				unsigned int cmd, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
 	struct request_queue *q;
 	struct blk_zone_range zrange;
+	enum req_opf op;
 
 	if (!argp)
 		return -EINVAL;
@@ -358,8 +364,25 @@  int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
 		return -EFAULT;
 
-	return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
-				  GFP_KERNEL);
+	switch (cmd) {
+	case BLKRESETZONE:
+		op = REQ_OP_ZONE_RESET;
+		break;
+	case BLKOPENZONE:
+		op = REQ_OP_ZONE_OPEN;
+		break;
+	case BLKCLOSEZONE:
+		op = REQ_OP_ZONE_CLOSE;
+		break;
+	case BLKFINISHZONE:
+		op = REQ_OP_ZONE_FINISH;
+		break;
+	default:
+		return -ENOTTY;
+	}
+
+	return blkdev_zones_mgmt_op(bdev, op, zrange.sector, zrange.nr_sectors,
+				    GFP_KERNEL);
 }
 
 static inline unsigned long *blk_alloc_zone_bitmap(int node,
diff --git a/block/ioctl.c b/block/ioctl.c
index 15a0eb80ada9..df7fe54db158 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -532,7 +532,10 @@  int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKREPORTZONE:
 		return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
 	case BLKRESETZONE:
-		return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg);
+	case BLKOPENZONE:
+	case BLKCLOSEZONE:
+	case BLKFINISHZONE:
+		return blkdev_zones_mgmt_op_ioctl(bdev, mode, cmd, arg);
 	case BLKGETZONESZ:
 		return put_uint(arg, bdev_zone_sectors(bdev));
 	case BLKGETNRZONES:
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 95202f80676c..067ef9242275 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -284,13 +284,20 @@  enum req_opf {
 	REQ_OP_DISCARD		= 3,
 	/* securely erase sectors */
 	REQ_OP_SECURE_ERASE	= 5,
-	/* reset a zone write pointer */
-	REQ_OP_ZONE_RESET	= 6,
 	/* write the same sector many times */
 	REQ_OP_WRITE_SAME	= 7,
 	/* write the zero filled sector many times */
 	REQ_OP_WRITE_ZEROES	= 9,
 
+	/* reset a zone write pointer */
+	REQ_OP_ZONE_RESET	= 16,
+	/* Open zone(s) */
+	REQ_OP_ZONE_OPEN	= 17,
+	/* Close zone(s) */
+	REQ_OP_ZONE_CLOSE	= 18,
+	/* Finish zone(s) */
+	REQ_OP_ZONE_FINISH	= 19,
+
 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
 	REQ_OP_SCSI_OUT		= 33,
@@ -375,6 +382,22 @@  static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
 	bio->bi_opf = op | op_flags;
 }
 
+/*
+ * Check if the op is zoned operation.
+ */
+static inline bool op_is_zone_mgmt_op(enum req_opf op)
+{
+	switch (op) {
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_OPEN:
+	case REQ_OP_ZONE_CLOSE:
+	case REQ_OP_ZONE_FINISH:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static inline bool op_is_write(unsigned int op)
 {
 	return (op & 1);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 592669bcc536..943084f9dc9c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -348,14 +348,15 @@  extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
 			       sector_t sector, struct blk_zone *zones,
 			       unsigned int *nr_zones, gfp_t gfp_mask);
-extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
-			      sector_t nr_sectors, gfp_t gfp_mask);
 extern int blk_revalidate_disk_zones(struct gendisk *disk);
 
 extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 				     unsigned int cmd, unsigned long arg);
-extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
-				    unsigned int cmd, unsigned long arg);
+extern int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev, fmode_t mode,
+					unsigned int cmd, unsigned long arg);
+extern int blkdev_zones_mgmt_op(struct block_device *bdev, enum req_opf op,
+				sector_t sector, sector_t nr_sectors,
+				gfp_t gfp_mask);
 
 #else /* CONFIG_BLK_DEV_ZONED */
 
@@ -376,15 +377,57 @@  static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
 	return -ENOTTY;
 }
 
-static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
-					   fmode_t mode, unsigned int cmd,
-					   unsigned long arg)
+static inline int blkdev_zones_mgmt_op_ioctl(struct block_device *bdev,
+					     fmode_t mode, unsigned int cmd,
+					     unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+static inline int blkdev_zones_mgmt_op(struct block_device *bdev,
+				       enum req_opf op,
+				       sector_t sector, sector_t nr_sectors,
+				       gfp_t gfp_mask)
 {
 	return -ENOTTY;
 }
 
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+static inline int blkdev_reset_zones(struct block_device *bdev,
+				     sector_t sector, sector_t nr_sectors,
+				     gfp_t gfp_mask)
+{
+	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_RESET,
+				    sector, nr_sectors, gfp_mask);
+}
+
+static inline int blkdev_open_zones(struct block_device *bdev,
+				    sector_t sector, sector_t nr_sectors,
+				    gfp_t gfp_mask)
+{
+	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_OPEN,
+				    sector, nr_sectors, gfp_mask);
+}
+
+static inline int blkdev_close_zones(struct block_device *bdev,
+				     sector_t sector, sector_t nr_sectors,
+				     gfp_t gfp_mask)
+{
+	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_CLOSE,
+				    sector, nr_sectors,
+				    gfp_mask);
+}
+
+static inline int blkdev_finish_zones(struct block_device *bdev,
+				      sector_t sector, sector_t nr_sectors,
+				      gfp_t gfp_mask)
+{
+	return blkdev_zones_mgmt_op(bdev, REQ_OP_ZONE_FINISH,
+				    sector, nr_sectors,
+				    gfp_mask);
+}
+
 struct request_queue {
 	/*
 	 * Together with queue_head for cacheline sharing
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 498eec813494..701e0692b8d3 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -120,9 +120,11 @@  struct blk_zone_report {
 };
 
 /**
- * struct blk_zone_range - BLKRESETZONE ioctl request
- * @sector: starting sector of the first zone to issue reset write pointer
- * @nr_sectors: Total number of sectors of 1 or more zones to reset
+ * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/
+ *			   BLKCLOSEZONE/BLKFINISHZONE ioctl
+ *			   request
+ * @sector: starting sector of the first zone to operate on
+ * @nr_sectors: Total number of sectors of all zones to operate on
  */
 struct blk_zone_range {
 	__u64		sector;
@@ -139,10 +141,19 @@  struct blk_zone_range {
  *                sector range. The sector range must be zone aligned.
  * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
  * @BLKGETNRZONES: Get the total number of zones of the device.
+ * @BLKOPENZONE: Open the zones in the specified sector range. The
+ *               sector range must be zone aligned.
+ * @BLKCLOSEZONE: Close the zones in the specified sector range. The
+ *                sector range must be zone aligned.
+ * @BLKFINISHZONE: Finish the zones in the specified sector range. The
+ *                 sector range must be zone aligned.
  */
 #define BLKREPORTZONE	_IOWR(0x12, 130, struct blk_zone_report)
 #define BLKRESETZONE	_IOW(0x12, 131, struct blk_zone_range)
 #define BLKGETZONESZ	_IOR(0x12, 132, __u32)
 #define BLKGETNRZONES	_IOR(0x12, 133, __u32)
+#define BLKOPENZONE	_IOW(0x12, 134, struct blk_zone_range)
+#define BLKCLOSEZONE	_IOW(0x12, 135, struct blk_zone_range)
+#define BLKFINISHZONE	_IOW(0x12, 136, struct blk_zone_range)
 
 #endif /* _UAPI_BLKZONED_H */