
[3/3] io_uring: add support for zone-append

Message ID 1592414619-5646-4-git-send-email-joshi.k@samsung.com
State New, archived
Series zone-append support in aio and io-uring

Commit Message

Kanchan Joshi June 17, 2020, 5:23 p.m. UTC
From: Selvakumar S <selvakuma.s1@samsung.com>

Introduce three new opcodes for zone-append -

   IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
   IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
   IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers

Repurpose cqe->flags to return zone-relative offset.

Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
---
 fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/io_uring.h |  8 ++++-
 2 files changed, 77 insertions(+), 3 deletions(-)
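
For reference, a minimal sketch of the intended userspace flow (illustrative only: liburing has no prep helper for these opcodes, so the sqe is filled by hand, and the zone-relative result in cqe->flags is the repurposing introduced by this patch):

	#include <liburing.h>
	#include <stdint.h>
	#include <string.h>

	/*
	 * Sketch: append "len" bytes from "buf" to the zone starting at byte
	 * offset "zone_start"; returns the absolute write position or -errno.
	 * Error handling around submit/wait is omitted for brevity.
	 */
	static long long zone_append(struct io_uring *ring, int fd,
				     const void *buf, unsigned int len,
				     unsigned long long zone_start)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
		struct io_uring_cqe *cqe;
		long long ret;

		memset(sqe, 0, sizeof(*sqe));
		sqe->opcode = IORING_OP_ZONE_APPEND;
		sqe->fd = fd;
		sqe->addr = (uintptr_t)buf;
		sqe->len = len;
		sqe->off = zone_start;		/* start of the target zone */

		io_uring_submit(ring);
		io_uring_wait_cqe(ring, &cqe);
		/* on success, cqe->flags holds the zone-relative append offset */
		ret = cqe->res < 0 ? cqe->res : (long long)(zone_start + cqe->flags);
		io_uring_cqe_seen(ring, cqe);
		return ret;
	}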

Comments

Pavel Begunkov June 17, 2020, 6:55 p.m. UTC | #1
On 17/06/2020 20:23, Kanchan Joshi wrote:
> From: Selvakumar S <selvakuma.s1@samsung.com>
> 
> Introduce three new opcodes for zone-append -
> 
>    IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>    IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>    IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers

I don't like the idea of creating a new set of opcodes for each new flavour.
Did you try to extend write*?

> 
> Repurpose cqe->flags to return zone-relative offset.
> 
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
> ---
>  fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>  include/uapi/linux/io_uring.h |  8 ++++-
>  2 files changed, 77 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 155f3d8..c14c873 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -649,6 +649,10 @@ struct io_kiocb {
>  	unsigned long		fsize;
>  	u64			user_data;
>  	u32			result;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	/* zone-relative offset for append, in bytes */
> +	u32			append_offset;
> +#endif

What is this doing here? It's related to append only.
Should be in struct io_rw or whatever.

>  	u32			sequence;
>  
>  	struct list_head	link_list;
> @@ -875,6 +879,26 @@ static const struct io_op_def io_op_defs[] = {
>  		.hash_reg_file		= 1,
>  		.unbound_nonreg_file	= 1,
>  	},
> +	[IORING_OP_ZONE_APPEND] = {
> +		.needs_mm               = 1,
> +		.needs_file             = 1,
> +		.unbound_nonreg_file    = 1,
> +		.pollout		= 1,
> +	},
> +	[IORING_OP_ZONE_APPENDV] = {
> +	       .async_ctx              = 1,
> +	       .needs_mm               = 1,
> +	       .needs_file             = 1,
> +	       .hash_reg_file          = 1,
> +	       .unbound_nonreg_file    = 1,
> +	       .pollout			= 1,
> +	},
> +	[IORING_OP_ZONE_APPEND_FIXED] = {
> +	       .needs_file             = 1,
> +	       .hash_reg_file          = 1,
> +	       .unbound_nonreg_file    = 1,
> +	       .pollout			= 1,
> +	},
>  };
>  
>  static void io_wq_submit_work(struct io_wq_work **workptr);
> @@ -1285,7 +1309,16 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>  	if (likely(cqe)) {
>  		WRITE_ONCE(cqe->user_data, req->user_data);
>  		WRITE_ONCE(cqe->res, res);
> +#ifdef CONFIG_BLK_DEV_ZONED
> +		if (req->opcode == IORING_OP_ZONE_APPEND ||
> +				req->opcode == IORING_OP_ZONE_APPENDV ||

Nack, don't create special cases in the common path.


> +				req->opcode == IORING_OP_ZONE_APPEND_FIXED)
> +			WRITE_ONCE(cqe->res2, req->append_offset);

Great, a function was asked to return the @cflags arg, but it decides to
overwrite the field with its own stuff...

Why not pass it in @cflags?
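
A minimal sketch of that suggestion (assuming res2 gets plumbed into the common rw completion path, and reusing the 5.8-era helpers; IOCB_ZONE_APPEND is from the earlier patch in this series), which leaves the generic fill path untouched:

	static void io_complete_rw_common(struct kiocb *kiocb, long res, long res2)
	{
		struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
		int cflags = 0;

		if (kiocb->ki_flags & IOCB_WRITE)
			kiocb_end_write(req);
		if (res != req->result)
			req_set_fail_links(req);
		if (kiocb->ki_flags & IOCB_ZONE_APPEND)
			cflags = res2;	/* zone-relative append offset */
		else if (req->flags & REQ_F_BUFFER_SELECTED)
			cflags = io_put_kbuf(req);
		/* the generic fill path then stores cflags verbatim, no opcode checks */
		__io_cqring_add_event(req, res, cflags);
	}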

> +		else
> +			WRITE_ONCE(cqe->flags, cflags);
> +#else
>  		WRITE_ONCE(cqe->flags, cflags);
> +#endif
>  	} else if (ctx->cq_overflow_flushed) {
>  		WRITE_ONCE(ctx->rings->cq_overflow,
>  				atomic_inc_return(&ctx->cached_cq_overflow));
> @@ -1961,6 +1994,9 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
>  static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
>  {
>  	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	req->append_offset = (u32)res2;
> +#endif

Don't create #ifdef hell all over the code. Try to use a function.
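
One common pattern for that, as a sketch (the helper name here is made up):

	#ifdef CONFIG_BLK_DEV_ZONED
	static inline void io_set_append_offset(struct io_kiocb *req, long res2)
	{
		req->append_offset = (u32)res2;
	}
	#else
	static inline void io_set_append_offset(struct io_kiocb *req, long res2)
	{
	}
	#endif

so a caller like io_complete_rw() shrinks to an unconditional
io_set_append_offset(req, res2); with no conditional compilation at any
call site.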


>  
>  	io_complete_rw_common(kiocb, res);
>  	io_put_req(req);
> @@ -1976,6 +2012,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
>  	if (res != req->result)
>  		req_set_fail_links(req);
>  	req->result = res;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	req->append_offset = (u32)res2;
> +#endif
>  	if (res != -EAGAIN)
>  		WRITE_ONCE(req->iopoll_completed, 1);
>  }
> @@ -2408,7 +2447,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
>  	u8 opcode;
>  
>  	opcode = req->opcode;
> -	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
> +	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED ||
> +			opcode == IORING_OP_ZONE_APPEND_FIXED) {
>  		*iovec = NULL;
>  		return io_import_fixed(req, rw, iter);
>  	}
> @@ -2417,7 +2457,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
>  	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
>  		return -EINVAL;
>  
> -	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
> +	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
> +			opcode == IORING_OP_ZONE_APPEND) {
>  		if (req->flags & REQ_F_BUFFER_SELECT) {
>  			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
>  			if (IS_ERR(buf)) {
> @@ -2704,6 +2745,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
>  		req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
>  
>  	req->result = 0;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	req->append_offset = 0;
> +#endif
>  	io_size = ret;
>  	if (req->flags & REQ_F_LINK_HEAD)
>  		req->result = io_size;
> @@ -2738,6 +2782,13 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
>  			__sb_writers_release(file_inode(req->file)->i_sb,
>  						SB_FREEZE_WRITE);
>  		}
> +#ifdef CONFIG_BLK_DEV_ZONED
> +		if (req->opcode == IORING_OP_ZONE_APPEND ||
> +				req->opcode == IORING_OP_ZONE_APPENDV ||
> +				req->opcode == IORING_OP_ZONE_APPEND_FIXED)
> +			kiocb->ki_flags |= IOCB_ZONE_APPEND;
> +#endif
> +
>  		kiocb->ki_flags |= IOCB_WRITE;
>  
>  		if (!force_nonblock)
> @@ -4906,6 +4957,12 @@ static int io_req_defer_prep(struct io_kiocb *req,
>  	case IORING_OP_WRITEV:
>  	case IORING_OP_WRITE_FIXED:
>  	case IORING_OP_WRITE:
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	fallthrough;
> +	case IORING_OP_ZONE_APPEND:
> +	case IORING_OP_ZONE_APPENDV:
> +	case IORING_OP_ZONE_APPEND_FIXED:
> +#endif
>  		ret = io_write_prep(req, sqe, true);
>  		break;
>  	case IORING_OP_POLL_ADD:
> @@ -5038,6 +5095,12 @@ static void io_cleanup_req(struct io_kiocb *req)
>  	case IORING_OP_WRITEV:
>  	case IORING_OP_WRITE_FIXED:
>  	case IORING_OP_WRITE:
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	fallthrough;
> +	case IORING_OP_ZONE_APPEND:
> +	case IORING_OP_ZONE_APPENDV:
> +	case IORING_OP_ZONE_APPEND_FIXED:
> +#endif
>  		if (io->rw.iov != io->rw.fast_iov)
>  			kfree(io->rw.iov);
>  		break;
> @@ -5086,6 +5149,11 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  		}
>  		ret = io_read(req, force_nonblock);
>  		break;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	case IORING_OP_ZONE_APPEND:
> +	case IORING_OP_ZONE_APPENDV:
> +	case IORING_OP_ZONE_APPEND_FIXED:
> +#endif
>  	case IORING_OP_WRITEV:
>  	case IORING_OP_WRITE_FIXED:
>  	case IORING_OP_WRITE:
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c2269..6c8e932 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -130,6 +130,9 @@ enum {
>  	IORING_OP_PROVIDE_BUFFERS,
>  	IORING_OP_REMOVE_BUFFERS,
>  	IORING_OP_TEE,
> +	IORING_OP_ZONE_APPEND,
> +	IORING_OP_ZONE_APPENDV,
> +	IORING_OP_ZONE_APPEND_FIXED,
>  
>  	/* this goes last, obviously */
>  	IORING_OP_LAST,
> @@ -157,7 +160,10 @@ enum {
>  struct io_uring_cqe {
>  	__u64	user_data;	/* sqe->data submission passed back */
>  	__s32	res;		/* result code for this event */
> -	__u32	flags;
> +	union {
> +		__u32	res2; /* res2 like aio, currently used for zone-append */
> +		__u32	flags;
> +	};
>  };
>  
>  /*
>
Damien Le Moal June 18, 2020, 7:39 a.m. UTC | #2
On 2020/06/18 2:27, Kanchan Joshi wrote:
> From: Selvakumar S <selvakuma.s1@samsung.com>
> 
> Introduce three new opcodes for zone-append -
> 
>    IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>    IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>    IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
> 
> Repurpose cqe->flags to return zone-relative offset.
> 
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
> ---
>  fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>  include/uapi/linux/io_uring.h |  8 ++++-
>  2 files changed, 77 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 155f3d8..c14c873 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -649,6 +649,10 @@ struct io_kiocb {
>  	unsigned long		fsize;
>  	u64			user_data;
>  	u32			result;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	/* zone-relative offset for append, in bytes */
> +	u32			append_offset;

this can overflow. u64 is needed.

> +#endif
>  	u32			sequence;
>  
>  	struct list_head	link_list;
> @@ -875,6 +879,26 @@ static const struct io_op_def io_op_defs[] = {
>  		.hash_reg_file		= 1,
>  		.unbound_nonreg_file	= 1,
>  	},
> +	[IORING_OP_ZONE_APPEND] = {
> +		.needs_mm               = 1,
> +		.needs_file             = 1,
> +		.unbound_nonreg_file    = 1,
> +		.pollout		= 1,
> +	},
> +	[IORING_OP_ZONE_APPENDV] = {
> +	       .async_ctx              = 1,
> +	       .needs_mm               = 1,
> +	       .needs_file             = 1,
> +	       .hash_reg_file          = 1,
> +	       .unbound_nonreg_file    = 1,
> +	       .pollout			= 1,
> +	},
> +	[IORING_OP_ZONE_APPEND_FIXED] = {
> +	       .needs_file             = 1,
> +	       .hash_reg_file          = 1,
> +	       .unbound_nonreg_file    = 1,
> +	       .pollout			= 1,
> +	},
>  };
>  
>  static void io_wq_submit_work(struct io_wq_work **workptr);
> @@ -1285,7 +1309,16 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>  	if (likely(cqe)) {
>  		WRITE_ONCE(cqe->user_data, req->user_data);
>  		WRITE_ONCE(cqe->res, res);
> +#ifdef CONFIG_BLK_DEV_ZONED
> +		if (req->opcode == IORING_OP_ZONE_APPEND ||
> +				req->opcode == IORING_OP_ZONE_APPENDV ||
> +				req->opcode == IORING_OP_ZONE_APPEND_FIXED)
> +			WRITE_ONCE(cqe->res2, req->append_offset);
> +		else
> +			WRITE_ONCE(cqe->flags, cflags);
> +#else
>  		WRITE_ONCE(cqe->flags, cflags);
> +#endif
>  	} else if (ctx->cq_overflow_flushed) {
>  		WRITE_ONCE(ctx->rings->cq_overflow,
>  				atomic_inc_return(&ctx->cached_cq_overflow));
> @@ -1961,6 +1994,9 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
>  static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
>  {
>  	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	req->append_offset = (u32)res2;
> +#endif
>  
>  	io_complete_rw_common(kiocb, res);
>  	io_put_req(req);
> @@ -1976,6 +2012,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
>  	if (res != req->result)
>  		req_set_fail_links(req);
>  	req->result = res;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	req->append_offset = (u32)res2;
> +#endif
>  	if (res != -EAGAIN)
>  		WRITE_ONCE(req->iopoll_completed, 1);
>  }
> @@ -2408,7 +2447,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
>  	u8 opcode;
>  
>  	opcode = req->opcode;
> -	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
> +	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED ||
> +			opcode == IORING_OP_ZONE_APPEND_FIXED) {
>  		*iovec = NULL;
>  		return io_import_fixed(req, rw, iter);
>  	}
> @@ -2417,7 +2457,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
>  	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
>  		return -EINVAL;
>  
> -	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
> +	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
> +			opcode == IORING_OP_ZONE_APPEND) {
>  		if (req->flags & REQ_F_BUFFER_SELECT) {
>  			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
>  			if (IS_ERR(buf)) {
> @@ -2704,6 +2745,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
>  		req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
>  
>  	req->result = 0;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	req->append_offset = 0;
> +#endif
>  	io_size = ret;
>  	if (req->flags & REQ_F_LINK_HEAD)
>  		req->result = io_size;
> @@ -2738,6 +2782,13 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
>  			__sb_writers_release(file_inode(req->file)->i_sb,
>  						SB_FREEZE_WRITE);
>  		}
> +#ifdef CONFIG_BLK_DEV_ZONED
> +		if (req->opcode == IORING_OP_ZONE_APPEND ||
> +				req->opcode == IORING_OP_ZONE_APPENDV ||
> +				req->opcode == IORING_OP_ZONE_APPEND_FIXED)
> +			kiocb->ki_flags |= IOCB_ZONE_APPEND;
> +#endif
> +
>  		kiocb->ki_flags |= IOCB_WRITE;
>  
>  		if (!force_nonblock)
> @@ -4906,6 +4957,12 @@ static int io_req_defer_prep(struct io_kiocb *req,
>  	case IORING_OP_WRITEV:
>  	case IORING_OP_WRITE_FIXED:
>  	case IORING_OP_WRITE:
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	fallthrough;
> +	case IORING_OP_ZONE_APPEND:
> +	case IORING_OP_ZONE_APPENDV:
> +	case IORING_OP_ZONE_APPEND_FIXED:
> +#endif
>  		ret = io_write_prep(req, sqe, true);
>  		break;
>  	case IORING_OP_POLL_ADD:
> @@ -5038,6 +5095,12 @@ static void io_cleanup_req(struct io_kiocb *req)
>  	case IORING_OP_WRITEV:
>  	case IORING_OP_WRITE_FIXED:
>  	case IORING_OP_WRITE:
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	fallthrough;
> +	case IORING_OP_ZONE_APPEND:
> +	case IORING_OP_ZONE_APPENDV:
> +	case IORING_OP_ZONE_APPEND_FIXED:
> +#endif
>  		if (io->rw.iov != io->rw.fast_iov)
>  			kfree(io->rw.iov);
>  		break;
> @@ -5086,6 +5149,11 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>  		}
>  		ret = io_read(req, force_nonblock);
>  		break;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	case IORING_OP_ZONE_APPEND:
> +	case IORING_OP_ZONE_APPENDV:
> +	case IORING_OP_ZONE_APPEND_FIXED:
> +#endif
>  	case IORING_OP_WRITEV:
>  	case IORING_OP_WRITE_FIXED:
>  	case IORING_OP_WRITE:
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c2269..6c8e932 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -130,6 +130,9 @@ enum {
>  	IORING_OP_PROVIDE_BUFFERS,
>  	IORING_OP_REMOVE_BUFFERS,
>  	IORING_OP_TEE,
> +	IORING_OP_ZONE_APPEND,
> +	IORING_OP_ZONE_APPENDV,
> +	IORING_OP_ZONE_APPEND_FIXED,
>  
>  	/* this goes last, obviously */
>  	IORING_OP_LAST,
> @@ -157,7 +160,10 @@ enum {
>  struct io_uring_cqe {
>  	__u64	user_data;	/* sqe->data submission passed back */
>  	__s32	res;		/* result code for this event */
> -	__u32	flags;
> +	union {
> +		__u32	res2; /* res2 like aio, currently used for zone-append */
> +		__u32	flags;
> +	};
>  };
>  
>  /*
>
Javier González June 18, 2020, 8:35 a.m. UTC | #3
On 18.06.2020 07:39, Damien Le Moal wrote:
>On 2020/06/18 2:27, Kanchan Joshi wrote:
>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>
>> Introduce three new opcodes for zone-append -
>>
>>    IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>>    IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>    IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>
>> Repurpose cqe->flags to return zone-relative offset.
>>
>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>> ---
>>  fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>>  include/uapi/linux/io_uring.h |  8 ++++-
>>  2 files changed, 77 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 155f3d8..c14c873 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>  	unsigned long		fsize;
>>  	u64			user_data;
>>  	u32			result;
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	/* zone-relative offset for append, in bytes */
>> +	u32			append_offset;
>
>this can overflow. u64 is needed.

We chose to do it this way to start with because struct io_uring_cqe
only has space for u32 when we reuse the flags.

We can of course create a new cqe structure, but that will come with
larger changes to io_uring for supporting append.

Do you believe this is a better approach?

Javier
Damien Le Moal June 18, 2020, 8:47 a.m. UTC | #4
On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
> On 18.06.2020 07:39, Damien Le Moal wrote:
>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>
>>> Introduce three new opcodes for zone-append -
>>>
>>>    IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>>>    IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>    IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>
>>> Repurpose cqe->flags to return zone-relative offset.
>>>
>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>> ---
>>>  fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>>>  include/uapi/linux/io_uring.h |  8 ++++-
>>>  2 files changed, 77 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 155f3d8..c14c873 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>  	unsigned long		fsize;
>>>  	u64			user_data;
>>>  	u32			result;
>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>> +	/* zone-relative offset for append, in bytes */
>>> +	u32			append_offset;
>>
>> this can overflow. u64 is needed.
> 
> We chose to do it this way to start with because struct io_uring_cqe
> only has space for u32 when we reuse the flags.
> 
> We can of course create a new cqe structure, but that will come with
> larger changes to io_uring for supporting append.
> 
> Do you believe this is a better approach?

The problem is that zone sizes are 32 bits in the kernel, as a number of sectors.
So any device that has a zone size smaller than or equal to 2^31 512B sectors can
be accepted. Using a zone-relative offset in bytes for returning the zone append
result is OK-ish, but to match the kernel-supported range of possible zone sizes,
you need 31+9 bits... 32 does not cut it.

Since you need a 64-bit result, I would also prefer that you drop the
zone-relative offset as a result and return the absolute offset instead. That
makes life easier for the applications, since zone append requests must also
use absolute offsets for the zone start. An absolute offset as a result is
then consistent with all other read/write system calls, which use absolute
offsets (seek() is the only one I know of that can use a relative offset, but
that is not an IO system call).
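
Spelling out the arithmetic: the worst case the kernel allows is a zone of
2^31 sectors x 2^9 bytes per sector = 2^40 bytes, so an in-zone byte offset can
need up to 40 bits, while a u32 tops out at 2^32 - 1 bytes (4 GiB), i.e. zones
of at most 2^23 sectors.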


> 
> Javier
>
Javier González June 18, 2020, 9:11 a.m. UTC | #5
On 18.06.2020 08:47, Damien Le Moal wrote:
>On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>> On 18.06.2020 07:39, Damien Le Moal wrote:
>>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>
>>>> Introduce three new opcodes for zone-append -
>>>>
>>>>    IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>>>>    IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>    IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>
>>>> Repurpose cqe->flags to return zone-relative offset.
>>>>
>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>> ---
>>>>  fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>>>>  include/uapi/linux/io_uring.h |  8 ++++-
>>>>  2 files changed, 77 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>> index 155f3d8..c14c873 100644
>>>> --- a/fs/io_uring.c
>>>> +++ b/fs/io_uring.c
>>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>>  	unsigned long		fsize;
>>>>  	u64			user_data;
>>>>  	u32			result;
>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>> +	/* zone-relative offset for append, in bytes */
>>>> +	u32			append_offset;
>>>
>>> this can overflow. u64 is needed.
>>
>> We chose to do it this way to start with because struct io_uring_cqe
>> only has space for u32 when we reuse the flags.
>>
>> We can of course create a new cqe structure, but that will come with
>> larger changes to io_uring for supporting append.
>>
>> Do you believe this is a better approach?
>
>The problem is that zone sizes are 32 bits in the kernel, as a number of sectors.
>So any device that has a zone size smaller or equal to 2^31 512B sectors can be
>accepted. Using a zone relative offset in bytes for returning zone append result
>is OK-ish, but to match the kernel supported range of possible zone size, you
>need 31+9 bits... 32 does not cut it.

Agree. Our initial assumption was that u32 would cover current zone size
requirements, but if this is a no-go, we will take the longer path.

>
>Since you need a 64-bit sized result, I would also prefer that you drop the zone
>relative offset as a result and return the absolute offset instead. That makes
>life easier for the applications since the zone append requests also must use
>absolute offsets for zone start. An absolute offset as a result becomes
>consistent with that and all other read/write system calls that all use absolute
>offsets (seek() is the only one that I know of that can use a relative offset,
>but that is not an IO system call).

Agree. Using relative offsets was a product of reusing the existing u32.
If we move to u64, there is no need to do an extra transformation.

Thanks Damien!
Javier
Javier González June 19, 2020, 9:41 a.m. UTC | #6
Jens,

Would you have time to answer a question below in this thread?

On 18.06.2020 11:11, javier.gonz@samsung.com wrote:
>On 18.06.2020 08:47, Damien Le Moal wrote:
>>On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>>>On 18.06.2020 07:39, Damien Le Moal wrote:
>>>>On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>>>From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>
>>>>>Introduce three new opcodes for zone-append -
>>>>>
>>>>>   IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>>>>>   IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>>   IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>>
>>>>>Repurpose cqe->flags to return zone-relative offset.
>>>>>
>>>>>Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>---
>>>>> fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>>>>> include/uapi/linux/io_uring.h |  8 ++++-
>>>>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>>>>
>>>>>diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>index 155f3d8..c14c873 100644
>>>>>--- a/fs/io_uring.c
>>>>>+++ b/fs/io_uring.c
>>>>>@@ -649,6 +649,10 @@ struct io_kiocb {
>>>>> 	unsigned long		fsize;
>>>>> 	u64			user_data;
>>>>> 	u32			result;
>>>>>+#ifdef CONFIG_BLK_DEV_ZONED
>>>>>+	/* zone-relative offset for append, in bytes */
>>>>>+	u32			append_offset;
>>>>
>>>>this can overflow. u64 is needed.
>>>
>>>We chose to do it this way to start with because struct io_uring_cqe
>>>only has space for u32 when we reuse the flags.
>>>
>>>We can of course create a new cqe structure, but that will come with
>>>larger changes to io_uring for supporting append.
>>>
>>>Do you believe this is a better approach?
>>
>>The problem is that zone sizes are 32 bits in the kernel, as a number of sectors.
>>So any device that has a zone size smaller or equal to 2^31 512B sectors can be
>>accepted. Using a zone relative offset in bytes for returning zone append result
>>is OK-ish, but to match the kernel supported range of possible zone size, you
>>need 31+9 bits... 32 does not cut it.
>
>Agree. Our initial assumption was that u32 would cover current zone size
>requirements, but if this is a no-go, we will take the longer path.

Converting to u64 will require a new version of io_uring_cqe, where we
extend it by at least 32 bits. I believe this will need a whole new allocation
and probably an ioctl().

Is this an acceptable change for you? We will of course add support for
liburing when we agree on the right way to do this.

Thanks,
Javier
Matias Bjorling June 19, 2020, 11:15 a.m. UTC | #7
On 19/06/2020 11.41, javier.gonz@samsung.com wrote:
> Jens,
>
> Would you have time to answer a question below in this thread?
>
> On 18.06.2020 11:11, javier.gonz@samsung.com wrote:
>> On 18.06.2020 08:47, Damien Le Moal wrote:
>>> On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>>>> On 18.06.2020 07:39, Damien Le Moal wrote:
>>>>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>
>>>>>> Introduce three new opcodes for zone-append -
>>>>>>
>>>>>>   IORING_OP_ZONE_APPEND     : non-vectored, similar to 
>>>>>> IORING_OP_WRITE
>>>>>>   IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>>>   IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>>>
>>>>>> Repurpose cqe->flags to return zone-relative offset.
>>>>>>
>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>> ---
>>>>>> fs/io_uring.c                 | 72 
>>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>> include/uapi/linux/io_uring.h |  8 ++++-
>>>>>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>> index 155f3d8..c14c873 100644
>>>>>> --- a/fs/io_uring.c
>>>>>> +++ b/fs/io_uring.c
>>>>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>>>>     unsigned long        fsize;
>>>>>>     u64            user_data;
>>>>>>     u32            result;
>>>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>>>> +    /* zone-relative offset for append, in bytes */
>>>>>> +    u32            append_offset;
>>>>>
>>>>> this can overflow. u64 is needed.
>>>>
>>>> We chose to do it this way to start with because struct io_uring_cqe
>>>> only has space for u32 when we reuse the flags.
>>>>
>>>> We can of course create a new cqe structure, but that will come with
>>>> larger changes to io_uring for supporting append.
>>>>
>>>> Do you believe this is a better approach?
>>>
>>> The problem is that zone sizes are 32 bits in the kernel, as a number 
>>> of sectors.
>>> So any device that has a zone size smaller or equal to 2^31 512B 
>>> sectors can be
>>> accepted. Using a zone relative offset in bytes for returning zone 
>>> append result
>>> is OK-ish, but to match the kernel supported range of possible zone 
>>> size, you
>>> need 31+9 bits... 32 does not cut it.
>>
>> Agree. Our initial assumption was that u32 would cover current zone size
>> requirements, but if this is a no-go, we will take the longer path.
>
> Converting to u64 will require a new version of io_uring_cqe, where we
> extend at least 32 bits. I believe this will need a whole new allocation
> and probably ioctl().
>
> Is this an acceptable change for you? We will of course add support for
> liburing when we agree on the right way to do this.

I took a quick look at the code. I'm no expert, but why not use the existing 
userdata variable? Use the lowest 40 bits for the Zone Starting 
LBA, and the highest 24 bits as an index into the completion data 
structure?

If you want to pass the memory address (same as what fio does) for the 
data structure used for completion, one may also play some tricks by 
using a relative memory address to the data structure. For example, the 
x86_64 architecture uses 48 address bits for its memory addresses. With 
24 bit, one can allocate the completion entries in a 32MB memory range, 
and then use base_address + index to get back to the completion data 
structure specified in the sqe.
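
A sketch of that packing (purely illustrative; the 40/24 split and the helper
names are just the proposal above):

	#include <stdint.h>

	#define ZA_LBA_BITS	40
	#define ZA_LBA_MASK	((1ULL << ZA_LBA_BITS) - 1)

	/* submission: fold the zone start LBA and a completion-table index
	 * into the single 64-bit sqe->user_data */
	static inline uint64_t za_pack(uint64_t zone_start_lba, uint32_t idx)
	{
		return (zone_start_lba & ZA_LBA_MASK) | ((uint64_t)idx << ZA_LBA_BITS);
	}

	/* completion: recover both halves from cqe->user_data */
	static inline uint64_t za_lba(uint64_t user_data)
	{
		return user_data & ZA_LBA_MASK;
	}

	static inline uint32_t za_idx(uint64_t user_data)
	{
		return (uint32_t)(user_data >> ZA_LBA_BITS);
	}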

Best, Matias
Jens Axboe June 19, 2020, 2:15 p.m. UTC | #8
On 6/19/20 3:41 AM, javier.gonz@samsung.com wrote:
> Jens,
> 
> Would you have time to answer a question below in this thread?
> 
> On 18.06.2020 11:11, javier.gonz@samsung.com wrote:
>> On 18.06.2020 08:47, Damien Le Moal wrote:
>>> On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>>>> On 18.06.2020 07:39, Damien Le Moal wrote:
>>>>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>
>>>>>> Introduce three new opcodes for zone-append -
>>>>>>
>>>>>>   IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>>>>>>   IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>>>   IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>>>
>>>>>> Repurpose cqe->flags to return zone-relative offset.
>>>>>>
>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>> ---
>>>>>> fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>>>>>> include/uapi/linux/io_uring.h |  8 ++++-
>>>>>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>> index 155f3d8..c14c873 100644
>>>>>> --- a/fs/io_uring.c
>>>>>> +++ b/fs/io_uring.c
>>>>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>>>> 	unsigned long		fsize;
>>>>>> 	u64			user_data;
>>>>>> 	u32			result;
>>>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>>>> +	/* zone-relative offset for append, in bytes */
>>>>>> +	u32			append_offset;
>>>>>
>>>>> this can overflow. u64 is needed.
>>>>
>>>> We chose to do it this way to start with because struct io_uring_cqe
>>>> only has space for u32 when we reuse the flags.
>>>>
>>>> We can of course create a new cqe structure, but that will come with
>>>> larger changes to io_uring for supporting append.
>>>>
>>>> Do you believe this is a better approach?
>>>
>>> The problem is that zone sizes are 32 bits in the kernel, as a number
>>> of sectors.  So any device that has a zone size smaller or equal to
>>> 2^31 512B sectors can be accepted. Using a zone relative offset in
>>> bytes for returning zone append result is OK-ish, but to match the
>>> kernel supported range of possible zone size, you need 31+9 bits...
>>> 32 does not cut it.
>>
>> Agree. Our initial assumption was that u32 would cover current zone size
>> requirements, but if this is a no-go, we will take the longer path.
> 
> Converting to u64 will require a new version of io_uring_cqe, where we
> extend at least 32 bits. I believe this will need a whole new allocation
> and probably ioctl().
> 
> Is this an acceptable change for you? We will of course add support for
> liburing when we agree on the right way to do this.

If you need 64 bits of return value, then it's not going to work. Even
with the existing patches, reusing cqe->flags isn't going to fly, as
it would conflict with eg doing zone append writes with automatic
buffer selection.

We're not changing the io_uring_cqe. It's important to keep it lean, and
any other request type is generally fine with 64-bit tag + 32-bit result
(and 32-bit flags on the side) for completions.

The only viable alternative I see would be to provide an area to store this
information, and pass in a pointer to this at submission time through
the sqe. One issue I do see with that is if we only have this
information available at completion time, then we'd need some async punt
to copy it to user space... Generally not ideal.

A hackier approach would be to play some tricks with cqe->res and
cqe->flags, setting aside a flag to denote an extension of cqe->res.
That would mean excluding zone append (etc) from using buffer selection,
which probably isn't a huge deal. It'd be more problematic for any other
future flags. But if you just need 40 bits, then it could certainly
work. Right now, if cqe->flags & 1 is set, then (cqe->flags >> 16) is
the buffer ID. You could define IORING_CQE_F_ZONE_FOO to be bit 1, so
that:

	uint64_t val = cqe->res; // assuming non-error here

	if (cqe->flags & IORING_CQE_F_ZONE_FOO)
		val |= (uint64_t)(cqe->flags >> 16) << 32;

and hence use the upper 16 bits of cqe->flags for the upper bits of your
(then) 48-bit total value.
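
The encoding side of that scheme would then look roughly like this (sketch
only; the flag name, bit position, and helper are hypothetical, and the value
must fit in 48 bits):

	/* sketch: store a 48-bit offset across res (low 32) and flags[31:16] */
	static void io_cqring_fill_zone_append(struct io_uring_cqe *cqe, u64 off)
	{
		WRITE_ONCE(cqe->res, (u32)off);
		WRITE_ONCE(cqe->flags,
			   IORING_CQE_F_ZONE_FOO | ((u32)(off >> 32) << 16));
	}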
Jens Axboe June 19, 2020, 2:18 p.m. UTC | #9
On 6/19/20 5:15 AM, Matias Bjørling wrote:
> On 19/06/2020 11.41, javier.gonz@samsung.com wrote:
>> Jens,
>>
>> Would you have time to answer a question below in this thread?
>>
>> On 18.06.2020 11:11, javier.gonz@samsung.com wrote:
>>> On 18.06.2020 08:47, Damien Le Moal wrote:
>>>> On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>>>>> On 18.06.2020 07:39, Damien Le Moal wrote:
>>>>>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>>
>>>>>>> Introduce three new opcodes for zone-append -
>>>>>>>
>>>>>>>   IORING_OP_ZONE_APPEND     : non-vectored, similar to 
>>>>>>> IORING_OP_WRITE
>>>>>>>   IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>>>>   IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>>>>
>>>>>>> Repurpose cqe->flags to return zone-relative offset.
>>>>>>>
>>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>>> ---
>>>>>>> fs/io_uring.c                 | 72 
>>>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>>> include/uapi/linux/io_uring.h |  8 ++++-
>>>>>>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>> index 155f3d8..c14c873 100644
>>>>>>> --- a/fs/io_uring.c
>>>>>>> +++ b/fs/io_uring.c
>>>>>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>>>>>     unsigned long        fsize;
>>>>>>>     u64            user_data;
>>>>>>>     u32            result;
>>>>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>>>>> +    /* zone-relative offset for append, in bytes */
>>>>>>> +    u32            append_offset;
>>>>>>
>>>>>> this can overflow. u64 is needed.
>>>>>
>>>>> We chose to do it this way to start with because struct io_uring_cqe
>>>>> only has space for u32 when we reuse the flags.
>>>>>
>>>>> We can of course create a new cqe structure, but that will come with
>>>>> larger changes to io_uring for supporting append.
>>>>>
>>>>> Do you believe this is a better approach?
>>>>
>>>> The problem is that zone sizes are 32 bits in the kernel, as a number 
>>>> of sectors.
>>>> So any device that has a zone size smaller or equal to 2^31 512B 
>>>> sectors can be
>>>> accepted. Using a zone relative offset in bytes for returning zone 
>>>> append result
>>>> is OK-ish, but to match the kernel supported range of possible zone 
>>>> size, you
>>>> need 31+9 bits... 32 does not cut it.
>>>
>>> Agree. Our initial assumption was that u32 would cover current zone size
>>> requirements, but if this is a no-go, we will take the longer path.
>>
>> Converting to u64 will require a new version of io_uring_cqe, where we
>> extend at least 32 bits. I believe this will need a whole new allocation
>> and probably ioctl().
>>
>> Is this an acceptable change for you? We will of course add support for
>> liburing when we agree on the right way to do this.
> 
> I took a quick look at the code. No expert, but why not use the existing 
> userdata variable? use the lowest bits (40 bits) for the Zone Starting 
> LBA, and use the highest (24 bits) as index into the completion data 
> structure?
> 
> If you want to pass the memory address (same as what fio does) for the 
> data structure used for completion, one may also play some tricks by 
> using a relative memory address to the data structure. For example, the 
> x86_64 architecture uses 48 address bits for its memory addresses. With 
> 24 bit, one can allocate the completion entries in a 32MB memory range, 
> and then use base_address + index to get back to the completion data 
> structure specified in the sqe.

For any current request, sqe->user_data is just provided back as
cqe->user_data. This would make these requests behave differently
from everything else in that sense, which seems very confusing to me
if I were an application writer.

But generally I do agree with you, there are lots of ways to make
< 64-bit work as a tag without losing anything or having to jump
through hoops to do so. The lack of consistency introduced by having
zone append work differently is ugly, though.
Pavel Begunkov June 19, 2020, 2:59 p.m. UTC | #10
On 19/06/2020 17:15, Jens Axboe wrote:
> On 6/19/20 3:41 AM, javier.gonz@samsung.com wrote:
>> Jens,
>>
>> Would you have time to answer a question below in this thread?
>>
>> On 18.06.2020 11:11, javier.gonz@samsung.com wrote:
>>> On 18.06.2020 08:47, Damien Le Moal wrote:
>>>> On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>>>>> On 18.06.2020 07:39, Damien Le Moal wrote:
>>>>>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>>
>>>>>>> Introduce three new opcodes for zone-append -
>>>>>>>
>>>>>>>   IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>>>>>>>   IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>>>>   IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>>>>
>>>>>>> Repurpose cqe->flags to return zone-relative offset.
>>>>>>>
>>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>>> ---
>>>>>>> fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>>>>>>> include/uapi/linux/io_uring.h |  8 ++++-
>>>>>>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>> index 155f3d8..c14c873 100644
>>>>>>> --- a/fs/io_uring.c
>>>>>>> +++ b/fs/io_uring.c
>>>>>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>>>>> 	unsigned long		fsize;
>>>>>>> 	u64			user_data;
>>>>>>> 	u32			result;
>>>>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>>>>> +	/* zone-relative offset for append, in bytes */
>>>>>>> +	u32			append_offset;
>>>>>>
>>>>>> this can overflow. u64 is needed.
>>>>>
>>>>> We chose to do it this way to start with because struct io_uring_cqe
>>>>> only has space for u32 when we reuse the flags.
>>>>>
>>>>> We can of course create a new cqe structure, but that will come with
>>>>> larger changes to io_uring for supporting append.
>>>>>
>>>>> Do you believe this is a better approach?
>>>>
>>>> The problem is that zone sizes are 32 bits in the kernel, as a number
>>>> of sectors.  So any device that has a zone size smaller or equal to
>>>> 2^31 512B sectors can be accepted. Using a zone relative offset in
>>>> bytes for returning zone append result is OK-ish, but to match the
>>>> kernel supported range of possible zone size, you need 31+9 bits...
>>>> 32 does not cut it.
>>>
>>> Agree. Our initial assumption was that u32 would cover current zone size
>>> requirements, but if this is a no-go, we will take the longer path.
>>
>> Converting to u64 will require a new version of io_uring_cqe, where we
>> extend at least 32 bits. I believe this will need a whole new allocation
>> and probably ioctl().
>>
>> Is this an acceptable change for you? We will of course add support for
>> liburing when we agree on the right way to do this.
> 
> If you need 64 bits of return value, then it's not going to work. Even
> with the existing patches, reusing cqe->flags isn't going to fly, as
> it would conflict with eg doing zone append writes with automatic
> buffer selection.

Buffer selection is for reads/recv kind of requests, but appends
are writes. In theory they can co-exist using cqe->flags.


> 
> We're not changing the io_uring_cqe. It's important to keep it lean, and
> any other request type is generally fine with 64-bit tag + 32-bit result
> (and 32-bit flags on the side) for completions.
> 
> The only viable alternative I see would be to provide an area to store this
> information, and pass in a pointer to this at submission time through
> the sqe. One issue I do see with that is if we only have this
> information available at completion time, then we'd need some async punt
> to copy it to user space... Generally not ideal.
> 
> A hackier approach would be to play some tricks with cqe->res and
> cqe->flags, setting aside a flag to denote an extension of cqe->res.
> That would mean excluding zone append (etc) from using buffer selection,
> which probably isn't a huge deal. It'd be more problematic for any other
> future flags. But if you just need 40 bits, then it could certainly
> work. Right now, if cqe->flags & 1 is set, then (cqe->flags >> 16) is
> the buffer ID. You could define IORING_CQE_F_ZONE_FOO to be bit 1, so
> that:
> 
> 	uint64_t val = cqe->res; // assuming non-error here
> 
> 	if (cqe->flags & IORING_CQE_F_ZONE_FOO)
> 		val |= (uint64_t)(cqe->flags >> 16) << 32;
> 
> and hence use the upper 16 bits of cqe->flags for the upper bits of your
> (then) 48-bit total value.

How about returning the offset in terms of 512-byte chunks? NVMe is 512B
atomic/aligned. We'd lose the ability to do non-512-byte-aligned appends, but
it won't hit the media as such anyway (it will be padded or cached), so
personally I don't see much benefit in having it.
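
That would stretch a u32 to 2^32 x 512 B = 2 TiB of in-zone offset, comfortably
covering the 40-bit worst case discussed above; the conversion is a shift on
both sides (sketch; variable names are made up):

	/* kernel side: report the append offset in 512 B units */
	u32 off_sectors = (u32)(append_off_bytes >> 9);

	/* userspace side: back to a byte offset */
	uint64_t off_bytes = (uint64_t)cqe->flags << 9;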
Jens Axboe June 19, 2020, 3:02 p.m. UTC | #11
On 6/19/20 8:59 AM, Pavel Begunkov wrote:
> On 19/06/2020 17:15, Jens Axboe wrote:
>> On 6/19/20 3:41 AM, javier.gonz@samsung.com wrote:
>>> Jens,
>>>
>>> Would you have time to answer a question below in this thread?
>>>
>>> On 18.06.2020 11:11, javier.gonz@samsung.com wrote:
>>>> On 18.06.2020 08:47, Damien Le Moal wrote:
>>>>> On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>>>>>> On 18.06.2020 07:39, Damien Le Moal wrote:
>>>>>>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>>>
>>>>>>>> Introduce three new opcodes for zone-append -
>>>>>>>>
>>>>>>>>   IORING_OP_ZONE_APPEND     : non-vectored, similar to IORING_OP_WRITE
>>>>>>>>   IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>>>>>   IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>>>>>
>>>>>>>> Repurpose cqe->flags to return zone-relative offset.
>>>>>>>>
>>>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>>>> ---
>>>>>>>> fs/io_uring.c                 | 72 +++++++++++++++++++++++++++++++++++++++++--
>>>>>>>> include/uapi/linux/io_uring.h |  8 ++++-
>>>>>>>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>> index 155f3d8..c14c873 100644
>>>>>>>> --- a/fs/io_uring.c
>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>>>>>> 	unsigned long		fsize;
>>>>>>>> 	u64			user_data;
>>>>>>>> 	u32			result;
>>>>>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>>>>>> +	/* zone-relative offset for append, in bytes */
>>>>>>>> +	u32			append_offset;
>>>>>>>
>>>>>>> this can overflow. u64 is needed.
>>>>>>
>>>>>> We chose to do it this way to start with because struct io_uring_cqe
>>>>>> only has space for u32 when we reuse the flags.
>>>>>>
>>>>>> We can of course create a new cqe structure, but that will come with
>>>>>> larger changes to io_uring for supporting append.
>>>>>>
>>>>>> Do you believe this is a better approach?
>>>>>
>>>>> The problem is that zone sizes are 32 bits in the kernel, as a number
>>>>> of sectors.  So any device that has a zone size smaller or equal to
>>>>> 2^31 512B sectors can be accepted. Using a zone relative offset in
>>>>> bytes for returning zone append result is OK-ish, but to match the
>>>>> kernel supported range of possible zone size, you need 31+9 bits...
>>>>> 32 does not cut it.
>>>>
>>>> Agree. Our initial assumption was that u32 would cover current zone size
>>>> requirements, but if this is a no-go, we will take the longer path.
>>>
>>> Converting to u64 will require a new version of io_uring_cqe, where we
>>> extend at least 32 bits. I believe this will need a whole new allocation
>>> and probably ioctl().
>>>
>>> Is this an acceptable change for you? We will of course add support for
>>> liburing when we agree on the right way to do this.
>>
>> If you need 64 bits of return value, then it's not going to work. Even
>> with the existing patches, reusing cqe->flags isn't going to fly, as
>> it would conflict with eg doing zone append writes with automatic
>> buffer selection.
> 
> Buffer selection is for reads/recv kind of requests, but appends
> are writes. In theory they can co-exist using cqe->flags.

Yeah, good point: since it's just writes, it doesn't matter. But the other
point still stands, it could potentially conflict with other flags, but
I guess only to the extent where both flags would need extra storage in
->flags. So not a huge concern imho.
Matias Bjorling June 19, 2020, 3:14 p.m. UTC | #12
On 19/06/2020 16.18, Jens Axboe wrote:
> On 6/19/20 5:15 AM, Matias Bjørling wrote:
>> On 19/06/2020 11.41, javier.gonz@samsung.com wrote:
>>> Jens,
>>>
>>> Would you have time to answer a question below in this thread?
>>>
>>> On 18.06.2020 11:11, javier.gonz@samsung.com wrote:
>>>> On 18.06.2020 08:47, Damien Le Moal wrote:
>>>>> On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>>>>>> On 18.06.2020 07:39, Damien Le Moal wrote:
>>>>>>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>>>
>>>>>>>> Introduce three new opcodes for zone-append -
>>>>>>>>
>>>>>>>>    IORING_OP_ZONE_APPEND     : non-vectored, similar to
>>>>>>>> IORING_OP_WRITE
>>>>>>>>    IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>>>>>    IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>>>>>
>>>>>>>> Repurpose cqe->flags to return zone-relative offset.
>>>>>>>>
>>>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>>>> ---
>>>>>>>> fs/io_uring.c                 | 72
>>>>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>>>> include/uapi/linux/io_uring.h |  8 ++++-
>>>>>>>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>> index 155f3d8..c14c873 100644
>>>>>>>> --- a/fs/io_uring.c
>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>>>>>>      unsigned long        fsize;
>>>>>>>>      u64            user_data;
>>>>>>>>      u32            result;
>>>>>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>>>>>> +    /* zone-relative offset for append, in bytes */
>>>>>>>> +    u32            append_offset;
>>>>>>> this can overflow. u64 is needed.
>>>>>> We chose to do it this way to start with because struct io_uring_cqe
>>>>>> only has space for u32 when we reuse the flags.
>>>>>>
>>>>>> We can of course create a new cqe structure, but that will come with
>>>>>> larger changes to io_uring for supporting append.
>>>>>>
>>>>>> Do you believe this is a better approach?
>>>>> The problem is that zone sizes are 32 bits in the kernel, as a number
>>>>> of sectors.
>>>>> So any device that has a zone size smaller or equal to 2^31 512B
>>>>> sectors can be
>>>>> accepted. Using a zone relative offset in bytes for returning zone
>>>>> append result
>>>>> is OK-ish, but to match the kernel supported range of possible zone
>>>>> size, you
>>>>> need 31+9 bits... 32 does not cut it.
>>>> Agree. Our initial assumption was that u32 would cover current zone size
>>>> requirements, but if this is a no-go, we will take the longer path.
>>> Converting to u64 will require a new version of io_uring_cqe, where we
>>> extend at least 32 bits. I believe this will need a whole new allocation
>>> and probably ioctl().
>>>
>>> Is this an acceptable change for you? We will of course add support for
>>> liburing when we agree on the right way to do this.
>> I took a quick look at the code. No expert, but why not use the existing
>> userdata variable? use the lowest bits (40 bits) for the Zone Starting
>> LBA, and use the highest (24 bits) as index into the completion data
>> structure?
>>
>> If you want to pass the memory address (same as what fio does) for the
>> data structure used for completion, one may also play some tricks by
>> using a relative memory address to the data structure. For example, the
>> x86_64 architecture uses 48 address bits for its memory addresses. With
>> 24 bit, one can allocate the completion entries in a 32MB memory range,
>> and then use base_address + index to get back to the completion data
>> structure specified in the sqe.
> For any current request, sqe->user_data is just provided back as
> cqe->user_data. This would make these requests behave differently
> from everything else in that sense, which seems very confusing to me
> if I was an application writer.
>
> But generally I do agree with you, there are lots of ways to make
> < 64-bit work as a tag without losing anything or having to jump
> through hoops to do so. The lack of consistency introduced by having
> zone append work differently is ugly, though.
>
Yep, agree, and extending to three cachelines is a big no-go. We could add 
a flag that says the kernel has changed the userdata variable. That'll 
make it very explicit.
Jens Axboe June 19, 2020, 3:20 p.m. UTC | #13
On 6/19/20 9:14 AM, Matias Bjørling wrote:
> On 19/06/2020 16.18, Jens Axboe wrote:
>> On 6/19/20 5:15 AM, Matias Bjørling wrote:
>>> On 19/06/2020 11.41, javier.gonz@samsung.com wrote:
>>>> Jens,
>>>>
>>>> Would you have time to answer a question below in this thread?
>>>>
>>>> On 18.06.2020 11:11, javier.gonz@samsung.com wrote:
>>>>> On 18.06.2020 08:47, Damien Le Moal wrote:
>>>>>> On 2020/06/18 17:35, javier.gonz@samsung.com wrote:
>>>>>>> On 18.06.2020 07:39, Damien Le Moal wrote:
>>>>>>>> On 2020/06/18 2:27, Kanchan Joshi wrote:
>>>>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>>>>
>>>>>>>>> Introduce three new opcodes for zone-append -
>>>>>>>>>
>>>>>>>>>    IORING_OP_ZONE_APPEND     : non-vectored, similar to
>>>>>>>>> IORING_OP_WRITE
>>>>>>>>>    IORING_OP_ZONE_APPENDV    : vectored, similar to IORING_OP_WRITEV
>>>>>>>>>    IORING_OP_ZONE_APPEND_FIXED : append using fixed-buffers
>>>>>>>>>
>>>>>>>>> Repurpose cqe->flags to return zone-relative offset.
>>>>>>>>>
>>>>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>>>>> ---
>>>>>>>>> fs/io_uring.c                 | 72
>>>>>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>>>>> include/uapi/linux/io_uring.h |  8 ++++-
>>>>>>>>> 2 files changed, 77 insertions(+), 3 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>>> index 155f3d8..c14c873 100644
>>>>>>>>> --- a/fs/io_uring.c
>>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>>> @@ -649,6 +649,10 @@ struct io_kiocb {
>>>>>>>>>      unsigned long        fsize;
>>>>>>>>>      u64            user_data;
>>>>>>>>>      u32            result;
>>>>>>>>> +#ifdef CONFIG_BLK_DEV_ZONED
>>>>>>>>> +    /* zone-relative offset for append, in bytes */
>>>>>>>>> +    u32            append_offset;
>>>>>>>> this can overflow. u64 is needed.
>>>>>>> We chose to do it this way to start with because struct io_uring_cqe
>>>>>>> only has space for u32 when we reuse the flags.
>>>>>>>
>>>>>>> We can of course create a new cqe structure, but that will come with
>>>>>>> larger changes to io_uring for supporting append.
>>>>>>>
>>>>>>> Do you believe this is a better approach?
>>>>>> The problem is that zone sizes are 32 bits in the kernel, as a number
>>>>>> of sectors.
>>>>>> So any device that has a zone size smaller or equal to 2^31 512B
>>>>>> sectors can be
>>>>>> accepted. Using a zone relative offset in bytes for returning zone
>>>>>> append result
>>>>>> is OK-ish, but to match the kernel supported range of possible zone
>>>>>> size, you
>>>>>> need 31+9 bits... 32 does not cut it.
>>>>> Agree. Our initial assumption was that u32 would cover current zone size
>>>>> requirements, but if this is a no-go, we will take the longer path.
>>>> Converting to u64 will require a new version of io_uring_cqe, where we
>>>> extend at least 32 bits. I believe this will need a whole new allocation
>>>> and probably ioctl().
>>>>
>>>> Is this an acceptable change for you? We will of course add support for
>>>> liburing when we agree on the right way to do this.
>>> I took a quick look at the code. No expert, but why not use the existing
>>> userdata variable? use the lowest bits (40 bits) for the Zone Starting
>>> LBA, and use the highest (24 bits) as index into the completion data
>>> structure?
>>>
>>> If you want to pass the memory address (same as what fio does) for the
>>> data structure used for completion, one may also play some tricks by
>>> using a relative memory address to the data structure. For example, the
>>> x86_64 architecture uses 48 address bits for its memory addresses. With
>>> 24 bit, one can allocate the completion entries in a 32MB memory range,
>>> and then use base_address + index to get back to the completion data
>>> structure specified in the sqe.
>> For any current request, sqe->user_data is just provided back as
>> cqe->user_data. This would make these requests behave differently
>> from everything else in that sense, which seems very confusing to me
>> if I was an application writer.
>>
>> But generally I do agree with you, there are lots of ways to make
>> < 64-bit work as a tag without losing anything or having to jump
>> through hoops to do so. The lack of consistency introduced by having
>> zone append work differently is ugly, though.
>>
> Yep, agree, and extending to three cachelines is a big no-go. We could add 
> a flag that says the kernel has changed the userdata variable. That'll 
> make it very explicit.

Don't like that either, as it doesn't really change the fact that you're
now doing something very different with the user_data field, which is
just supposed to be passed in/out directly. Adding a random flag to
signal this behavior isn't very explicit either, imho. It's still some
out-of-band (ish) notification of behavior that is different from any
other command. This is very different from having a flag that says
"there's extra information in this other field", which is much cleaner.
Matias Bjorling June 19, 2020, 3:40 p.m. UTC | #14
On 19/06/2020 17.20, Jens Axboe wrote:
> [...]
> Don't like that either, as it doesn't really change the fact that you're
> now doing something very different with the user_data field, which is
> just supposed to be passed in/out directly. Adding a random flag to
> signal this behavior isn't very explicit either, imho. It's still some
> out-of-band (ish) notification of behavior that is different from any
> other command. This is very different from having a flag that says
> "there's extra information in this other field", which is much cleaner.
>
Ok. Then it's pulling in the bits from cqe->res and cqe->flags that you 
mention in the other mail. Sounds good.
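
What Matias is agreeing to, sketched from the userspace side: pull the
extra bits from cqe->res and cqe->flags to form one 64-bit result. The
split below is assumed for illustration; the thread had not fixed an
encoding at this point.

/* Hypothetical decode, assuming the low 32 bits land in cqe->res and
 * the high 32 bits in cqe->flags. */
#include <stdint.h>

struct zbd_cqe {		/* stand-in for struct io_uring_cqe */
	uint64_t user_data;
	int32_t  res;
	uint32_t flags;
};

static inline uint64_t append_offset(const struct zbd_cqe *cqe)
{
	return ((uint64_t)cqe->flags << 32) | (uint32_t)cqe->res;
}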
Jens Axboe June 19, 2020, 3:44 p.m. UTC | #15
On 6/19/20 9:40 AM, Matias Bjørling wrote:
> [...]
> Ok. Then it's pulling in the bits from cqe->res and cqe->flags that you 
> mention in the other mail. Sounds good.

I think that's the best approach, if we need > 32-bits. Maybe we can get
by just using ->res, if we switch to multiples of 512b instead for the
result like Pavel suggested. That'd provide enough room in ->res, and
would be preferable imho. But if we do need > 32-bits, then we can use
this approach.
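
A sketch of the ->res-only variant Jens prefers here, assuming the kernel
reported the offset in 512-byte units (an assumption for illustration;
the patch as posted returns bytes):

/* A 512B-granular offset in cqe->res covers offsets up to 2^31 - 1
 * sectors (just under 2^40 bytes), matching the kernel's maximum zone
 * size, while negative values stay available for -errno. */
#include <stdint.h>

static inline int64_t append_offset_bytes(int32_t res)
{
	if (res < 0)
		return res;		/* error, pass through */
	return (int64_t)res << 9;	/* sectors -> bytes */
}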
Javier González June 21, 2020, 6:52 p.m. UTC | #16
On 19.06.2020 09:02, Jens Axboe wrote:
>On 6/19/20 8:59 AM, Pavel Begunkov wrote:
>> On 19/06/2020 17:15, Jens Axboe wrote:
>>> On 6/19/20 3:41 AM, javier.gonz@samsung.com wrote:
>>>> [...]
>>> If you need 64 bits of return value, then it's not going to work. Even
>>> with the existing patches, reusing cqe->flags isn't going to fly, as
>>> it would conflict with, e.g., doing zone append writes with automatic
>>> buffer selection.
>>
>> Buffer selection is for reads/recv kind of requests, but appends
>> are writes. In theory they can co-exist using cqe->flags.
>
>Yeah good point, since it's just writes, doesn't matter. But the other
>point still stands, it could potentially conflict with other flags, but
>I guess only to the extent where both flags would need extra storage in
>->flags. So not a huge concern imho.

Very good point Pavel!

If co-existing with the current flags is an option, I'll explore this
for the next version.

Thanks Jens and Pavel for the time and ideas!

Javier
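
For reference on the buffer-selection overlap: cqe->flags already encodes
the selected buffer ID when IORING_CQE_F_BUFFER is set, so only the
remaining bits could be repurposed. Constants as in the uapi header of
this era:

/* Existing cqe->flags layout; repurposing all 32 bits would collide
 * with buffer selection. */
#include <stdint.h>

#define IORING_CQE_F_BUFFER	(1U << 0)
#define IORING_CQE_BUFFER_SHIFT	16

static inline uint16_t cqe_buffer_id(uint32_t flags)
{
	/* Only meaningful when IORING_CQE_F_BUFFER is set. */
	return flags >> IORING_CQE_BUFFER_SHIFT;
}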
Javier González June 21, 2020, 6:55 p.m. UTC | #17
On 19.06.2020 09:44, Jens Axboe wrote:
>On 6/19/20 9:40 AM, Matias Bjørling wrote:
>> On 19/06/2020 17.20, Jens Axboe wrote:
>>> [...]
>> Ok. Then it's pulling in the bits from cqe->res and cqe->flags that you
>> mention in the other mail. Sounds good.
>
>I think that's the best approach, if we need > 32-bits. Maybe we can get
>by just using ->res, if we switch to multiples of 512b instead for the
>result like Pavel suggested. That'd provide enough room in ->res, and
>would be preferable imho. But if we do need > 32-bits, then we can use
>this approach.

Sounds good.

Thanks Matias too for chipping in with more ideas. We have enough for a
v2.

Javier
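
For completeness, a hypothetical consumer of the interface exactly as
posted in this version, where the zone-relative offset overlays
cqe->flags via the res2 union in the diff below (per the discussion
above, v2 will change this encoding):

/* Hypothetical userspace use of this version's encoding: on success,
 * the u32 zone-relative offset arrives in the flags/res2 slot. */
#include <liburing.h>
#include <stdio.h>

static void reap_append(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;

	if (io_uring_wait_cqe(ring, &cqe) < 0)
		return;
	if (cqe->res >= 0)	/* res = number of bytes appended */
		printf("appended at zone-relative offset %u\n", cqe->flags);
	io_uring_cqe_seen(ring, cqe);
}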
Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 155f3d8..c14c873 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -649,6 +649,10 @@  struct io_kiocb {
 	unsigned long		fsize;
 	u64			user_data;
 	u32			result;
+#ifdef CONFIG_BLK_DEV_ZONED
+	/* zone-relative offset for append, in bytes */
+	u32			append_offset;
+#endif
 	u32			sequence;
 
 	struct list_head	link_list;
@@ -875,6 +879,26 @@  static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 	},
+	[IORING_OP_ZONE_APPEND] = {
+		.needs_mm               = 1,
+		.needs_file             = 1,
+		.unbound_nonreg_file    = 1,
+		.pollout		= 1,
+	},
+	[IORING_OP_ZONE_APPENDV] = {
+	       .async_ctx              = 1,
+	       .needs_mm               = 1,
+	       .needs_file             = 1,
+	       .hash_reg_file          = 1,
+	       .unbound_nonreg_file    = 1,
+	       .pollout			= 1,
+	},
+	[IORING_OP_ZONE_APPEND_FIXED] = {
+	       .needs_file             = 1,
+	       .hash_reg_file          = 1,
+	       .unbound_nonreg_file    = 1,
+	       .pollout			= 1,
+	},
 };
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -1285,7 +1309,16 @@  static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 	if (likely(cqe)) {
 		WRITE_ONCE(cqe->user_data, req->user_data);
 		WRITE_ONCE(cqe->res, res);
+#ifdef CONFIG_BLK_DEV_ZONED
+		if (req->opcode == IORING_OP_ZONE_APPEND ||
+				req->opcode == IORING_OP_ZONE_APPENDV ||
+				req->opcode == IORING_OP_ZONE_APPEND_FIXED)
+			WRITE_ONCE(cqe->res2, req->append_offset);
+		else
+			WRITE_ONCE(cqe->flags, cflags);
+#else
 		WRITE_ONCE(cqe->flags, cflags);
+#endif
 	} else if (ctx->cq_overflow_flushed) {
 		WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1961,6 +1994,9 @@  static void io_complete_rw_common(struct kiocb *kiocb, long res)
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+#ifdef CONFIG_BLK_DEV_ZONED
+	req->append_offset = (u32)res2;
+#endif
 
 	io_complete_rw_common(kiocb, res);
 	io_put_req(req);
@@ -1976,6 +2012,9 @@  static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 	if (res != req->result)
 		req_set_fail_links(req);
 	req->result = res;
+#ifdef CONFIG_BLK_DEV_ZONED
+	req->append_offset = (u32)res2;
+#endif
 	if (res != -EAGAIN)
 		WRITE_ONCE(req->iopoll_completed, 1);
 }
@@ -2408,7 +2447,8 @@  static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	u8 opcode;
 
 	opcode = req->opcode;
-	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
+	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED ||
+			opcode == IORING_OP_ZONE_APPEND_FIXED) {
 		*iovec = NULL;
 		return io_import_fixed(req, rw, iter);
 	}
@@ -2417,7 +2457,8 @@  static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
 		return -EINVAL;
 
-	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
+	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
+			opcode == IORING_OP_ZONE_APPEND) {
 		if (req->flags & REQ_F_BUFFER_SELECT) {
 			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
 			if (IS_ERR(buf)) {
@@ -2704,6 +2745,9 @@  static int io_write(struct io_kiocb *req, bool force_nonblock)
 		req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
 
 	req->result = 0;
+#ifdef CONFIG_BLK_DEV_ZONED
+	req->append_offset = 0;
+#endif
 	io_size = ret;
 	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
@@ -2738,6 +2782,13 @@  static int io_write(struct io_kiocb *req, bool force_nonblock)
 			__sb_writers_release(file_inode(req->file)->i_sb,
 						SB_FREEZE_WRITE);
 		}
+#ifdef CONFIG_BLK_DEV_ZONED
+		if (req->opcode == IORING_OP_ZONE_APPEND ||
+				req->opcode == IORING_OP_ZONE_APPENDV ||
+				req->opcode == IORING_OP_ZONE_APPEND_FIXED)
+			kiocb->ki_flags |= IOCB_ZONE_APPEND;
+#endif
+
 		kiocb->ki_flags |= IOCB_WRITE;
 
 		if (!force_nonblock)
@@ -4906,6 +4957,12 @@  static int io_req_defer_prep(struct io_kiocb *req,
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
 	case IORING_OP_WRITE:
+#ifdef CONFIG_BLK_DEV_ZONED
+	fallthrough;
+	case IORING_OP_ZONE_APPEND:
+	case IORING_OP_ZONE_APPENDV:
+	case IORING_OP_ZONE_APPEND_FIXED:
+#endif
 		ret = io_write_prep(req, sqe, true);
 		break;
 	case IORING_OP_POLL_ADD:
@@ -5038,6 +5095,12 @@  static void io_cleanup_req(struct io_kiocb *req)
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
 	case IORING_OP_WRITE:
+#ifdef CONFIG_BLK_DEV_ZONED
+	fallthrough;
+	case IORING_OP_ZONE_APPEND:
+	case IORING_OP_ZONE_APPENDV:
+	case IORING_OP_ZONE_APPEND_FIXED:
+#endif
 		if (io->rw.iov != io->rw.fast_iov)
 			kfree(io->rw.iov);
 		break;
@@ -5086,6 +5149,11 @@  static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		}
 		ret = io_read(req, force_nonblock);
 		break;
+#ifdef CONFIG_BLK_DEV_ZONED
+	case IORING_OP_ZONE_APPEND:
+	case IORING_OP_ZONE_APPENDV:
+	case IORING_OP_ZONE_APPEND_FIXED:
+#endif
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
 	case IORING_OP_WRITE:
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c2269..6c8e932 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -130,6 +130,9 @@  enum {
 	IORING_OP_PROVIDE_BUFFERS,
 	IORING_OP_REMOVE_BUFFERS,
 	IORING_OP_TEE,
+	IORING_OP_ZONE_APPEND,
+	IORING_OP_ZONE_APPENDV,
+	IORING_OP_ZONE_APPEND_FIXED,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -157,7 +160,10 @@  enum {
 struct io_uring_cqe {
 	__u64	user_data;	/* sqe->data submission passed back */
 	__s32	res;		/* result code for this event */
-	__u32	flags;
+	union {
+		__u32	res2; /* res2 like aio, currently used for zone-append */
+		__u32	flags;
+	};
 };
 
 /*