[v2,1/2] fs,block: Introduce RWF_ZONE_APPEND and handling in direct IO path
diff mbox series

Message ID 1593105349-19270-2-git-send-email-joshi.k@samsung.com
State New
Headers show
Series
  • zone-append support in io-uring and aio
Related show

Commit Message

Kanchan Joshi June 25, 2020, 5:15 p.m. UTC
Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space
sends this with write. Add IOCB_ZONE_APPEND which is set in
kiocb->ki_flags on receiving RWF_ZONE_APPEND.
Make direct IO submission path use IOCB_ZONE_APPEND to send bio with
append op. Direct IO completion returns zone-relative offset, in sector
unit, to upper layer using kiocb->ki_complete interface.
Report error if zone-append is requested on regular file or on sync
kiocb (i.e. one without ki_complete).

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
---
 fs/block_dev.c          | 28 ++++++++++++++++++++++++----
 include/linux/fs.h      |  9 +++++++++
 include/uapi/linux/fs.h |  5 ++++-
 3 files changed, 37 insertions(+), 5 deletions(-)

Comments

Damien Le Moal June 26, 2020, 2:50 a.m. UTC | #1
On 2020/06/26 2:18, Kanchan Joshi wrote:
> Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space
> sends this with write. Add IOCB_ZONE_APPEND which is set in
> kiocb->ki_flags on receiving RWF_ZONE_APPEND.
> Make direct IO submission path use IOCB_ZONE_APPEND to send bio with
> append op. Direct IO completion returns zone-relative offset, in sector
> unit, to upper layer using kiocb->ki_complete interface.
> Report error if zone-append is requested on regular file or on sync
> kiocb (i.e. one without ki_complete).
> 
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
> ---
>  fs/block_dev.c          | 28 ++++++++++++++++++++++++----
>  include/linux/fs.h      |  9 +++++++++
>  include/uapi/linux/fs.h |  5 ++++-
>  3 files changed, 37 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 47860e5..5180268 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>  	/* avoid the need for a I/O completion work item */
>  	if (iocb->ki_flags & IOCB_DSYNC)
>  		op |= REQ_FUA;
> +
> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
> +		op |= REQ_OP_ZONE_APPEND;

This is wrong. REQ_OP_WRITE is already set in the declaration of "op". How can
this work ?

> +
>  	return op;
>  }
>  
> @@ -295,6 +299,14 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
>  	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
>  }
>  
> +static inline long blkdev_bio_end_io_append(struct bio *bio)
> +{
> +	sector_t zone_sectors = blk_queue_zone_sectors(bio->bi_disk->queue);
> +
> +	/* calculate zone relative offset for zone append */

The name of the function may be better spelling out zone_append instead of just
append. But see next comment.

> +	return bio->bi_iter.bi_sector & (zone_sectors - 1);
> +}
> +
>  static void blkdev_bio_end_io(struct bio *bio)
>  {
>  	struct blkdev_dio *dio = bio->bi_private;
> @@ -307,15 +319,19 @@ static void blkdev_bio_end_io(struct bio *bio)
>  		if (!dio->is_sync) {
>  			struct kiocb *iocb = dio->iocb;
>  			ssize_t ret;
> +			long res2 = 0;
>  
>  			if (likely(!dio->bio.bi_status)) {
>  				ret = dio->size;
>  				iocb->ki_pos += ret;
> +

Blank line not needed.

> +				if (iocb->ki_flags & IOCB_ZONE_APPEND)> +					res2 = blkdev_bio_end_io_append(bio);

The name blkdev_bio_end_io_append() implies a bio end_io callback function,
which is not the case. What about naming this blkdev_bio_res2() and move the if
inside it ?

>  			} else {
>  				ret = blk_status_to_errno(dio->bio.bi_status);

add "res2 = 0;" here and drop the declaration initialization. That will avoid
doing the assignment twice for zone append case.

>  			}
>  
> -			dio->iocb->ki_complete(iocb, ret, 0);
> +			dio->iocb->ki_complete(iocb, ret, res2);
>  			if (dio->multi_bio)
>  				bio_put(&dio->bio);
>  		} else {
> @@ -382,6 +398,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  		bio->bi_private = dio;
>  		bio->bi_end_io = blkdev_bio_end_io;
>  		bio->bi_ioprio = iocb->ki_ioprio;
> +		bio->bi_opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);

Personally, I would prefer a plain "if () else". Or even better: change
dio_bio_write_op() into dio_bio_op() and just have:

		bio->bi_opf = dio_bio_op(iocb);

>  
>  		ret = bio_iov_iter_get_pages(bio, iter);
>  		if (unlikely(ret)) {
> @@ -391,11 +408,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  		}
>  
>  		if (is_read) {
> -			bio->bi_opf = REQ_OP_READ;
>  			if (dio->should_dirty)
>  				bio_set_pages_dirty(bio);
>  		} else {
> -			bio->bi_opf = dio_bio_write_op(iocb);
>  			task_io_account_write(bio->bi_iter.bi_size);
>  		}
>  
> @@ -465,12 +480,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  static ssize_t
>  blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>  {
> +	bool is_sync = is_sync_kiocb(iocb);
>  	int nr_pages;
>  
> +	/* zone-append is supported only on async-kiocb */
> +	if (is_sync && iocb->ki_flags & IOCB_ZONE_APPEND)
> +		return -EINVAL;
> +
>  	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
>  	if (!nr_pages)
>  		return 0;
> -	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
> +	if (is_sync && nr_pages <= BIO_MAX_PAGES)
>  		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
>  
>  	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 6c4ab4d..3202d9a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -315,6 +315,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_ZONE_APPEND	(1 << 8)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
> @@ -3456,6 +3457,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
>  		ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
>  	if (flags & RWF_APPEND)
>  		ki->ki_flags |= IOCB_APPEND;
> +	if (flags & RWF_ZONE_APPEND) {
> +		/* currently support block device only */
> +		umode_t mode = file_inode(ki->ki_filp)->i_mode;
> +
> +		if (!(S_ISBLK(mode)))
> +			return -EOPNOTSUPP;
> +		ki->ki_flags |= IOCB_ZONE_APPEND;
> +	}
>  	return 0;
>  }
>  
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 379a612..1ce06e9 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -299,8 +299,11 @@ typedef int __bitwise __kernel_rwf_t;
>  /* per-IO O_APPEND */
>  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
>  
> +/* per-IO O_APPEND */
> +#define RWF_ZONE_APPEND	((__force __kernel_rwf_t)0x00000020)
> +
>  /* mask of flags supported by the kernel */
>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> -			 RWF_APPEND)
> +			 RWF_APPEND | RWF_ZONE_APPEND)
>  
>  #endif /* _UAPI_LINUX_FS_H */
>
Christoph Hellwig June 26, 2020, 8:58 a.m. UTC | #2
To restate my previous NAK:

A low-level protocol detail like RWF_ZONE_APPEND has absolutely no
business being exposed in the Linux file system interface.

And as mentioned before I think the idea of returning the actual
position written for O_APPEND writes totally makes sense, and actually
is generalizable to all files.  Together with zonefs that gives you a
perfect interface for zone append.

On Thu, Jun 25, 2020 at 10:45:48PM +0530, Kanchan Joshi wrote:
> Introduce RWF_ZONE_APPEND flag to represent zone-append.

And no one but us select few even know what zone append is, nevermind
what the detailed semantics are.  If you add a userspace API you need
to very clearly document the semantics inluding errors and corner cases.
Kanchan Joshi June 26, 2020, 9:15 p.m. UTC | #3
On Fri, Jun 26, 2020 at 09:58:46AM +0100, Christoph Hellwig wrote:
>To restate my previous NAK:
>
>A low-level protocol detail like RWF_ZONE_APPEND has absolutely no
>business being exposed in the Linux file system interface.
>
>And as mentioned before I think the idea of returning the actual
>position written for O_APPEND writes totally makes sense, and actually
>is generalizable to all files.  Together with zonefs that gives you a
>perfect interface for zone append.
>
>On Thu, Jun 25, 2020 at 10:45:48PM +0530, Kanchan Joshi wrote:
>> Introduce RWF_ZONE_APPEND flag to represent zone-append.
>
>And no one but us select few even know what zone append is, nevermind
>what the detailed semantics are.  If you add a userspace API you need
>to very clearly document the semantics inluding errors and corner cases.

For block IO path (which is the scope of this patchset) there is no
probelm in using RWF_APPEND for zone-append, because it does not do
anything for block device. We can use that, avoiding introduction of
RWF_ZONE_APPEND in user-space.

In kernel, will it be fine to keep IOCB_ZONE_APPEND apart from
IOCB_APPEND? Reason being, this can help to isolate the code meant only
for zone-append from the one that is already present for conventional
append.

Snippet from quick reference -

static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
        ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
        if (flags & RWF_APPEND)
                ki->ki_flags |= IOCB_APPEND;
+       if (flags & RWF_ZONE_APPEND) {
+               /* currently support block device only */
+               umode_t mode = file_inode(ki->ki_filp)->i_mode;
+
+               if (!(S_ISBLK(mode)))
+                       return -EOPNOTSUPP;
+               ki->ki_flags |= IOCB_ZONE_APPEND;
+       }


As for file I/O in future, I see a potential problem with RWF_APPEND.
In io_uring, zone-append requires bit of pre/post processing, which
ideally should be done only for zone-append case. A ZoneFS file using
RWF_APPEND as a mean to invoke zone-append vs a regular file (hosted on
some other FS) requiring conventional RWF_APPEND - both will execute
that processing.
Is there a good way to differentiate ZoneFS file from another file which
only wants use conventional file-append?
Christoph Hellwig June 27, 2020, 6:51 a.m. UTC | #4
On Sat, Jun 27, 2020 at 02:45:14AM +0530, Kanchan Joshi wrote:
> For block IO path (which is the scope of this patchset) there is no
> probelm in using RWF_APPEND for zone-append, because it does not do
> anything for block device. We can use that, avoiding introduction of
> RWF_ZONE_APPEND in user-space.

No, you are not just touching the block I/O path.  This is all over the
general file code, and all RWF_* flag are about file I/O.
Kanchan Joshi June 29, 2020, 6:32 p.m. UTC | #5
On Fri, Jun 26, 2020 at 02:50:20AM +0000, Damien Le Moal wrote:
>On 2020/06/26 2:18, Kanchan Joshi wrote:
>> Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space
>> sends this with write. Add IOCB_ZONE_APPEND which is set in
>> kiocb->ki_flags on receiving RWF_ZONE_APPEND.
>> Make direct IO submission path use IOCB_ZONE_APPEND to send bio with
>> append op. Direct IO completion returns zone-relative offset, in sector
>> unit, to upper layer using kiocb->ki_complete interface.
>> Report error if zone-append is requested on regular file or on sync
>> kiocb (i.e. one without ki_complete).
>>
>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>> Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>> ---
>>  fs/block_dev.c          | 28 ++++++++++++++++++++++++----
>>  include/linux/fs.h      |  9 +++++++++
>>  include/uapi/linux/fs.h |  5 ++++-
>>  3 files changed, 37 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 47860e5..5180268 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>>  	/* avoid the need for a I/O completion work item */
>>  	if (iocb->ki_flags & IOCB_DSYNC)
>>  		op |= REQ_FUA;
>> +
>> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
>> +		op |= REQ_OP_ZONE_APPEND;
>
>This is wrong. REQ_OP_WRITE is already set in the declaration of "op". How can
>this work ?
REQ_OP_ZONE_APPEND will override the REQ_WRITE op, while previously set op
flags (REQ_FUA etc.) will be retained. But yes, this can be made to look
cleaner.
V3 will include the other changes you pointed out. Thanks for the review.
Damien Le Moal June 30, 2020, 12:37 a.m. UTC | #6
On 2020/06/30 3:35, Kanchan Joshi wrote:
> On Fri, Jun 26, 2020 at 02:50:20AM +0000, Damien Le Moal wrote:
>> On 2020/06/26 2:18, Kanchan Joshi wrote:
>>> Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space
>>> sends this with write. Add IOCB_ZONE_APPEND which is set in
>>> kiocb->ki_flags on receiving RWF_ZONE_APPEND.
>>> Make direct IO submission path use IOCB_ZONE_APPEND to send bio with
>>> append op. Direct IO completion returns zone-relative offset, in sector
>>> unit, to upper layer using kiocb->ki_complete interface.
>>> Report error if zone-append is requested on regular file or on sync
>>> kiocb (i.e. one without ki_complete).
>>>
>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>> Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>> ---
>>>  fs/block_dev.c          | 28 ++++++++++++++++++++++++----
>>>  include/linux/fs.h      |  9 +++++++++
>>>  include/uapi/linux/fs.h |  5 ++++-
>>>  3 files changed, 37 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>> index 47860e5..5180268 100644
>>> --- a/fs/block_dev.c
>>> +++ b/fs/block_dev.c
>>> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>>>  	/* avoid the need for a I/O completion work item */
>>>  	if (iocb->ki_flags & IOCB_DSYNC)
>>>  		op |= REQ_FUA;
>>> +
>>> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
>>> +		op |= REQ_OP_ZONE_APPEND;
>>
>> This is wrong. REQ_OP_WRITE is already set in the declaration of "op". How can
>> this work ?
> REQ_OP_ZONE_APPEND will override the REQ_WRITE op, while previously set op
> flags (REQ_FUA etc.) will be retained. But yes, this can be made to look
> cleaner.
> V3 will include the other changes you pointed out. Thanks for the review.
> 

REQ_OP_WRITE and REQ_OP_ZONE_APPEND are different bits, so there is no
"override". A well formed BIO bi_opf is one op+flags. Specifying multiple OP
codes does not make sense.
Kanchan Joshi June 30, 2020, 7:40 a.m. UTC | #7
On Tue, Jun 30, 2020 at 12:37:07AM +0000, Damien Le Moal wrote:
>On 2020/06/30 3:35, Kanchan Joshi wrote:
>> On Fri, Jun 26, 2020 at 02:50:20AM +0000, Damien Le Moal wrote:
>>> On 2020/06/26 2:18, Kanchan Joshi wrote:
>>>> Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space
>>>> sends this with write. Add IOCB_ZONE_APPEND which is set in
>>>> kiocb->ki_flags on receiving RWF_ZONE_APPEND.
>>>> Make direct IO submission path use IOCB_ZONE_APPEND to send bio with
>>>> append op. Direct IO completion returns zone-relative offset, in sector
>>>> unit, to upper layer using kiocb->ki_complete interface.
>>>> Report error if zone-append is requested on regular file or on sync
>>>> kiocb (i.e. one without ki_complete).
>>>>
>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>> Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>> ---
>>>>  fs/block_dev.c          | 28 ++++++++++++++++++++++++----
>>>>  include/linux/fs.h      |  9 +++++++++
>>>>  include/uapi/linux/fs.h |  5 ++++-
>>>>  3 files changed, 37 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>>> index 47860e5..5180268 100644
>>>> --- a/fs/block_dev.c
>>>> +++ b/fs/block_dev.c
>>>> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>>>>  	/* avoid the need for a I/O completion work item */
>>>>  	if (iocb->ki_flags & IOCB_DSYNC)
>>>>  		op |= REQ_FUA;
>>>> +
>>>> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
>>>> +		op |= REQ_OP_ZONE_APPEND;
>>>
>>> This is wrong. REQ_OP_WRITE is already set in the declaration of "op". How can
>>> this work ?
>> REQ_OP_ZONE_APPEND will override the REQ_WRITE op, while previously set op
>> flags (REQ_FUA etc.) will be retained. But yes, this can be made to look
>> cleaner.
>> V3 will include the other changes you pointed out. Thanks for the review.
>>
>
>REQ_OP_WRITE and REQ_OP_ZONE_APPEND are different bits, so there is no
>"override". A well formed BIO bi_opf is one op+flags. Specifying multiple OP
>codes does not make sense.

one op+flags behavior is retained here. OP is not about bits (op flags are).
Had it been, REQ_OP_WRITE (value 1) can not be differentiated from
REQ_OP_ZONE_APPEND (value 13).
We do not do "bio_op(bio) & REQ_OP_WRITE", rather we look at the
absolute value "bio_op(bio) == REQ_OP_WRITE".
Damien Le Moal June 30, 2020, 7:52 a.m. UTC | #8
On 2020/06/30 16:43, Kanchan Joshi wrote:
> On Tue, Jun 30, 2020 at 12:37:07AM +0000, Damien Le Moal wrote:
>> On 2020/06/30 3:35, Kanchan Joshi wrote:
>>> On Fri, Jun 26, 2020 at 02:50:20AM +0000, Damien Le Moal wrote:
>>>> On 2020/06/26 2:18, Kanchan Joshi wrote:
>>>>> Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space
>>>>> sends this with write. Add IOCB_ZONE_APPEND which is set in
>>>>> kiocb->ki_flags on receiving RWF_ZONE_APPEND.
>>>>> Make direct IO submission path use IOCB_ZONE_APPEND to send bio with
>>>>> append op. Direct IO completion returns zone-relative offset, in sector
>>>>> unit, to upper layer using kiocb->ki_complete interface.
>>>>> Report error if zone-append is requested on regular file or on sync
>>>>> kiocb (i.e. one without ki_complete).
>>>>>
>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>> Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>> ---
>>>>>  fs/block_dev.c          | 28 ++++++++++++++++++++++++----
>>>>>  include/linux/fs.h      |  9 +++++++++
>>>>>  include/uapi/linux/fs.h |  5 ++++-
>>>>>  3 files changed, 37 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>>>> index 47860e5..5180268 100644
>>>>> --- a/fs/block_dev.c
>>>>> +++ b/fs/block_dev.c
>>>>> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>>>>>  	/* avoid the need for a I/O completion work item */
>>>>>  	if (iocb->ki_flags & IOCB_DSYNC)
>>>>>  		op |= REQ_FUA;
>>>>> +
>>>>> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
>>>>> +		op |= REQ_OP_ZONE_APPEND;
>>>>
>>>> This is wrong. REQ_OP_WRITE is already set in the declaration of "op". How can
>>>> this work ?
>>> REQ_OP_ZONE_APPEND will override the REQ_WRITE op, while previously set op
>>> flags (REQ_FUA etc.) will be retained. But yes, this can be made to look
>>> cleaner.
>>> V3 will include the other changes you pointed out. Thanks for the review.
>>>
>>
>> REQ_OP_WRITE and REQ_OP_ZONE_APPEND are different bits, so there is no
>> "override". A well formed BIO bi_opf is one op+flags. Specifying multiple OP
>> codes does not make sense.
> 
> one op+flags behavior is retained here. OP is not about bits (op flags are).
> Had it been, REQ_OP_WRITE (value 1) can not be differentiated from
> REQ_OP_ZONE_APPEND (value 13).
> We do not do "bio_op(bio) & REQ_OP_WRITE", rather we look at the
> absolute value "bio_op(bio) == REQ_OP_WRITE".

Sure, the ops are not bits like the flags, but (excluding the flags) doing:

op |= REQ_OP_ZONE_APPEND;

will give you op == (REQ_OP_WRITE | REQ_OP_ZONE_APPEND). That's not what you want...
Damien Le Moal June 30, 2020, 7:56 a.m. UTC | #9
On 2020/06/30 16:53, Damien Le Moal wrote:
> On 2020/06/30 16:43, Kanchan Joshi wrote:
>> On Tue, Jun 30, 2020 at 12:37:07AM +0000, Damien Le Moal wrote:
>>> On 2020/06/30 3:35, Kanchan Joshi wrote:
>>>> On Fri, Jun 26, 2020 at 02:50:20AM +0000, Damien Le Moal wrote:
>>>>> On 2020/06/26 2:18, Kanchan Joshi wrote:
>>>>>> Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space
>>>>>> sends this with write. Add IOCB_ZONE_APPEND which is set in
>>>>>> kiocb->ki_flags on receiving RWF_ZONE_APPEND.
>>>>>> Make direct IO submission path use IOCB_ZONE_APPEND to send bio with
>>>>>> append op. Direct IO completion returns zone-relative offset, in sector
>>>>>> unit, to upper layer using kiocb->ki_complete interface.
>>>>>> Report error if zone-append is requested on regular file or on sync
>>>>>> kiocb (i.e. one without ki_complete).
>>>>>>
>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>> Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>> ---
>>>>>>  fs/block_dev.c          | 28 ++++++++++++++++++++++++----
>>>>>>  include/linux/fs.h      |  9 +++++++++
>>>>>>  include/uapi/linux/fs.h |  5 ++++-
>>>>>>  3 files changed, 37 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>>>>> index 47860e5..5180268 100644
>>>>>> --- a/fs/block_dev.c
>>>>>> +++ b/fs/block_dev.c
>>>>>> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>>>>>>  	/* avoid the need for a I/O completion work item */
>>>>>>  	if (iocb->ki_flags & IOCB_DSYNC)
>>>>>>  		op |= REQ_FUA;
>>>>>> +
>>>>>> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
>>>>>> +		op |= REQ_OP_ZONE_APPEND;
>>>>>
>>>>> This is wrong. REQ_OP_WRITE is already set in the declaration of "op". How can
>>>>> this work ?
>>>> REQ_OP_ZONE_APPEND will override the REQ_WRITE op, while previously set op
>>>> flags (REQ_FUA etc.) will be retained. But yes, this can be made to look
>>>> cleaner.
>>>> V3 will include the other changes you pointed out. Thanks for the review.
>>>>
>>>
>>> REQ_OP_WRITE and REQ_OP_ZONE_APPEND are different bits, so there is no
>>> "override". A well formed BIO bi_opf is one op+flags. Specifying multiple OP
>>> codes does not make sense.
>>
>> one op+flags behavior is retained here. OP is not about bits (op flags are).
>> Had it been, REQ_OP_WRITE (value 1) can not be differentiated from
>> REQ_OP_ZONE_APPEND (value 13).
>> We do not do "bio_op(bio) & REQ_OP_WRITE", rather we look at the
>> absolute value "bio_op(bio) == REQ_OP_WRITE".
> 
> Sure, the ops are not bits like the flags, but (excluding the flags) doing:
> 
> op |= REQ_OP_ZONE_APPEND;
> 
> will give you op == (REQ_OP_WRITE | REQ_OP_ZONE_APPEND). That's not what you want...

And yes, REQ_OP_WRITE | REQ_OP_ZONE_APPEND == REQ_OP_ZONE_APPEND... But still
not a reason for not setting the op correctly :)
Kanchan Joshi June 30, 2020, 8:16 a.m. UTC | #10
On Tue, Jun 30, 2020 at 07:56:46AM +0000, Damien Le Moal wrote:
>On 2020/06/30 16:53, Damien Le Moal wrote:
>> On 2020/06/30 16:43, Kanchan Joshi wrote:
>>> On Tue, Jun 30, 2020 at 12:37:07AM +0000, Damien Le Moal wrote:
>>>> On 2020/06/30 3:35, Kanchan Joshi wrote:
>>>>> On Fri, Jun 26, 2020 at 02:50:20AM +0000, Damien Le Moal wrote:
>>>>>> On 2020/06/26 2:18, Kanchan Joshi wrote:
>>>>>>> Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space
>>>>>>> sends this with write. Add IOCB_ZONE_APPEND which is set in
>>>>>>> kiocb->ki_flags on receiving RWF_ZONE_APPEND.
>>>>>>> Make direct IO submission path use IOCB_ZONE_APPEND to send bio with
>>>>>>> append op. Direct IO completion returns zone-relative offset, in sector
>>>>>>> unit, to upper layer using kiocb->ki_complete interface.
>>>>>>> Report error if zone-append is requested on regular file or on sync
>>>>>>> kiocb (i.e. one without ki_complete).
>>>>>>>
>>>>>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>>>>>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>>>>>>> Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
>>>>>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>>>>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>>>>>> ---
>>>>>>>  fs/block_dev.c          | 28 ++++++++++++++++++++++++----
>>>>>>>  include/linux/fs.h      |  9 +++++++++
>>>>>>>  include/uapi/linux/fs.h |  5 ++++-
>>>>>>>  3 files changed, 37 insertions(+), 5 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>>>>>> index 47860e5..5180268 100644
>>>>>>> --- a/fs/block_dev.c
>>>>>>> +++ b/fs/block_dev.c
>>>>>>> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>>>>>>>  	/* avoid the need for a I/O completion work item */
>>>>>>>  	if (iocb->ki_flags & IOCB_DSYNC)
>>>>>>>  		op |= REQ_FUA;
>>>>>>> +
>>>>>>> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
>>>>>>> +		op |= REQ_OP_ZONE_APPEND;
>>>>>>
>>>>>> This is wrong. REQ_OP_WRITE is already set in the declaration of "op". How can
>>>>>> this work ?
>>>>> REQ_OP_ZONE_APPEND will override the REQ_WRITE op, while previously set op
>>>>> flags (REQ_FUA etc.) will be retained. But yes, this can be made to look
>>>>> cleaner.
>>>>> V3 will include the other changes you pointed out. Thanks for the review.
>>>>>
>>>>
>>>> REQ_OP_WRITE and REQ_OP_ZONE_APPEND are different bits, so there is no
>>>> "override". A well formed BIO bi_opf is one op+flags. Specifying multiple OP
>>>> codes does not make sense.
>>>
>>> one op+flags behavior is retained here. OP is not about bits (op flags are).
>>> Had it been, REQ_OP_WRITE (value 1) can not be differentiated from
>>> REQ_OP_ZONE_APPEND (value 13).
>>> We do not do "bio_op(bio) & REQ_OP_WRITE", rather we look at the
>>> absolute value "bio_op(bio) == REQ_OP_WRITE".
>>
>> Sure, the ops are not bits like the flags, but (excluding the flags) doing:
>>
>> op |= REQ_OP_ZONE_APPEND;
>>
>> will give you op == (REQ_OP_WRITE | REQ_OP_ZONE_APPEND). That's not what you want...
>
>And yes, REQ_OP_WRITE | REQ_OP_ZONE_APPEND == REQ_OP_ZONE_APPEND... But still
>not a reason for not setting the op correctly :)
Right, this is what op override was about, and intent was to minimize
the changes in the existing function. As said before, completely agree about
changing, code should not draw suspicion.

Patch
diff mbox series

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47860e5..5180268 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -185,6 +185,10 @@  static unsigned int dio_bio_write_op(struct kiocb *iocb)
 	/* avoid the need for a I/O completion work item */
 	if (iocb->ki_flags & IOCB_DSYNC)
 		op |= REQ_FUA;
+
+	if (iocb->ki_flags & IOCB_ZONE_APPEND)
+		op |= REQ_OP_ZONE_APPEND;
+
 	return op;
 }
 
@@ -295,6 +299,14 @@  static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
 	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
 }
 
+static inline long blkdev_bio_end_io_append(struct bio *bio)
+{
+	sector_t zone_sectors = blk_queue_zone_sectors(bio->bi_disk->queue);
+
+	/* calculate zone relative offset for zone append */
+	return bio->bi_iter.bi_sector & (zone_sectors - 1);
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
 	struct blkdev_dio *dio = bio->bi_private;
@@ -307,15 +319,19 @@  static void blkdev_bio_end_io(struct bio *bio)
 		if (!dio->is_sync) {
 			struct kiocb *iocb = dio->iocb;
 			ssize_t ret;
+			long res2 = 0;
 
 			if (likely(!dio->bio.bi_status)) {
 				ret = dio->size;
 				iocb->ki_pos += ret;
+
+				if (iocb->ki_flags & IOCB_ZONE_APPEND)
+					res2 = blkdev_bio_end_io_append(bio);
 			} else {
 				ret = blk_status_to_errno(dio->bio.bi_status);
 			}
 
-			dio->iocb->ki_complete(iocb, ret, 0);
+			dio->iocb->ki_complete(iocb, ret, res2);
 			if (dio->multi_bio)
 				bio_put(&dio->bio);
 		} else {
@@ -382,6 +398,7 @@  __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		bio->bi_private = dio;
 		bio->bi_end_io = blkdev_bio_end_io;
 		bio->bi_ioprio = iocb->ki_ioprio;
+		bio->bi_opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 
 		ret = bio_iov_iter_get_pages(bio, iter);
 		if (unlikely(ret)) {
@@ -391,11 +408,9 @@  __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		}
 
 		if (is_read) {
-			bio->bi_opf = REQ_OP_READ;
 			if (dio->should_dirty)
 				bio_set_pages_dirty(bio);
 		} else {
-			bio->bi_opf = dio_bio_write_op(iocb);
 			task_io_account_write(bio->bi_iter.bi_size);
 		}
 
@@ -465,12 +480,17 @@  __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
+	bool is_sync = is_sync_kiocb(iocb);
 	int nr_pages;
 
+	/* zone-append is supported only on async-kiocb */
+	if (is_sync && iocb->ki_flags & IOCB_ZONE_APPEND)
+		return -EINVAL;
+
 	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
 	if (!nr_pages)
 		return 0;
-	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
+	if (is_sync && nr_pages <= BIO_MAX_PAGES)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
 
 	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4d..3202d9a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@  enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_ZONE_APPEND	(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -3456,6 +3457,14 @@  static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 		ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
 	if (flags & RWF_APPEND)
 		ki->ki_flags |= IOCB_APPEND;
+	if (flags & RWF_ZONE_APPEND) {
+		/* currently support block device only */
+		umode_t mode = file_inode(ki->ki_filp)->i_mode;
+
+		if (!(S_ISBLK(mode)))
+			return -EOPNOTSUPP;
+		ki->ki_flags |= IOCB_ZONE_APPEND;
+	}
 	return 0;
 }
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 379a612..1ce06e9 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -299,8 +299,11 @@  typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* per-IO O_APPEND */
+#define RWF_ZONE_APPEND	((__force __kernel_rwf_t)0x00000020)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_ZONE_APPEND)
 
 #endif /* _UAPI_LINUX_FS_H */