[1/3] fs,block: Introduce IOCB_ZONE_APPEND and direct-io handling
diff mbox series

Message ID 1592414619-5646-2-git-send-email-joshi.k@samsung.com
State New
Headers show
Series
  • zone-append support in aio and io-uring
Related show

Commit Message

Kanchan Joshi June 17, 2020, 5:23 p.m. UTC
From: Selvakumar S <selvakuma.s1@samsung.com>

Introduce IOCB_ZONE_APPEND flag, which is set in kiocb->ki_flags for
zone-append. Direct I/O submission path uses this flag to send bio with
append op. And completion path uses the same to return zone-relative
offset to upper layer.

Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
---
 fs/block_dev.c     | 19 ++++++++++++++++++-
 include/linux/fs.h |  1 +
 2 files changed, 19 insertions(+), 1 deletion(-)

Comments

Pavel Begunkov June 17, 2020, 7:02 p.m. UTC | #1
On 17/06/2020 20:23, Kanchan Joshi wrote:
> From: Selvakumar S <selvakuma.s1@samsung.com>
> 
> Introduce IOCB_ZONE_APPEND flag, which is set in kiocb->ki_flags for
> zone-append. Direct I/O submission path uses this flag to send bio with
> append op. And completion path uses the same to return zone-relative
> offset to upper layer.
> 
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
> ---
>  fs/block_dev.c     | 19 ++++++++++++++++++-
>  include/linux/fs.h |  1 +
>  2 files changed, 19 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 47860e5..4c84b4d0 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>  	/* avoid the need for a I/O completion work item */
>  	if (iocb->ki_flags & IOCB_DSYNC)
>  		op |= REQ_FUA;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
> +		op |= REQ_OP_ZONE_APPEND | REQ_NOMERGE;
> +#endif
>  	return op;
>  }
>  
> @@ -295,6 +299,14 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
>  	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
>  }
>  
> +#ifdef CONFIG_BLK_DEV_ZONED
> +static inline long blkdev_bio_end_io_append(struct bio *bio)
> +{
> +	return (bio->bi_iter.bi_sector %
> +		blk_queue_zone_sectors(bio->bi_disk->queue)) << SECTOR_SHIFT;

IIRC, zone_size is pow2 and bit operations should suffice.
Anyway, this can use a temporary variable.

> +}
> +#endif
> +
>  static void blkdev_bio_end_io(struct bio *bio)
>  {
>  	struct blkdev_dio *dio = bio->bi_private;
> @@ -307,15 +319,20 @@ static void blkdev_bio_end_io(struct bio *bio)
>  		if (!dio->is_sync) {
>  			struct kiocb *iocb = dio->iocb;
>  			ssize_t ret;
> +			long res = 0;
>  
>  			if (likely(!dio->bio.bi_status)) {
>  				ret = dio->size;
>  				iocb->ki_pos += ret;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +				if (iocb->ki_flags & IOCB_ZONE_APPEND)
> +					res = blkdev_bio_end_io_append(bio);
> +#endif
>  			} else {
>  				ret = blk_status_to_errno(dio->bio.bi_status);
>  			}
>  
> -			dio->iocb->ki_complete(iocb, ret, 0);
> +			dio->iocb->ki_complete(iocb, ret, res);
>  			if (dio->multi_bio)
>  				bio_put(&dio->bio);
>  		} else {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 6c4ab4d..dc547b9 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -315,6 +315,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_ZONE_APPEND	(1 << 8)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
>
Damien Le Moal June 18, 2020, 7:16 a.m. UTC | #2
On 2020/06/18 2:27, Kanchan Joshi wrote:
> From: Selvakumar S <selvakuma.s1@samsung.com>
> 
> Introduce IOCB_ZONE_APPEND flag, which is set in kiocb->ki_flags for
> zone-append. Direct I/O submission path uses this flag to send bio with
> append op. And completion path uses the same to return zone-relative
> offset to upper layer.
> 
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
> ---
>  fs/block_dev.c     | 19 ++++++++++++++++++-
>  include/linux/fs.h |  1 +
>  2 files changed, 19 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 47860e5..4c84b4d0 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>  	/* avoid the need for a I/O completion work item */
>  	if (iocb->ki_flags & IOCB_DSYNC)
>  		op |= REQ_FUA;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
> +		op |= REQ_OP_ZONE_APPEND | REQ_NOMERGE;
> +#endif

No need for the #ifdef. And no need for the REQ_NOMERGE either since
REQ_OP_ZONE_APPEND requests are defined as not mergeable already.

>  	return op;
>  }
>  
> @@ -295,6 +299,14 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
>  	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
>  }
>  
> +#ifdef CONFIG_BLK_DEV_ZONED
> +static inline long blkdev_bio_end_io_append(struct bio *bio)
> +{
> +	return (bio->bi_iter.bi_sector %
> +		blk_queue_zone_sectors(bio->bi_disk->queue)) << SECTOR_SHIFT;

A zone size is at most 4G sectors as defined by the queue chunk_sectors limit
(unsigned int). It means that the return value here can overflow due to the
shift, at least on 32bits arch.

And as Pavel already commented, zone sizes are power of 2 so you can use
bitmasks instead of costly divisions.

> +}
> +#endif
> +
>  static void blkdev_bio_end_io(struct bio *bio)
>  {
>  	struct blkdev_dio *dio = bio->bi_private;
> @@ -307,15 +319,20 @@ static void blkdev_bio_end_io(struct bio *bio)
>  		if (!dio->is_sync) {
>  			struct kiocb *iocb = dio->iocb;
>  			ssize_t ret;
> +			long res = 0;
>  
>  			if (likely(!dio->bio.bi_status)) {
>  				ret = dio->size;
>  				iocb->ki_pos += ret;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +				if (iocb->ki_flags & IOCB_ZONE_APPEND)
> +					res = blkdev_bio_end_io_append(bio);

overflow... And no need for the #ifdef.

> +#endif
>  			} else {
>  				ret = blk_status_to_errno(dio->bio.bi_status);
>  			}
>  
> -			dio->iocb->ki_complete(iocb, ret, 0);
> +			dio->iocb->ki_complete(iocb, ret, res);

ki_complete interface also needs to be changed to have a 64bit last argument to
avoid overflow.

And this all does not work anyway because __blkdev_direct_IO() and
__blkdev_direct_IO_simple() both call bio_iov_iter_get_pages() *before*
dio_bio_write_op() is called. This means that bio_iov_iter_get_pages() will not
see that it needs to get the pages for a zone append command and so will not
call __bio_iov_append_get_pages(). That leads to BIO split and potential
corruption of the user data as fragments of the kiocb may get reordered.

There is a lot more to do to the block_dev direct IO code for this to work.


>  			if (dio->multi_bio)
>  				bio_put(&dio->bio);
>  		} else {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 6c4ab4d..dc547b9 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -315,6 +315,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_ZONE_APPEND	(1 << 8)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
>
Kanchan Joshi June 18, 2020, 6:35 p.m. UTC | #3
On Thu, Jun 18, 2020 at 07:16:19AM +0000, Damien Le Moal wrote:
>On 2020/06/18 2:27, Kanchan Joshi wrote:
>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>
>> Introduce IOCB_ZONE_APPEND flag, which is set in kiocb->ki_flags for
>> zone-append. Direct I/O submission path uses this flag to send bio with
>> append op. And completion path uses the same to return zone-relative
>> offset to upper layer.
>>
>> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>> ---
>>  fs/block_dev.c     | 19 ++++++++++++++++++-
>>  include/linux/fs.h |  1 +
>>  2 files changed, 19 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 47860e5..4c84b4d0 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>>  	/* avoid the need for a I/O completion work item */
>>  	if (iocb->ki_flags & IOCB_DSYNC)
>>  		op |= REQ_FUA;
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	if (iocb->ki_flags & IOCB_ZONE_APPEND)
>> +		op |= REQ_OP_ZONE_APPEND | REQ_NOMERGE;
>> +#endif
>
>No need for the #ifdef. And no need for the REQ_NOMERGE either since
>REQ_OP_ZONE_APPEND requests are defined as not mergeable already.
>
>>  	return op;
>>  }
>>
>> @@ -295,6 +299,14 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
>>  	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
>>  }
>>
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +static inline long blkdev_bio_end_io_append(struct bio *bio)
>> +{
>> +	return (bio->bi_iter.bi_sector %
>> +		blk_queue_zone_sectors(bio->bi_disk->queue)) << SECTOR_SHIFT;
>
>A zone size is at most 4G sectors as defined by the queue chunk_sectors limit
>(unsigned int). It means that the return value here can overflow due to the
>shift, at least on 32bits arch.
>
>And as Pavel already commented, zone sizes are power of 2 so you can use
>bitmasks instead of costly divisions.
>
>> +}
>> +#endif
>> +
>>  static void blkdev_bio_end_io(struct bio *bio)
>>  {
>>  	struct blkdev_dio *dio = bio->bi_private;
>> @@ -307,15 +319,20 @@ static void blkdev_bio_end_io(struct bio *bio)
>>  		if (!dio->is_sync) {
>>  			struct kiocb *iocb = dio->iocb;
>>  			ssize_t ret;
>> +			long res = 0;
>>
>>  			if (likely(!dio->bio.bi_status)) {
>>  				ret = dio->size;
>>  				iocb->ki_pos += ret;
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +				if (iocb->ki_flags & IOCB_ZONE_APPEND)
>> +					res = blkdev_bio_end_io_append(bio);
>
>overflow... And no need for the #ifdef.
>
>> +#endif
>>  			} else {
>>  				ret = blk_status_to_errno(dio->bio.bi_status);
>>  			}
>>
>> -			dio->iocb->ki_complete(iocb, ret, 0);
>> +			dio->iocb->ki_complete(iocb, ret, res);
>
>ki_complete interface also needs to be changed to have a 64bit last argument to
>avoid overflow.
>
>And this all does not work anyway because __blkdev_direct_IO() and
>__blkdev_direct_IO_simple() both call bio_iov_iter_get_pages() *before*
>dio_bio_write_op() is called. This means that bio_iov_iter_get_pages() will not
>see that it needs to get the pages for a zone append command and so will not
>call __bio_iov_append_get_pages(). That leads to BIO split and potential
>corruption of the user data as fragments of the kiocb may get reordered.
>
>There is a lot more to do to the block_dev direct IO code for this to work.

Thanks a lot for the great review. Very helpful. We'll fix in V2.

Patch
diff mbox series

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47860e5..4c84b4d0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -185,6 +185,10 @@  static unsigned int dio_bio_write_op(struct kiocb *iocb)
 	/* avoid the need for a I/O completion work item */
 	if (iocb->ki_flags & IOCB_DSYNC)
 		op |= REQ_FUA;
+#ifdef CONFIG_BLK_DEV_ZONED
+	if (iocb->ki_flags & IOCB_ZONE_APPEND)
+		op |= REQ_OP_ZONE_APPEND | REQ_NOMERGE;
+#endif
 	return op;
 }
 
@@ -295,6 +299,14 @@  static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
 	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline long blkdev_bio_end_io_append(struct bio *bio)
+{
+	return (bio->bi_iter.bi_sector %
+		blk_queue_zone_sectors(bio->bi_disk->queue)) << SECTOR_SHIFT;
+}
+#endif
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
 	struct blkdev_dio *dio = bio->bi_private;
@@ -307,15 +319,20 @@  static void blkdev_bio_end_io(struct bio *bio)
 		if (!dio->is_sync) {
 			struct kiocb *iocb = dio->iocb;
 			ssize_t ret;
+			long res = 0;
 
 			if (likely(!dio->bio.bi_status)) {
 				ret = dio->size;
 				iocb->ki_pos += ret;
+#ifdef CONFIG_BLK_DEV_ZONED
+				if (iocb->ki_flags & IOCB_ZONE_APPEND)
+					res = blkdev_bio_end_io_append(bio);
+#endif
 			} else {
 				ret = blk_status_to_errno(dio->bio.bi_status);
 			}
 
-			dio->iocb->ki_complete(iocb, ret, 0);
+			dio->iocb->ki_complete(iocb, ret, res);
 			if (dio->multi_bio)
 				bio_put(&dio->bio);
 		} else {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4d..dc547b9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@  enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_ZONE_APPEND	(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;