[v4,4/6] block: add zone append handling for direct I/O path

Message ID 1595605762-17010-5-git-send-email-joshi.k@samsung.com
State New, archived
Series zone-append support in io-uring and aio

Commit Message

Kanchan Joshi July 24, 2020, 3:49 p.m. UTC
For zoned block devices, opt in to zone-append by setting
FMODE_ZONE_APPEND during open. Make the direct IO submission path use
IOCB_ZONE_APPEND to send bios with the append op. Make direct IO
completion return the written offset, in bytes, to the upper layer via
ret2 of the kiocb->ki_complete interface.
Writes with the flag IOCB_ZONE_APPEND are ensured not to be short.
Prevent short writes and instead return a failure if an appending write
spans beyond the end of the device.
Return a failure if the write is larger than max_append_limit and would
therefore require multiple bios.
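
For illustration only (not part of this patch): assuming the 64-bit
ret2 ki_complete interface added earlier in this series, an in-kernel
consumer could pick up the append location as sketched below. All names
here are hypothetical.

/* Hypothetical completion callback, for illustration only. */
struct zap_ctx {
	struct kiocb iocb;
	u64 written_offset;	/* where the append landed, in bytes */
};

static void zap_complete(struct kiocb *iocb, long ret, long long ret2)
{
	struct zap_ctx *ctx = container_of(iocb, struct zap_ctx, iocb);

	/*
	 * On success, ret2 carries the device offset, in bytes, at
	 * which the zone-append write actually landed.
	 */
	if (ret >= 0)
		ctx->written_offset = ret2;
}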

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Selvakumar S <selvakuma.s1@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Arnav Dawn <a.dawn@samsung.com>
Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
---
 fs/block_dev.c | 51 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 10 deletions(-)

Comments

Christoph Hellwig July 26, 2020, 3:19 p.m. UTC | #1
On Fri, Jul 24, 2020 at 09:19:20PM +0530, Kanchan Joshi wrote:
> For zoned block devices, opt in to zone-append by setting
> FMODE_ZONE_APPEND during open. Make the direct IO submission path use
> IOCB_ZONE_APPEND to send bios with the append op. Make direct IO
> completion return the written offset, in bytes, to the upper layer via
> ret2 of the kiocb->ki_complete interface.
> Writes with the flag IOCB_ZONE_APPEND are ensured not to be short.
> Prevent short writes and instead return a failure if an appending write
> spans beyond the end of the device.
> Return a failure if the write is larger than max_append_limit and would
> therefore require multiple bios.

We should support reporting the append offset for all block devices
and all file systems supported by iomap at least.  There is nothing that
requires actual zone append support here.

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47860e5..3b5836b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -178,10 +178,19 @@  static struct inode *bdev_file_inode(struct file *file)
 	return file->f_mapping->host;
 }
 
-static unsigned int dio_bio_write_op(struct kiocb *iocb)
+static unsigned int dio_bio_op(bool is_read, struct kiocb *iocb)
 {
-	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+	unsigned int op;
 
+	if (is_read)
+		return REQ_OP_READ;
+
+	if (iocb->ki_flags & IOCB_ZONE_APPEND)
+		op = REQ_OP_ZONE_APPEND;
+	else
+		op = REQ_OP_WRITE;
+
+	op |= REQ_SYNC | REQ_IDLE;
 	/* avoid the need for a I/O completion work item */
 	if (iocb->ki_flags & IOCB_DSYNC)
 		op |= REQ_FUA;
@@ -207,6 +216,7 @@  __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
+	bool is_read = (iov_iter_rw(iter) == READ);
 	struct bio bio;
 	ssize_t ret;
 	blk_qc_t qc;
@@ -231,18 +241,17 @@  __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	bio.bi_private = current;
 	bio.bi_end_io = blkdev_bio_end_io_simple;
 	bio.bi_ioprio = iocb->ki_ioprio;
+	bio.bi_opf = dio_bio_op(is_read, iocb);
 
 	ret = bio_iov_iter_get_pages(&bio, iter);
 	if (unlikely(ret))
 		goto out;
 	ret = bio.bi_iter.bi_size;
 
-	if (iov_iter_rw(iter) == READ) {
-		bio.bi_opf = REQ_OP_READ;
+	if (is_read) {
 		if (iter_is_iovec(iter))
 			should_dirty = true;
 	} else {
-		bio.bi_opf = dio_bio_write_op(iocb);
 		task_io_account_write(ret);
 	}
 	if (iocb->ki_flags & IOCB_HIPRI)
@@ -295,6 +304,14 @@  static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
 	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
 }
 
+static inline long long blkdev_bio_ret2(struct kiocb *iocb, struct bio *bio)
+{
+	/* return written-offset for zone append in bytes */
+	if (op_is_write(bio_op(bio)) && iocb->ki_flags & IOCB_ZONE_APPEND)
+		return bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	return 0;
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
 	struct blkdev_dio *dio = bio->bi_private;
@@ -307,15 +324,17 @@  static void blkdev_bio_end_io(struct bio *bio)
 		if (!dio->is_sync) {
 			struct kiocb *iocb = dio->iocb;
 			ssize_t ret;
+			long long ret2 = 0;
 
 			if (likely(!dio->bio.bi_status)) {
 				ret = dio->size;
 				iocb->ki_pos += ret;
+				ret2 = blkdev_bio_ret2(iocb, bio);
 			} else {
 				ret = blk_status_to_errno(dio->bio.bi_status);
 			}
 
-			dio->iocb->ki_complete(iocb, ret, 0);
+			dio->iocb->ki_complete(iocb, ret, ret2);
 			if (dio->multi_bio)
 				bio_put(&dio->bio);
 		} else {
@@ -382,6 +401,7 @@  __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		bio->bi_private = dio;
 		bio->bi_end_io = blkdev_bio_end_io;
 		bio->bi_ioprio = iocb->ki_ioprio;
+		bio->bi_opf = dio_bio_op(is_read, iocb);
 
 		ret = bio_iov_iter_get_pages(bio, iter);
 		if (unlikely(ret)) {
@@ -391,11 +411,9 @@  __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		}
 
 		if (is_read) {
-			bio->bi_opf = REQ_OP_READ;
 			if (dio->should_dirty)
 				bio_set_pages_dirty(bio);
 		} else {
-			bio->bi_opf = dio_bio_write_op(iocb);
 			task_io_account_write(bio->bi_iter.bi_size);
 		}
 
@@ -419,6 +437,12 @@  __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		}
 
 		if (!dio->multi_bio) {
+			/* zone append cannot work with multiple bios */
+			if (!is_read && iocb->ki_flags & IOCB_ZONE_APPEND) {
+				bio->bi_status = BLK_STS_IOERR;
+				bio_endio(bio);
+				break;
+			}
 			/*
 			 * AIO needs an extra reference to ensure the dio
 			 * structure which is embedded into the first bio
@@ -1841,6 +1865,7 @@  EXPORT_SYMBOL(blkdev_get_by_dev);
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
 	struct block_device *bdev;
+	int ret;
 
 	/*
 	 * Preserve backwards compatibility and allow large file access
@@ -1866,7 +1891,11 @@  static int blkdev_open(struct inode * inode, struct file * filp)
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
 
-	return blkdev_get(bdev, filp->f_mode, filp);
+	ret = blkdev_get(bdev, filp->f_mode, filp);
+	if (!ret && blk_queue_is_zoned(bdev->bd_disk->queue))
+		filp->f_mode |= FMODE_ZONE_APPEND;
+
+	return ret;
 }
 
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -2017,7 +2046,9 @@  ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
 		return -EOPNOTSUPP;
 
-	iov_iter_truncate(from, size - iocb->ki_pos);
+	if (iov_iter_truncate(from, size - iocb->ki_pos) &&
+			(iocb->ki_flags & IOCB_ZONE_APPEND))
+		return -ENOSPC;
 
 	blk_start_plug(&plug);
 	ret = __generic_file_write_iter(iocb, from);
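
For reference (illustration only, not part of the patch): since the
direct I/O path above rejects appending writes that would need more
than one bio, callers must keep each append within the queue's limit,
which zoned devices expose in sysfs as zone_append_max_bytes. A minimal
userspace sketch of that sizing check follows; the default device name
is just an example.

#include <stdio.h>

/* Read the zone-append size limit, in bytes, for a zoned block device.
 * Appending writes larger than this would require multiple bios and
 * are failed by the patch above with BLK_STS_IOERR.
 */
static long zone_append_max_bytes(const char *disk)
{
	char path[256];
	FILE *f;
	long limit = -1;

	snprintf(path, sizeof(path),
		 "/sys/block/%s/queue/zone_append_max_bytes", disk);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &limit) != 1)
		limit = -1;
	fclose(f);
	return limit;	/* 0 means zone append is not supported */
}

int main(int argc, char **argv)
{
	long limit = zone_append_max_bytes(argc > 1 ? argv[1] : "nvme0n1");

	printf("max zone-append I/O size: %ld bytes\n", limit);
	return limit > 0 ? 0 : 1;
}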