
[8/8] iomap: Add writethrough for O_SYNC

Message ID 20210811024647.3067739-9-willy@infradead.org (mailing list archive)
State New, archived
Series iomap writethrough for O_SYNC writes

Commit Message

Matthew Wilcox Aug. 11, 2021, 2:46 a.m. UTC
For O_SYNC writes, if the filesystem has already allocated blocks for
the range, we can avoid marking the page as dirty and skip straight to
marking the page as writeback.
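
As an illustration (not part of the patch, and the filename is made up),
the kind of I/O this targets looks roughly like the following userspace
sketch: an O_SYNC overwrite of a range whose blocks the filesystem has
already allocated, where each write() must not return until the data is
stable on disk.

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[512];
		/* Overwrite an existing, already-allocated file region. */
		int fd = open("journal.dat", O_WRONLY | O_SYNC);

		if (fd < 0)
			return 1;
		memset(buf, 'x', sizeof(buf));
		/* Each O_SYNC write waits for the data to reach stable storage. */
		if (pwrite(fd, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf)) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}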

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/iomap/buffered-io.c | 78 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 12 deletions(-)

Comments

Christoph Hellwig Aug. 12, 2021, 1:16 p.m. UTC | #1
On Wed, Aug 11, 2021 at 03:46:47AM +0100, Matthew Wilcox (Oracle) wrote:
> For O_SYNC writes, if the filesystem has already allocated blocks for
> the range, we can avoid marking the page as dirty and skip straight to
> marking the page as writeback.

So this just optimizes O_SYNC overwrites.  How common are those for
buffered I/O?  I know databases use them a lot with direct I/O, but for
buffered I/O this seems like an odd I/O pattern.
Matthew Wilcox Aug. 12, 2021, 1:28 p.m. UTC | #2
On Thu, Aug 12, 2021 at 02:16:03PM +0100, Christoph Hellwig wrote:
> On Wed, Aug 11, 2021 at 03:46:47AM +0100, Matthew Wilcox (Oracle) wrote:
> > For O_SYNC writes, if the filesystem has already allocated blocks for
> > the range, we can avoid marking the page as dirty and skip straight to
> > marking the page as writeback.
> 
> So this just optimizes O_SYNC overwrites.  How common are those for
> buffered I/O?  I know databases use them a lot with direct I/O, but for
> buffered I/O this seems like an odd I/O pattern.

As the comment says:

+       /* Can't allocate blocks here because we don't have ->prepare_ioend */

Give me a way to allocate blocks and it can do better!  I didn't realise
this was going to be a problem when I embarked on this, but attempting
to do IO to wild addresses made me realise that most appending O_SYNC
writes are IOMAP_DELALLOC and so don't have allocated blocks.

And it's not just overwrites.  If you open(O_SYNC|O_TRUNC) and then
write ten bytes at a time, the first write to each block will cause us
to fall back to dirtying the page, but subsequent writes to that block
will write through.
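
To make that scenario concrete (illustrative only, not code from this
series; the filename is invented):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* Truncate and rewrite a file ten bytes at a time, synchronously. */
		int fd = open("log.txt", O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, 0644);
		int i;

		if (fd < 0)
			return 1;
		for (i = 0; i < 100; i++) {
			/*
			 * The first write landing in each block finds no allocated
			 * block (delalloc), so it falls back to dirtying the page;
			 * later writes into that block can go straight to writeback.
			 */
			if (write(fd, "0123456789", 10) != 10)
				break;
		}
		close(fd);
		return 0;
	}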

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index eb068e21d3bb..93b889338172 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -657,8 +657,45 @@  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	return status;
 }
 
-static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
-		size_t copied, struct page *page)
+/* Rearrange file so we don't need this forward declaration */
+static struct iomap_ioend *iomap_add_to_ioend(struct inode *inode,
+		loff_t pos, size_t len, struct page *page,
+		struct iomap_page *iop, struct iomap *iomap,
+		struct iomap_ioend *ioend, struct writeback_control *wbc,
+		struct list_head *iolist);
+
+/* Returns true if we can skip dirtying the page */
+static bool iomap_write_through(struct iomap_write_ctx *iwc,
+		struct iomap *iomap, struct inode *inode, struct page *page,
+		loff_t pos, size_t len)
+{
+	unsigned int blksize = i_blocksize(inode);
+
+	if (!iwc || !iwc->write_through)
+		return false;
+	if (PageDirty(page))
+		return true;
+	if (PageWriteback(page))
+		return false;
+
+	/* Can't allocate blocks here because we don't have ->prepare_ioend */
+	if ((iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_UNWRITTEN) ||
+	    iomap->flags & IOMAP_F_SHARED)
+		return false;
+
+	len = round_up(pos + len, blksize);
+	pos = round_down(pos, blksize);
+	len -= pos;
+	iwc->ioend = iomap_add_to_ioend(inode, pos, len, page,
+			iomap_page_create(inode, page), iomap, iwc->ioend, NULL,
+			&iwc->iolist);
+	set_page_writeback(page);
+	return true;
+}
+
+static size_t __iomap_write_end(struct iomap_write_ctx *iwc,
+		struct iomap *iomap, struct inode *inode, loff_t pos,
+		size_t len, size_t copied, struct page *page)
 {
 	flush_dcache_page(page);
 
@@ -676,7 +713,8 @@  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !PageUptodate(page)))
 		return 0;
 	iomap_set_range_uptodate(page, offset_in_page(pos), len);
-	__set_page_dirty_nobuffers(page);
+	if (!iomap_write_through(iwc, iomap, inode, page, pos, len))
+		__set_page_dirty_nobuffers(page);
 	return copied;
 }
 
@@ -698,9 +736,9 @@  static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
 }
 
 /* Returns the number of bytes copied.  May be 0.  Cannot be an errno. */
-static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len,
-		size_t copied, struct page *page, struct iomap *iomap,
-		struct iomap *srcmap)
+static size_t iomap_write_end(struct iomap_write_ctx *iwc, struct inode *inode,
+		loff_t pos, size_t len, size_t copied, struct page *page,
+		struct iomap *iomap, struct iomap *srcmap)
 {
 	const struct iomap_page_ops *page_ops = iomap->page_ops;
 	loff_t old_size = inode->i_size;
@@ -712,7 +750,8 @@  static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 		ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
 				page, NULL);
 	} else {
-		ret = __iomap_write_end(inode, pos, len, copied, page);
+		ret = __iomap_write_end(iwc, iomap, inode, pos, len, copied,
+				page);
 	}
 
 	/*
@@ -780,8 +819,8 @@  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
 
-		status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
-				srcmap);
+		status = iomap_write_end(iwc, inode, pos, bytes, copied, page,
+				iomap, srcmap);
 
 		if (unlikely(copied != status))
 			iov_iter_revert(i, copied - status);
@@ -808,6 +847,10 @@  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	return written ? written : status;
 }
 
+/* Also rearrange */
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc,
+		struct iomap_ioend *ioend, int error);
+
 ssize_t
 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops)
@@ -817,6 +860,7 @@  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 		.iolist = LIST_HEAD_INIT(iwc.iolist),
 		.write_through = iocb->ki_flags & IOCB_SYNC,
 	};
+	struct iomap_ioend *ioend, *next;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
 
@@ -829,6 +873,15 @@  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 		written += ret;
 	}
 
+	if (ret > 0)
+		ret = 0;
+
+	list_for_each_entry_safe(ioend, next, &iwc.iolist, io_list) {
+		list_del_init(&ioend->io_list);
+		ret = iomap_submit_ioend(NULL, ioend, ret);
+	}
+	if (iwc.ioend)
+		ret = iomap_submit_ioend(NULL, iwc.ioend, ret);
 	return written ? written : ret;
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
@@ -857,8 +910,8 @@  iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		if (unlikely(status))
 			return status;
 
-		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
-				srcmap);
+		status = iomap_write_end(NULL, inode, pos, bytes, bytes, page,
+				iomap, srcmap);
 		if (WARN_ON_ONCE(status == 0))
 			return -EIO;
 
@@ -908,7 +961,8 @@  static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length,
 	zero_user(page, offset, bytes);
 	mark_page_accessed(page);
 
-	return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
+	return iomap_write_end(NULL, inode, pos, bytes, bytes, page, iomap,
+			srcmap);
 }
 
 static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,