[5/5] iomap: support RWF_UNCACHED for buffered writes

Message ID 20191210204304.12266-6-axboe@kernel.dk
State New, archived
Series Support for RWF_UNCACHED

Commit Message

Jens Axboe Dec. 10, 2019, 8:43 p.m. UTC
This adds support for RWF_UNCACHED for file systems using iomap to
perform buffered writes. We use the generic infrastructure for this,
by tracking pages we created and calling write_drop_cached_pages()
to issue writeback and prune those pages.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/iomap/buffered-io.c | 72 +++++++++++++++++++++++++++++++++++-------
 include/linux/iomap.h  |  1 +
 2 files changed, 62 insertions(+), 11 deletions(-)

Comments

Dave Chinner Dec. 11, 2019, 1:14 a.m. UTC | #1
On Tue, Dec 10, 2019 at 01:43:04PM -0700, Jens Axboe wrote:
> This adds support for RWF_UNCACHED for file systems using iomap to
> perform buffered writes. We use the generic infrastructure for this,
> by tracking pages we created and calling write_drop_cached_pages()
> to issue writeback and prune those pages.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
.....
>  static loff_t
>  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
>  {
> +	struct address_space *mapping = inode->i_mapping;
>  	struct iov_iter *i = data;
> +	struct pagevec pvec;
>  	long status = 0;
>  	ssize_t written = 0;
>  
> +	pagevec_init(&pvec);
> +

Ok, so the actor is called after we've already mapped and allocated
an extent of arbitrary length. It may be a delalloc extent, it might
be unwritten, it could be a COW mapping, etc.
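
For anyone not familiar with the iomap machinery, the flow is roughly
like the sketch below (heavily simplified and illustrative only, not
the real fs/iomap code; the typedef name is made up here):

	/* Illustrative callback type matching the actor in this series. */
	typedef loff_t (*iomap_actor_sketch_t)(struct inode *inode,
			loff_t pos, loff_t length, void *data,
			unsigned flags, struct iomap *iomap,
			struct iomap *srcmap);

	static loff_t
	iomap_apply_sketch(struct inode *inode, loff_t pos, loff_t length,
			unsigned flags, const struct iomap_ops *ops,
			void *data, iomap_actor_sketch_t actor)
	{
		struct iomap iomap = { };
		struct iomap srcmap = { };
		loff_t written;
		int ret;

		/* One mapping covering as much of the range as possible. */
		ret = ops->iomap_begin(inode, pos, length, flags, &iomap,
				&srcmap);
		if (ret)
			return ret;

		/*
		 * The actor works page by page inside this single mapping
		 * and never remaps mid-extent, which is why changing the
		 * file's extent layout from inside the actor is dangerous.
		 */
		written = actor(inode, pos, length, data, flags, &iomap,
				&srcmap);

		/* Let the filesystem finish/trim the mapping it handed out. */
		if (ops->iomap_end)
			ops->iomap_end(inode, pos, length,
					written > 0 ? written : 0, flags,
					&iomap);

		return written;
	}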

>  	do {
>  		struct page *page;
>  		unsigned long offset;	/* Offset into pagecache page */
>  		unsigned long bytes;	/* Bytes to write to page */
>  		size_t copied;		/* Bytes copied from user */
> +		bool drop_page = false;	/* drop page after IO */
> +		unsigned lflags = flags;
>  
>  		offset = offset_in_page(pos);
>  		bytes = min_t(unsigned long, PAGE_SIZE - offset,
> @@ -832,10 +851,17 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  			break;
>  		}
>  
> -		status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
> -				srcmap);
> -		if (unlikely(status))
> +retry:
> +		status = iomap_write_begin(inode, pos, bytes, lflags, &page,
> +						iomap, srcmap);
> +		if (unlikely(status)) {
> +			if (status == -ENOMEM && (lflags & IOMAP_UNCACHED)) {
> +				drop_page = true;
> +				lflags &= ~IOMAP_UNCACHED;
> +				goto retry;
> +			}
>  			break;
> +		}
>  
>  		if (mapping_writably_mapped(inode->i_mapping))
>  			flush_dcache_page(page);
> @@ -844,10 +870,16 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  
>  		flush_dcache_page(page);
>  
> +		if (drop_page)
> +			get_page(page);
> +
>  		status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
>  				srcmap);
> -		if (unlikely(status < 0))
> +		if (unlikely(status < 0)) {
> +			if (drop_page)
> +				put_page(page);
>  			break;
> +		}
>  		copied = status;
>  
>  		cond_resched();
> @@ -864,15 +896,29 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  			 */
>  			bytes = min_t(unsigned long, PAGE_SIZE - offset,
>  						iov_iter_single_seg_count(i));
> +			if (drop_page)
> +				put_page(page);
>  			goto again;
>  		}
> +
> +		if (drop_page &&
> +		    ((pos >> PAGE_SHIFT) != ((pos + copied) >> PAGE_SHIFT))) {
> +			if (!pagevec_add(&pvec, page))
> +				write_drop_cached_pages(&pvec, mapping);
> +		} else {
> +			if (drop_page)
> +				put_page(page);
> +			balance_dirty_pages_ratelimited(inode->i_mapping);
> +		}

This looks like it's a problem: this is going to write the
data, which can cause the extent mapping of the file to change
beyond the range that was written (e.g. due to speculative delayed
allocation) and so the iomap we have already cached to direct the
write behaviour may now be stale.

IOWs, to be safe we need to terminate the write loop at this point,
return to iomap_apply() and remap the range we are writing into so
that we don't end up using a stale iomap. That kinda defeats the
purpose of iomap - we are trying to do a single extent mapping per
IO instead of per-page, and this pulls it back to an iomap per 16
pages for large user IOs. And it has the issues with breaking
delayed allocation optimisations, too.

Hence, IMO, this is the wrong layer in iomap to be dealing with
writeback and cache residency for uncached IO. We should be handling
cache residency/invalidation at a per-IO level, not a per-page level.

Sure, have the write actor return a flag (e.g. in the iomap) to say
that it encountered cached pages so that we can decide whether or
not to invalidate the entire range we just wrote in iomap_apply, but
doing it between mappings in iomap_apply means that the writeback is
done once per user IO, and cache invalidation only occurs if no
cached pages were encountered during that IO. i.e. add this to
iomap_apply() after ops->iomap_end() has been called:


	if (flags & RWF_UNCACHED) {
		ret = filemap_write_and_wait_range(mapping, start, end);
		if (ret)
			goto out;

		if (!drop_cache)
			goto out;

		/*
		 * Try to invalidate cache pages for the range we
		 * just wrote. We don't care if invalidation fails
		 * as the write has still worked and leaving clean
		 * uptodate pages in the page cache isn't a
		 * corruption vector for uncached IO.
		 */
		invalidate_inode_pages2_range(mapping,
				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
	}
out:
	return written ? written : ret;
}

Note that this doesn't solve the write error return issue. i.e.
if filemap_write_and_wait_range() fails, should that error be
returned or ignored?

And that leads to my next question: what data integrity guarantees
does RWF_UNCACHED give? What if the underlying device has a volatile
write cache or we dirtied metadata during block allocation? i.e.  to
a user, "UNCACHED" kinda implies that the write has ended up on
stable storage because they are saying "do not cache this data". To
me, none of this implementation guarantees data integrity, and users
would still need O_DSYNC or fsync() with RWF_UNCACHED IO. That seems
sane to me (same as direct io requirements) but whatever is decided
here, it will need to be spelled out clearly in the man page so that 
users don't get it wrong.

Cheers,

Dave.
Jens Axboe Dec. 11, 2019, 2:44 p.m. UTC | #2
On 12/10/19 6:14 PM, Dave Chinner wrote:
>> @@ -864,15 +896,29 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>>  			 */
>>  			bytes = min_t(unsigned long, PAGE_SIZE - offset,
>>  						iov_iter_single_seg_count(i));
>> +			if (drop_page)
>> +				put_page(page);
>>  			goto again;
>>  		}
>> +
>> +		if (drop_page &&
>> +		    ((pos >> PAGE_SHIFT) != ((pos + copied) >> PAGE_SHIFT))) {
>> +			if (!pagevec_add(&pvec, page))
>> +				write_drop_cached_pages(&pvec, mapping);
>> +		} else {
>> +			if (drop_page)
>> +				put_page(page);
>> +			balance_dirty_pages_ratelimited(inode->i_mapping);
>> +		}
> 
> This looks like it's a problem: this is going to write the
> data, which can cause the extent mapping of the file to change
> beyond the range that was written (e.g. due to speculative delayed
> allocation) and so the iomap we have already cached to direct the
> write behaviour may now be stale.
> 
> IOWs, to be safe we need to terminate the write loop at this point,
> return to iomap_apply() and remap the range we are writing into so
> that we don't end up using a stale iomap. That kinda defeats the
> purpose of iomap - we are trying to do a single extent mapping per
> IO instead of per-page, and this pulls it back to an iomap per 16
> pages for large user IOs. And it has the issues with breaking
> delayed allocation optimisations, too.
> 
> Hence, IMO, this is the wrong layer in iomap to be dealing with
> writeback and cache residency for uncached IO. We should be handling
> cache residency/invalidation at a per-IO level, not a per-page level.
> 
> Sure, have the write actor return a flag (e.g. in the iomap) to say
> that it encountered cached pages so that we can decide whether or
> not to invalidate the entire range we just wrote in iomap_apply, but
> doing it between mappings in iomap_apply means that the writeback is
> done once per user IO, and cache invalidation only occurs if no
> cached pages were encountered during that IO. i.e. add this to
> iomap_apply() after ops->iomap_end() has been called:
> 
> 
> 	if (flags & RWF_UNCACHED) {
> 		ret = filemap_write_and_wait_range(mapping, start, end);
> 		if (ret)
> 			goto out;
> 
> 		if (!drop_cache)
> 			goto out;
> 
> 		/*
> 		 * Try to invalidate cache pages for the range we
> 		 * just wrote. We don't care if invalidation fails
> 		 * as the write has still worked and leaving clean
> 		 * uptodate pages in the page cache isn't a
> 		 * corruption vector for uncached IO.
> 		 */
> 		invalidate_inode_pages2_range(mapping,
> 				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> 	}
> out:
> 	return written ? written : ret;
> }

I really like this, and some variant of that solution can also be
applied to the non-iomap case. It'll make for a cleaner implementation
as well.

Once we do it per-write we'll have solved the performance and iomap
weirdness, but it does mean that we need to make a decision on when to
invalidate. For the per-page case it's clear - if the page is already
there, leave it. If not, (attempt to) kill it. For the full write range,
what if just one page was not there? Do we invalidate then? What if one
page was there?

I'm going to move forward with the notion that if we had to allocate any
page in the range, we invalidate the range. Only ranges that were fully
cached already will remain in the cache. I think those semantics make
the most sense.
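
To sketch what I mean (purely illustrative, names made up here, not
necessarily what the respin will end up doing):

	/*
	 * Illustrative sketch only: flush the just-written range once per
	 * write, and only invalidate if the write had to create at least
	 * one page cache page. "page_created" would be tracked by the
	 * write actor; fully cached ranges stay cached.
	 */
	static int uncached_write_finish(struct address_space *mapping,
					 loff_t start, loff_t end,
					 bool page_created)
	{
		int ret;

		/* Kick off and wait for writeback of the written range. */
		ret = filemap_write_and_wait_range(mapping, start, end);
		if (ret)
			return ret;

		if (!page_created)
			return 0;

		/*
		 * Invalidation failure isn't an error: clean, uptodate
		 * pages left behind are harmless for uncached IO.
		 */
		invalidate_inode_pages2_range(mapping, start >> PAGE_SHIFT,
					      end >> PAGE_SHIFT);
		return 0;
	}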

> Note that this doesn't solve the write error return issue. i.e.
> if filemap_write_and_wait_range() fails, should that error be
> returned or ignored?

I think we should handle this exactly like we do the O_DIRECT case on
buffered IO write-and-wait.

> And that leads to my next question: what data integrity guarantees
> does RWF_UNCACHED give? What if the underlying device has a volatile
> write cache or we dirtied metadata during block allocation? i.e.  to
> a user, "UNCACHED" kinda implies that the write has ended up on
> stable storage because they are saying "do not cache this data". To
> me, none of this implementation guarantees data integrity, and users
> would still need O_DSYNC or fsync() with RWF_UNCACHED IO. That seems
> sane to me (same as direct io requirements) but whatever is decided
> here, it will need to be spelled out clearly in the man page so that 
> users don't get it wrong.

Fully agree, this isn't about data integrity guarantees. You will still
need the appropriate sync magic to make it safe. This is exactly like
O_DIRECT, and I think we should retain those exact same semantics for
the RWF_UNCACHED case.
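
For reference, from userspace that would look roughly like the below;
RWF_UNCACHED isn't in mainline headers yet, so the value used here is
purely illustrative:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/uio.h>
	#include <unistd.h>

	#ifndef RWF_UNCACHED
	#define RWF_UNCACHED	0x00000040	/* illustrative value only */
	#endif

	/* Uncached buffered write followed by an explicit data sync. */
	static int write_uncached_durable(int fd, const void *buf,
					  size_t len, off_t offset)
	{
		struct iovec iov = {
			.iov_base = (void *) buf,
			.iov_len = len,
		};

		/* Buffered write that avoids polluting the page cache. */
		if (pwritev2(fd, &iov, 1, offset, RWF_UNCACHED) < 0)
			return -1;

		/* RWF_UNCACHED alone gives no integrity guarantee. */
		return fdatasync(fd);
	}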

Thanks for taking a look at this! I'll respin (and re-test) with the
write side changed.

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9b5b770ca4c7..3a18a6af8cb3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -17,6 +17,7 @@ 
 #include <linux/bio.h>
 #include <linux/sched/signal.h>
 #include <linux/migrate.h>
+#include <linux/pagevec.h>
 #include "trace.h"
 
 #include "../internal.h"
@@ -566,6 +567,7 @@  EXPORT_SYMBOL_GPL(iomap_migrate_page);
 
 enum {
 	IOMAP_WRITE_F_UNSHARE		= (1 << 0),
+	IOMAP_WRITE_F_UNCACHED		= (1 << 1),
 };
 
 static void
@@ -643,6 +645,7 @@  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
 {
 	const struct iomap_page_ops *page_ops = iomap->page_ops;
+	unsigned aop_flags;
 	struct page *page;
 	int status = 0;
 
@@ -659,8 +662,11 @@  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 			return status;
 	}
 
+	aop_flags = AOP_FLAG_NOFS;
+	if (flags & IOMAP_UNCACHED)
+		aop_flags |= AOP_FLAG_UNCACHED;
 	page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
-			AOP_FLAG_NOFS);
+						aop_flags);
 	if (!page) {
 		status = -ENOMEM;
 		goto out_no_page;
@@ -670,9 +676,14 @@  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		iomap_read_inline_data(inode, page, srcmap);
 	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
 		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
-	else
-		status = __iomap_write_begin(inode, pos, len, flags, page,
+	else {
+		unsigned wb_flags = 0;
+
+		if (flags & IOMAP_UNCACHED)
+			wb_flags = IOMAP_WRITE_F_UNCACHED;
+		status = __iomap_write_begin(inode, pos, len, wb_flags, page,
 				srcmap);
+	}
 
 	if (unlikely(status))
 		goto out_unlock;
@@ -796,19 +807,27 @@  iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
 	return ret;
 }
 
+#define GPW_PAGE_BATCH		16
+
 static loff_t
 iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
 {
+	struct address_space *mapping = inode->i_mapping;
 	struct iov_iter *i = data;
+	struct pagevec pvec;
 	long status = 0;
 	ssize_t written = 0;
 
+	pagevec_init(&pvec);
+
 	do {
 		struct page *page;
 		unsigned long offset;	/* Offset into pagecache page */
 		unsigned long bytes;	/* Bytes to write to page */
 		size_t copied;		/* Bytes copied from user */
+		bool drop_page = false;	/* drop page after IO */
+		unsigned lflags = flags;
 
 		offset = offset_in_page(pos);
 		bytes = min_t(unsigned long, PAGE_SIZE - offset,
@@ -832,10 +851,17 @@  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 			break;
 		}
 
-		status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
-				srcmap);
-		if (unlikely(status))
+retry:
+		status = iomap_write_begin(inode, pos, bytes, lflags, &page,
+						iomap, srcmap);
+		if (unlikely(status)) {
+			if (status == -ENOMEM && (lflags & IOMAP_UNCACHED)) {
+				drop_page = true;
+				lflags &= ~IOMAP_UNCACHED;
+				goto retry;
+			}
 			break;
+		}
 
 		if (mapping_writably_mapped(inode->i_mapping))
 			flush_dcache_page(page);
@@ -844,10 +870,16 @@  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 		flush_dcache_page(page);
 
+		if (drop_page)
+			get_page(page);
+
 		status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
 				srcmap);
-		if (unlikely(status < 0))
+		if (unlikely(status < 0)) {
+			if (drop_page)
+				put_page(page);
 			break;
+		}
 		copied = status;
 
 		cond_resched();
@@ -864,15 +896,29 @@  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 			 */
 			bytes = min_t(unsigned long, PAGE_SIZE - offset,
 						iov_iter_single_seg_count(i));
+			if (drop_page)
+				put_page(page);
 			goto again;
 		}
+
+		if (drop_page &&
+		    ((pos >> PAGE_SHIFT) != ((pos + copied) >> PAGE_SHIFT))) {
+			if (!pagevec_add(&pvec, page))
+				write_drop_cached_pages(&pvec, mapping);
+		} else {
+			if (drop_page)
+				put_page(page);
+			balance_dirty_pages_ratelimited(inode->i_mapping);
+		}
+
 		pos += copied;
 		written += copied;
 		length -= copied;
-
-		balance_dirty_pages_ratelimited(inode->i_mapping);
 	} while (iov_iter_count(i) && length);
 
+	if (pagevec_count(&pvec))
+		write_drop_cached_pages(&pvec, mapping);
+
 	return written ? written : status;
 }
 
@@ -882,10 +928,14 @@  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
+	unsigned flags = IOMAP_WRITE;
+
+	if (iocb->ki_flags & IOCB_UNCACHED)
+		flags |= IOMAP_UNCACHED;
 
 	while (iov_iter_count(iter)) {
-		ret = iomap_apply(inode, pos, iov_iter_count(iter),
-				IOMAP_WRITE, ops, iter, iomap_write_actor);
+		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags,
+					ops, iter, iomap_write_actor);
 		if (ret <= 0)
 			break;
 		pos += ret;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 61fcaa3904d4..833dd43507ac 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -121,6 +121,7 @@  struct iomap_page_ops {
 #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
 #define IOMAP_NOWAIT		(1 << 5) /* do not block */
+#define IOMAP_UNCACHED		(1 << 6)
 
 struct iomap_ops {
 	/*