[3/5] mm: make buffered writes work with RWF_UNCACHED

Message ID 20191210204304.12266-4-axboe@kernel.dk
State New, archived
Series Support for RWF_UNCACHED

Commit Message

Jens Axboe Dec. 10, 2019, 8:43 p.m. UTC
If RWF_UNCACHED is set for io_uring (or pwritev2(2)), we'll drop the
cache instantiated for buffered writes. If new pages aren't
instantiated, we leave them alone. This provides similar semantics to
reads with RWF_UNCACHED set.
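
As a reference for the userspace side, here is a minimal sketch of driving
this through pwritev2(2). Note that RWF_UNCACHED itself is introduced in
patch 1/5 of this series; the flag value and file name below are
placeholders, the real definition should be taken from that patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

/*
 * RWF_UNCACHED comes from patch 1/5 of this series; the value here is
 * illustrative only -- use the definition from the series itself.
 */
#ifndef RWF_UNCACHED
#define RWF_UNCACHED	0x00000040
#endif

int main(void)
{
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd;

	memset(buf, 0xaa, sizeof(buf));

	fd = open("testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Plain buffered write, except any page cache instantiated on
	 * its behalf is written back and dropped once the write is done.
	 */
	if (pwritev2(fd, &iov, 1, 0, RWF_UNCACHED) < 0)
		perror("pwritev2");

	close(fd);
	return 0;
}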

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h |  5 +++
 mm/filemap.c       | 85 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 85 insertions(+), 5 deletions(-)

Comments

Andrew Morton Dec. 14, 2019, 12:01 a.m. UTC | #1
On Tue, 10 Dec 2019 13:43:02 -0700 Jens Axboe <axboe@kernel.dk> wrote:

> If RWF_UNCACHED is set for io_uring (or pwritev2(2)), we'll drop the
> cache instantiated for buffered writes. If new pages aren't
> instantiated, we leave them alone. This provides similar semantics to
> reads with RWF_UNCACHED set.
> 

Would be nice to see a description of the proposed userspace API(s)
for exploiting this feature.

> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -285,6 +285,7 @@ enum positive_aop_returns {
>  #define AOP_FLAG_NOFS			0x0002 /* used by filesystem to direct
>  						* helper code (eg buffer layer)
>  						* to clear GFP_FS from alloc */
> +#define AOP_FLAG_UNCACHED		0x0004
>  
>  /*
>   * oh the beauties of C type declarations.
> @@ -3106,6 +3107,10 @@ extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
>  extern ssize_t generic_perform_write(struct file *, struct iov_iter *,
>  				     struct kiocb *);
>  
> +struct pagevec;
> +extern void write_drop_cached_pages(struct pagevec *pvec,
> +				struct address_space *mapping);
> +
>  ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
>  		rwf_t flags);
>  ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
> diff --git a/mm/filemap.c b/mm/filemap.c
> index fe37bd2b2630..2e36129ebe38 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3287,10 +3287,12 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
>  					pgoff_t index, unsigned flags)
>  {
>  	struct page *page;
> -	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
> +	int fgp_flags = FGP_LOCK|FGP_WRITE;
>  
>  	if (flags & AOP_FLAG_NOFS)
>  		fgp_flags |= FGP_NOFS;
> +	if (!(flags & AOP_FLAG_UNCACHED))
> +		fgp_flags |= FGP_CREAT;
>  
>  	page = pagecache_get_page(mapping, index, fgp_flags,
>  			mapping_gfp_mask(mapping));
> @@ -3301,21 +3303,65 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
>  }
>  EXPORT_SYMBOL(grab_cache_page_write_begin);
>  
> +/*
> + * Start writeback on the pages in pvec, and then try to remove those
> + * pages from the page cache. Used with RWF_UNCACHED.
> + */
> +void write_drop_cached_pages(struct pagevec *pvec,
> +			     struct address_space *mapping)
> +{
> +	loff_t start, end;
> +	int i;
> +
> +	end = 0;
> +	start = LLONG_MAX;
> +	for (i = 0; i < pagevec_count(pvec); i++) {
> +		loff_t off = page_offset(pvec->pages[i]);
> +		if (off < start)
> +			start = off;
> +		if (off > end)
> +			end = off;
> +	}
> +
> +	__filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
> +
> +	for (i = 0; i < pagevec_count(pvec); i++) {
> +		struct page *page = pvec->pages[i];
> +
> +		lock_page(page);
> +		if (page->mapping == mapping) {
> +			wait_on_page_writeback(page);
> +			if (!page_has_private(page) ||
> +			    try_to_release_page(page, 0))
> +				remove_mapping(mapping, page);
> +		}
> +		unlock_page(page);
> +	}

This is kinda invalidate_inode_pages2_range(), only much less so?  Why
doesn't this code need to do all the things which
invalidate_inode_pages2_range() does?  What happens if these pages are
mmapped, faulted in?  Not faulted in?


> +	pagevec_release(pvec);
> +}

Patch

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf58db1bc032..7ea3dfdd9aa5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -285,6 +285,7 @@  enum positive_aop_returns {
 #define AOP_FLAG_NOFS			0x0002 /* used by filesystem to direct
 						* helper code (eg buffer layer)
 						* to clear GFP_FS from alloc */
+#define AOP_FLAG_UNCACHED		0x0004
 
 /*
  * oh the beauties of C type declarations.
@@ -3106,6 +3107,10 @@  extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_perform_write(struct file *, struct iov_iter *,
 				     struct kiocb *);
 
+struct pagevec;
+extern void write_drop_cached_pages(struct pagevec *pvec,
+				struct address_space *mapping);
+
 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 		rwf_t flags);
 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
diff --git a/mm/filemap.c b/mm/filemap.c
index fe37bd2b2630..2e36129ebe38 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3287,10 +3287,12 @@  struct page *grab_cache_page_write_begin(struct address_space *mapping,
 					pgoff_t index, unsigned flags)
 {
 	struct page *page;
-	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
+	int fgp_flags = FGP_LOCK|FGP_WRITE;
 
 	if (flags & AOP_FLAG_NOFS)
 		fgp_flags |= FGP_NOFS;
+	if (!(flags & AOP_FLAG_UNCACHED))
+		fgp_flags |= FGP_CREAT;
 
 	page = pagecache_get_page(mapping, index, fgp_flags,
 			mapping_gfp_mask(mapping));
@@ -3301,21 +3303,65 @@  struct page *grab_cache_page_write_begin(struct address_space *mapping,
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);
 
+/*
+ * Start writeback on the pages in pvec, and then try to remove those
+ * pages from the page cache. Used with RWF_UNCACHED.
+ */
+void write_drop_cached_pages(struct pagevec *pvec,
+			     struct address_space *mapping)
+{
+	loff_t start, end;
+	int i;
+
+	end = 0;
+	start = LLONG_MAX;
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		loff_t off = page_offset(pvec->pages[i]);
+		if (off < start)
+			start = off;
+		if (off > end)
+			end = off;
+	}
+
+	__filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+
+		lock_page(page);
+		if (page->mapping == mapping) {
+			wait_on_page_writeback(page);
+			if (!page_has_private(page) ||
+			    try_to_release_page(page, 0))
+				remove_mapping(mapping, page);
+		}
+		unlock_page(page);
+	}
+	pagevec_release(pvec);
+}
+EXPORT_SYMBOL_GPL(write_drop_cached_pages);
+
+#define GPW_PAGE_BATCH		16
+
 ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, struct kiocb *iocb)
 {
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	loff_t pos = iocb->ki_pos;
+	struct pagevec pvec;
 	long status = 0;
 	ssize_t written = 0;
 	unsigned int flags = 0;
 
+	pagevec_init(&pvec);
+
 	do {
 		struct page *page;
 		unsigned long offset;	/* Offset into pagecache page */
 		unsigned long bytes;	/* Bytes to write to page */
 		size_t copied;		/* Bytes copied from user */
+		bool drop_page = false;	/* drop page after IO */
 		void *fsdata;
 
 		offset = (pos & (PAGE_SIZE - 1));
@@ -3323,6 +3369,9 @@  ssize_t generic_perform_write(struct file *file,
 						iov_iter_count(i));
 
 again:
+		if (iocb->ki_flags & IOCB_UNCACHED)
+			flags |= AOP_FLAG_UNCACHED;
+
 		/*
 		 * Bring in the user page that we will copy from _first_.
 		 * Otherwise there's a nasty deadlock on copying from the
@@ -3343,10 +3392,17 @@  ssize_t generic_perform_write(struct file *file,
 			break;
 		}
 
+retry:
 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
 						&page, &fsdata);
-		if (unlikely(status < 0))
+		if (unlikely(status < 0)) {
+			if (status == -ENOMEM && (flags & AOP_FLAG_UNCACHED)) {
+				drop_page = true;
+				flags &= ~AOP_FLAG_UNCACHED;
+				goto retry;
+			}
 			break;
+		}
 
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
@@ -3354,10 +3410,16 @@  ssize_t generic_perform_write(struct file *file,
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
 		flush_dcache_page(page);
 
+		if (drop_page)
+			get_page(page);
+
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
 						page, fsdata);
-		if (unlikely(status < 0))
+		if (unlikely(status < 0)) {
+			if (drop_page)
+				put_page(page);
 			break;
+		}
 		copied = status;
 
 		cond_resched();
@@ -3374,14 +3436,27 @@  ssize_t generic_perform_write(struct file *file,
 			 */
 			bytes = min_t(unsigned long, PAGE_SIZE - offset,
 						iov_iter_single_seg_count(i));
+			if (drop_page)
+				put_page(page);
 			goto again;
 		}
+		if (drop_page &&
+		    ((pos >> PAGE_SHIFT) != ((pos + copied) >> PAGE_SHIFT))) {
+			if (!pagevec_add(&pvec, page))
+				write_drop_cached_pages(&pvec, mapping);
+		} else {
+			if (drop_page)
+				put_page(page);
+			balance_dirty_pages_ratelimited(mapping);
+		}
+
 		pos += copied;
 		written += copied;
-
-		balance_dirty_pages_ratelimited(mapping);
 	} while (iov_iter_count(i));
 
+	if (pagevec_count(&pvec))
+		write_drop_cached_pages(&pvec, mapping);
+
 	return written ? written : status;
 }
 EXPORT_SYMBOL(generic_perform_write);