[RFC,2/4] splice: Make vmsplice() steal or copy

Message ID: 20230629155433.4170837-3-dhowells@redhat.com (mailing list archive)
State: New
Series: splice: Fix corruption in data spliced to pipe

Commit Message

David Howells June 29, 2023, 3:54 p.m. UTC
Make vmsplice()-to-pipe try to steal gifted data or else copy the source
data immediately before adding it to the pipe.  This prevents the data
added to the pipe from being modified by write(), by shared-writable mmap
and by fallocate().
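
For illustration, here is a minimal userspace sketch of the scenario this
fixes (not part of the patch; assumes a 4KiB page size).  SPLICE_F_GIFT
only applies when the buffer is page-aligned and page-sized, hence the
aligned allocation.  Before this change, scribbling on the buffer after
vmsplice() could alter the data the pipe reader sees; with steal-or-copy
it cannot:

	#define _GNU_SOURCE
	#include <fcntl.h>      /* vmsplice(), SPLICE_F_GIFT */
	#include <stdio.h>
	#include <stdlib.h>     /* aligned_alloc() */
	#include <string.h>
	#include <sys/uio.h>    /* struct iovec */
	#include <unistd.h>     /* pipe(), read() */

	int main(void)
	{
		int pfd[2];
		char out[4096];
		/* Page-aligned, page-sized buffer so the gift can apply. */
		char *buf = aligned_alloc(4096, 4096);
		struct iovec iov = { .iov_base = buf, .iov_len = 4096 };

		if (!buf || pipe(pfd) < 0)
			return 1;
		memset(buf, 'A', 4096);

		/* Gift the page to the pipe; the kernel may steal it. */
		if (vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT) != 4096)
			return 1;

		/* Without this patch, this write could be visible to the
		 * reader below; with it, the reader sees 'A'. */
		memset(buf, 'B', 4096);

		read(pfd[0], out, sizeof(out));
		printf("reader sees: %c\n", out[0]);
		return 0;
	}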

[!] Note: I'm using unmap_mapping_folio() and remove_mapping() to steal a
    gifted page on behalf of vmsplice().  It partly works, but after a
    large batch of stealing it will oops; I can't tell why, as it dies in
    the middle of a huge chunk of macro-generated interval tree code.

[!] Note: I'm only allowing theft of pages with refcount <= 4.  refcount == 3
    would actually seem to be the right thing (one for the caller, one for the
    pagecache and one for our page table), but sometimes a fourth ref is held
    transiently (possibly a deferred put from page-in).
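
Schematically, that accounting corresponds to the stealability check the
patch makes twice below (once optimistically and once under the folio
lock).  A hypothetical helper restating it, purely for illustration:

	#include <linux/mm.h>

	/* One ref for the caller, one for the pagecache, one for the page
	 * tables, plus the transient fourth noted above; any more than
	 * that (or a mapcount != 1, or a possible DMA pin) means someone
	 * else can see the page, so it must be copied rather than stolen.
	 */
	static bool folio_looks_stealable(struct folio *folio)
	{
		return folio_ref_count(folio) <= 4 &&
		       folio_mapcount(folio) == 1 &&
		       !folio_maybe_dma_pinned(folio);
	}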

Reported-by: Matt Whitlock <kernel@mattwhitlock.name>
Link: https://lore.kernel.org/r/ec804f26-fa76-4fbe-9b1c-8fbbd829b735@mattwhitlock.name/
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Dave Chinner <david@fromorbit.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Jens Axboe <axboe@kernel.dk>
cc: linux-fsdevel@vger.kernel.org
---
 fs/splice.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 113 insertions(+), 10 deletions(-)

Comments

Simon Horman June 30, 2023, 1:44 p.m. UTC | #1
On Thu, Jun 29, 2023 at 04:54:31PM +0100, David Howells wrote:

...

>  static int iter_to_pipe(struct iov_iter *from,
>  			struct pipe_inode_info *pipe,
>  			unsigned flags)
>  {
> -	struct pipe_buffer buf = {
> -		.ops = &user_page_pipe_buf_ops,

Hi David,

Perhaps this patchset will change somewhat based on the discussion
elsewhere in this thread.

But, on a more mundane level, GCC reports that user_page_pipe_buf_ops is
(now) unused.  I guess this was the last user, and user_page_pipe_buf_ops
can be removed as part of this patch.

...
David Howells June 30, 2023, 3:29 p.m. UTC | #2
Simon Horman <simon.horman@corigine.com> wrote:

> But, on a more mundane level, GCC reports that user_page_pipe_buf_ops is
> (now) unused.  I guess this was the last user, and user_page_pipe_buf_ops
> can be removed as part of this patch.

See patch 3.

David
Simon Horman June 30, 2023, 5:32 p.m. UTC | #3
On Fri, Jun 30, 2023 at 04:29:34PM +0100, David Howells wrote:
> Simon Horman <simon.horman@corigine.com> wrote:
> 
> > But, on a more mundane level, GCC reports that user_page_pipe_buf_ops is
> > (now) unused.  I guess this was the last user, and user_page_pipe_buf_ops
> > can be removed as part of this patch.
> 
> See patch 3.

Thanks, I do see that now.
But as things stand, bisection is broken.

Patch

diff --git a/fs/splice.c b/fs/splice.c
index 004eb1c4ce31..42af642c0ff8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -37,6 +37,7 @@ 
 #include <linux/socket.h>
 #include <linux/sched/signal.h>
 
+#include "../mm/internal.h"
 #include "internal.h"
 
 /*
@@ -1382,14 +1383,117 @@  static long __do_splice(struct file *in, loff_t __user *off_in,
 	return ret;
 }
 
+static void copy_folio_to_folio(struct folio *src, size_t src_offset,
+				struct folio *dst, size_t dst_offset,
+				size_t size)
+{
+	void *p, *q;
+
+	while (size > 0) {
+		size_t part = min3(PAGE_SIZE - src_offset % PAGE_SIZE,
+				   PAGE_SIZE - dst_offset % PAGE_SIZE,
+				   size);
+
+		p = kmap_local_folio(src, src_offset);
+		q = kmap_local_folio(dst, dst_offset);
+		memcpy(q, p, part);
+		kunmap_local(p);
+		kunmap_local(q);
+		src_offset += part;
+		dst_offset += part;
+		size -= part;
+	}
+}
+
+static int splice_try_to_steal_page(struct pipe_inode_info *pipe,
+				    struct page *page, size_t offset,
+				    size_t size, unsigned int splice_flags)
+{
+	struct folio *folio = page_folio(page), *copy;
+	unsigned int flags = 0;
+	size_t fsize = folio_size(folio), spliced = 0;
+
+	if (!(splice_flags & SPLICE_F_GIFT) ||
+	    fsize != PAGE_SIZE || offset != 0 || size != fsize)
+		goto need_copy;
+
+	/*
+	 * For a folio to be stealable, the caller holds a ref, the mapping
+	 * holds a ref and the page tables hold a ref; it may or may not also
+	 * be on the LRU.  Anything else and someone else has access to it.
+	 */
+	if (folio_ref_count(folio) > 4 || folio_mapcount(folio) != 1 ||
+	    folio_maybe_dma_pinned(folio))
+		goto need_copy;
+
+	/* Try to steal. */
+	folio_lock(folio);
+
+	if (folio_ref_count(folio) > 4 || folio_mapcount(folio) != 1 ||
+	    folio_maybe_dma_pinned(folio))
+		goto need_copy_unlock;
+	if (!folio->mapping)
+		goto need_copy_unlock; /* vmsplice race? */
+
+	/*
+	 * Remove the folio from the process VM and then try to remove
+	 * it from the mapping.  If we can't remove it, we'll have to
+	 * copy it instead.
+	 */
+	unmap_mapping_folio(folio);
+	if (remove_mapping(folio->mapping, folio)) {
+		folio_clear_mappedtodisk(folio);
+		flags |= PIPE_BUF_FLAG_LRU;
+		goto add_to_pipe;
+	}
+
+need_copy_unlock:
+	folio_unlock(folio);
+need_copy:
+
+	copy = folio_alloc(GFP_KERNEL, 0);
+	if (!copy)
+		return -ENOMEM;
+
+	size = min(size, PAGE_SIZE - offset % PAGE_SIZE);
+	copy_folio_to_folio(folio, offset, copy, 0, size);
+	folio_mark_uptodate(copy);
+	folio_put(folio);
+	folio = copy;
+	offset = 0;
+
+add_to_pipe:
+	page = folio_page(folio, offset / PAGE_SIZE);
+	size = min(size, folio_size(folio) - offset);
+	offset %= PAGE_SIZE;
+
+	while (spliced < size &&
+	       !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+		struct pipe_buffer *buf = pipe_head_buf(pipe);
+		size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
+
+		*buf = (struct pipe_buffer) {
+			.ops	= &default_pipe_buf_ops,
+			.page	= page,
+			.offset	= offset,
+			.len	= part,
+			.flags	= flags,
+		};
+		folio_get(folio);
+		pipe->head++;
+		page++;
+		spliced += part;
+		offset = 0;
+	}
+
+	folio_put(folio);
+	return spliced;
+}
+
 static int iter_to_pipe(struct iov_iter *from,
 			struct pipe_inode_info *pipe,
 			unsigned flags)
 {
-	struct pipe_buffer buf = {
-		.ops = &user_page_pipe_buf_ops,
-		.flags = flags
-	};
 	size_t total = 0;
 	int ret = 0;
 
@@ -1407,12 +1511,11 @@  static int iter_to_pipe(struct iov_iter *from,
 
 		n = DIV_ROUND_UP(left + start, PAGE_SIZE);
 		for (i = 0; i < n; i++) {
-			int size = min_t(int, left, PAGE_SIZE - start);
+			size_t part = min_t(size_t, left,
+					    PAGE_SIZE - start % PAGE_SIZE);
 
-			buf.page = pages[i];
-			buf.offset = start;
-			buf.len = size;
-			ret = add_to_pipe(pipe, &buf);
+			ret = splice_try_to_steal_page(pipe, pages[i], start,
+						       part, flags);
 			if (unlikely(ret < 0)) {
 				iov_iter_revert(from, left);
 				// this one got dropped by add_to_pipe()
@@ -1421,7 +1524,7 @@  static int iter_to_pipe(struct iov_iter *from,
 				goto out;
 			}
 			total += ret;
-			left -= size;
+			left -= part;
 			start = 0;
 		}
 	}