diff mbox series

iov_iter: Add automatic-alloc for ITER_BVEC and use in direct_splice_read()

Message ID 1740264.1684482558@warthog.procyon.org.uk (mailing list archive)
State New
Headers show
Series iov_iter: Add automatic-alloc for ITER_BVEC and use in direct_splice_read() | expand

Commit Message

David Howells May 19, 2023, 7:49 a.m. UTC
If it's a problem that direct_splice_read() always allocates as much memory as
is asked for and that will fit into the pipe when less could be allocated in
the case that, say, an O_DIRECT-read will hit a hole and do a short read or a
socket will return less than was asked for, something like the attached
modification to ITER_BVEC could be made.

David
---
iov_iter: Add automatic-alloc for ITER_BVEC and use in direct_splice_read()

Add a flag to the iov_iter struct that tells things that write to or allow
writing to a BVEC-type iterator that they should allocate pages to fill in
any slots in the bio_vec array that have null page pointers.  This allows
the bufferage in the bvec to be allocated on-demand.

Iterators of this type are initialised with iov_iter_bvec_autoalloc()
instead of iov_iter_bvec().  Only destination (ie. READ/ITER_DEST)
iterators may be used in this fashion.

An additional function, iov_iter_auto_alloc() is provided to perform the
allocation in the case that the caller wishes to make use of the bio_vec
array directly and the block layer is modified to use it.

direct_splice_read() is then modified to make use of this.  This is less
efficient if we know in advance that we want to allocate the full buffer as
we can't so easily use bulk allocation, but it does mean in cases where we
might not want the full buffer (say we hit a hole in DIO), we don't have to
allocate it.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Jens Axboe <axboe@kernel.dk>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: David Hildenbrand <david@redhat.com>
cc: John Hubbard <jhubbard@nvidia.com>
cc: linux-mm@kvack.org
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
---
 block/bio.c         |    2 
 fs/splice.c         |   36 ++++++-----------
 include/linux/uio.h |   13 ++++--
 lib/iov_iter.c      |  110 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 133 insertions(+), 28 deletions(-)

Comments

Christoph Hellwig May 19, 2023, 8:18 a.m. UTC | #1
On Fri, May 19, 2023 at 08:49:18AM +0100, David Howells wrote:
> direct_splice_read() is then modified to make use of this.  This is less
> efficient if we know in advance that we want to allocate the full buffer as
> we can't so easily use bulk allocation, but it does mean in cases where we
> might not want the full buffer (say we hit a hole in DIO), we don't have to
> allocate it.

Can you eplain the workloads we're trying to optimize for here?

To me splice on O_DIRECT is more of a historic accident than an actually
good use case..
diff mbox series

Patch

diff --git a/block/bio.c b/block/bio.c
index 798cc4cf3bd2..72d5c1125df2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1330,6 +1330,8 @@  int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	int ret = 0;
 
 	if (iov_iter_is_bvec(iter)) {
+		if (!iov_iter_auto_alloc(iter, iov_iter_count(iter)))
+			return -ENOMEM;
 		bio_iov_bvec_set(bio, iter);
 		iov_iter_advance(iter, bio->bi_iter.bi_size);
 		return 0;
diff --git a/fs/splice.c b/fs/splice.c
index 56d9802729d0..30e7a31c5ada 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -310,10 +310,8 @@  ssize_t direct_splice_read(struct file *in, loff_t *ppos,
 	struct iov_iter to;
 	struct bio_vec *bv;
 	struct kiocb kiocb;
-	struct page **pages;
 	ssize_t ret;
-	size_t used, npages, chunk, remain, keep = 0;
-	int i;
+	size_t used, npages, chunk, remain, keep = 0, i;
 
 	if (!len)
 		return 0;
@@ -334,30 +332,14 @@  ssize_t direct_splice_read(struct file *in, loff_t *ppos,
 	len = min_t(size_t, len, npages * PAGE_SIZE);
 	npages = DIV_ROUND_UP(len, PAGE_SIZE);
 
-	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
-		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
+	bv = kzalloc(array_size(npages, sizeof(bv[0])), GFP_KERNEL);
 	if (!bv)
 		return -ENOMEM;
 
-	pages = (struct page **)(bv + npages);
-	npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
-	if (!npages) {
-		kfree(bv);
-		return -ENOMEM;
-	}
-
 	remain = len = min_t(size_t, len, npages * PAGE_SIZE);
 
-	for (i = 0; i < npages; i++) {
-		chunk = min_t(size_t, PAGE_SIZE, remain);
-		bv[i].bv_page = pages[i];
-		bv[i].bv_offset = 0;
-		bv[i].bv_len = chunk;
-		remain -= chunk;
-	}
-
 	/* Do the I/O */
-	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
+	iov_iter_bvec_autoalloc(&to, ITER_DEST, bv, npages, len);
 	init_sync_kiocb(&kiocb, in);
 	kiocb.ki_pos = *ppos;
 	ret = call_read_iter(in, &kiocb, &to);
@@ -376,8 +358,16 @@  ssize_t direct_splice_read(struct file *in, loff_t *ppos,
 	}
 
 	/* Free any pages that didn't get touched at all. */
-	if (keep < npages)
-		release_pages(pages + keep, npages - keep);
+	if (keep < npages) {
+		struct page **pages = (struct page **)&bv[keep];
+		size_t j = 0;
+
+		for (i = keep; i < npages; i++)
+			if (bv[i].bv_page)
+				pages[j++] = bv[i].bv_page;
+		if (j)
+			release_pages(pages, j);
+	}
 
 	/* Push the remaining pages into the pipe. */
 	remain = ret;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 60c342bb7ab8..6bc2287021d9 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -40,10 +40,11 @@  struct iov_iter_state {
 
 struct iov_iter {
 	u8 iter_type;
-	bool copy_mc;
-	bool nofault;
-	bool data_source;
-	bool user_backed;
+	bool copy_mc:1;
+	bool nofault:1;
+	bool data_source:1;
+	bool user_backed:1;
+	bool auto_alloc:1;	/* Automatically alloc pages into a bvec */
 	union {
 		size_t iov_offset;
 		int last_offset;
@@ -263,6 +264,7 @@  static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
 }
 #endif
 
+bool iov_iter_auto_alloc(struct iov_iter *iter, size_t count);
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
 			unsigned len_mask);
@@ -274,6 +276,9 @@  void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec
 			unsigned long nr_segs, size_t count);
 void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
 			unsigned long nr_segs, size_t count);
+void iov_iter_bvec_autoalloc(struct iov_iter *i, unsigned int direction,
+			     const struct bio_vec *bvec, unsigned long nr_segs,
+			     size_t count);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
 		     loff_t start, size_t count);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f18138e0292a..3643f9d80ecc 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -52,7 +52,11 @@ 
 	while (n) {						\
 		unsigned offset = p->bv_offset + skip;		\
 		unsigned left;					\
-		void *kaddr = kmap_local_page(p->bv_page +	\
+		void *kaddr;					\
+								\
+		if (!p->bv_page)				\
+			break;					\
+		kaddr = kmap_local_page(p->bv_page +		\
 					offset / PAGE_SIZE);	\
 		base = kaddr + offset % PAGE_SIZE;		\
 		len = min(min(n, (size_t)(p->bv_len - skip)),	\
@@ -159,6 +163,49 @@  __out:								\
 #define iterate_and_advance(i, n, base, len, off, I, K) \
 	__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
 
+/*
+ * Preallocate pages into the bvec sufficient to store count bytes.
+ */
+static bool bvec_auto_alloc(struct iov_iter *iter, size_t count)
+{
+	struct bio_vec *bvec = (struct bio_vec *)iter->bvec;
+	int j;
+
+	if (!count)
+		return true;
+
+	count += iter->iov_offset;
+	for (j = 0; j < iter->nr_segs; j++) {
+		if (!bvec[j].bv_page) {
+			bvec[j].bv_page = alloc_page(GFP_KERNEL);
+			if (!bvec[j].bv_page)
+				return false;
+		}
+		if (bvec[j].bv_len >= count)
+			break;
+		count -= bvec[j].bv_len;
+	}
+
+	return true;
+}
+
+/**
+ * iov_iter_auto_alloc - Perform auto-alloc for an iterator
+ * @iter: The iterator to do the allocation for
+ * @count: The number of bytes we need to store
+ *
+ * Perform auto-allocation on a iterator.  This only works with ITER_BVEC-type
+ * iterators.  It will make sure that pages are allocated sufficient to store
+ * the specified number of bytes.  Returns true if sufficient pages are present
+ * in the bvec and false if an allocation failure occurs.
+ */
+bool iov_iter_auto_alloc(struct iov_iter *iter, size_t count)
+{
+	return !iov_iter_is_bvec(iter) || !iter->auto_alloc ||
+		bvec_auto_alloc(iter, count);
+}
+EXPORT_SYMBOL_GPL(iov_iter_auto_alloc);
+
 static int copyout(void __user *to, const void *from, size_t n)
 {
 	if (should_fail_usercopy())
@@ -313,6 +360,8 @@  size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 		return 0;
 	if (user_backed_iter(i))
 		might_fault();
+	if (!iov_iter_auto_alloc(i, bytes))
+		return 0;
 	iterate_and_advance(i, bytes, base, len, off,
 		copyout(base, addr + off, len),
 		memcpy(base, addr + off, len)
@@ -362,6 +411,8 @@  size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 		return 0;
 	if (user_backed_iter(i))
 		might_fault();
+	if (!iov_iter_auto_alloc(i, bytes))
+		return 0;
 	__iterate_and_advance(i, bytes, base, len, off,
 		copyout_mc(base, addr + off, len),
 		copy_mc_to_kernel(base, addr + off, len)
@@ -503,6 +554,8 @@  size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte
 		return 0;
 	if (WARN_ON_ONCE(i->data_source))
 		return 0;
+	if (!iov_iter_auto_alloc(i, bytes))
+		return 0;
 	page += offset / PAGE_SIZE; // first subpage
 	offset %= PAGE_SIZE;
 	while (1) {
@@ -557,6 +610,8 @@  EXPORT_SYMBOL(copy_page_from_iter);
 
 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 {
+	if (!iov_iter_auto_alloc(i, bytes))
+		return -ENOMEM;
 	iterate_and_advance(i, bytes, base, len, count,
 		clear_user(base, len),
 		memset(base, 0, len)
@@ -598,6 +653,7 @@  static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
 	size += i->iov_offset;
 
 	for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
+		BUG_ON(!bvec->bv_page);
 		if (likely(size < bvec->bv_len))
 			break;
 		size -= bvec->bv_len;
@@ -740,6 +796,51 @@  void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_bvec);
 
+/**
+ * iov_iter_bvec_autoalloc - Initialise a BVEC-type I/O iterator with automatic alloc
+ * @i: The iterator to initialise.
+ * @direction: The direction of the transfer.
+ * @bvec: The array of bio_vecs listing the buffer segments
+ * @nr_segs: The number of segments in @bvec[].
+ * @count: The size of the I/O buffer in bytes.
+ *
+ * Set up an I/O iterator to insert pages into a bvec as data is written into
+ * it where NULL pointers exist in the bvec array (if a pointer isn't NULL, the
+ * page it points to will just be used).  No more than @nr_segs pages will be
+ * filled in.  Empty slots will have bv_offset set to 0 and bv_len to
+ * PAGE_SIZE.
+ *
+ * If the iterator is reverted, excess pages will be left for the
+ * caller to clean up.
+ */
+void iov_iter_bvec_autoalloc(struct iov_iter *i, unsigned int direction,
+			     const struct bio_vec *bvec, unsigned long nr_segs,
+			     size_t count)
+{
+	struct bio_vec *bv = (struct bio_vec *)bvec;
+	unsigned long j;
+
+	BUG_ON(direction != READ);
+	*i = (struct iov_iter){
+		.iter_type = ITER_BVEC,
+		.copy_mc = false,
+		.data_source = direction,
+		.auto_alloc = true,
+		.bvec = bvec,
+		.nr_segs = nr_segs,
+		.iov_offset = 0,
+		.count = count
+	};
+
+	for (j = 0; j < nr_segs; j++) {
+		if (!bv[j].bv_page) {
+			bv[j].bv_offset = 0;
+			bv[j].bv_len = PAGE_SIZE;
+		}
+	}
+}
+EXPORT_SYMBOL(iov_iter_bvec_autoalloc);
+
 /**
  * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
  * @i: The iterator to initialise.
@@ -1122,6 +1223,8 @@  static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 		struct page **p;
 		struct page *page;
 
+		if (!iov_iter_auto_alloc(i, maxsize))
+			return -ENOMEM;
 		page = first_bvec_segment(i, &maxsize, start);
 		n = want_pages_array(pages, maxsize, *start, maxpages);
 		if (!n)
@@ -1226,6 +1329,8 @@  size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 		csstate->off += bytes;
 		return bytes;
 	}
+	if (!iov_iter_auto_alloc(i, bytes))
+		return -ENOMEM;
 
 	sum = csum_shift(csstate->csum, csstate->off);
 	iterate_and_advance(i, bytes, base, len, off, ({
@@ -1664,6 +1769,9 @@  static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
 	size_t skip = i->iov_offset, offset;
 	int k;
 
+	if (!iov_iter_auto_alloc(i, maxsize))
+		return -ENOMEM;
+
 	for (;;) {
 		if (i->nr_segs == 0)
 			return 0;