
[v6,2/2] io_uring/rsrc: enable multi-hugepage buffer coalescing

Message ID 20240716060807.2707-3-cliang01.li@samsung.com (mailing list archive)
State New
Series io_uring/rsrc: coalescing multi-hugepage registered buffers

Commit Message

Chenliang Li July 16, 2024, 6:08 a.m. UTC
Add support for checking and coalescing multi-hugepage-backed fixed
buffers. The coalescing optimizes both time and space consumption caused
by mapping and storing multi-hugepage fixed buffers.

A coalescable multi-hugepage buffer should fully cover its folios
(except potentially the first and last one), and those folios should
all be the same size. These requirements make later processing easier
and give io_import_fixed the same-sized chunks it needs for fast
iov_iter adjustment.

Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
---
 io_uring/rsrc.c | 139 +++++++++++++++++++++++++++++++++++++-----------
 io_uring/rsrc.h |   8 +++
 2 files changed, 115 insertions(+), 32 deletions(-)
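
To see why same-sized chunks matter in io_import_fixed: once every
bvec entry covers (1 << folio_shift) bytes (except possibly the first
and last), seeking into the buffer becomes a shift rather than a
page-by-page walk over the bvec array. A minimal userspace sketch of
that arithmetic (illustrative only, not code from this patch):

#include <stdio.h>

/*
 * Sketch of the shift-based seek that uniform segments enable:
 * the bvec segment holding a given buffer offset is one shift away.
 */
int main(void)
{
	unsigned int folio_shift = 21;		/* 2MB hugepage chunks */
	unsigned long offset = 5UL << 20;	/* seek 5MB into the buffer */

	unsigned long seg = offset >> folio_shift;		    /* segment index: 2 */
	unsigned long in_seg = offset & ((1UL << folio_shift) - 1); /* 1MB left over */

	printf("segment %lu, offset within segment %lu\n", seg, in_seg);
	return 0;
}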

Comments

Pavel Begunkov July 22, 2024, 1:12 p.m. UTC | #1
On 7/16/24 07:08, Chenliang Li wrote:
> Add support for checking and coalescing multi-hugepage-backed fixed
> buffers. The coalescing optimizes both time and space consumption caused
> by mapping and storing multi-hugepage fixed buffers.
> 
> A coalescable multi-hugepage buffer should fully cover its folios
> (except potentially the first and last one), and those folios should
> all be the same size. These requirements make later processing easier
> and give io_import_fixed the same-sized chunks it needs for fast
> iov_iter adjustment.
> 
> Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
> ---

The series looks good apart from the comment below. I'll run some
tests this week; I think you sent something for liburing/test.

> +static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
> +					 struct io_imu_folio_data *data)
> +{
> +	struct page **page_array = *pages;
> +	struct folio *folio = page_folio(page_array[0]);
> +	unsigned int count = 1, nr_folios = 1;
> +	int i;
> +
> +	if (*nr_pages <= 1)
> +		return false;
> +
> +	data->nr_pages_mid = folio_nr_pages(folio);
> +	if (data->nr_pages_mid == 1)
> +		return false;
> +
> +	data->folio_shift = folio_shift(folio);
> +	/*
> +	 * Check if pages are contiguous inside a folio, and all folios have
> +	 * the same page count except for the head and tail.
> +	 */
> +	for (i = 1; i < *nr_pages; i++) {
> +		if (page_folio(page_array[i]) == folio &&
> +			page_array[i] == page_array[i-1] + 1) {
> +			count++;
> +			continue;
> +		}
> +
> +		if (nr_folios == 1) {
> +			if (folio_page_idx(folio, page_array[i-1])
> +				!= data->nr_pages_mid - 1)

Code style: the comparison (i.e. "!=") should go at the end of the
previous line.

> +				return false;
> +
> +			data->nr_pages_head = count;
> +		} else if (count != data->nr_pages_mid) {
> +			return false;
> +		}
> +
> +		folio = page_folio(page_array[i]);
> +		if (folio_size(folio) != (1UL << data->folio_shift) ||
> +			folio_page_idx(folio, page_array[i]) != 0)
> +			return false;
> +
> +		count = 1;
> +		nr_folios++;
> +	}
> +	if (nr_folios == 1) {
> +		if (folio_page_idx(folio, page_array[i-1])
> +			!= data->nr_pages_mid - 1)

That's too restrictive and would be a regression. We currently allow
registrations fully falling under a single huge page but not matching
the right side. E.g. for a [N; M) huge page, you're allowed to register
[N + 4KB, M - 4KB). We should just remove this check.
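
For concreteness, a registration that falls fully inside one 2MB huge
page but stops short of its right edge, which the check above would
wrongly reject (an illustrative liburing snippet, not from the thread):

#include <sys/mman.h>
#include <liburing.h>

/* Register [N + 4KB, M - 4KB) out of a [N; M) 2MB huge page.
 * This worked before the series and must keep working. */
int main(void)
{
	struct io_uring ring;
	size_t huge = 2UL << 20;
	struct iovec iov;
	void *p;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;
	p = mmap(NULL, huge, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	iov.iov_base = (char *)p + 4096;	/* N + 4KB */
	iov.iov_len  = huge - 2 * 4096;		/* up to M - 4KB */
	return io_uring_register_buffers(&ring, &iov, 1) ? 1 : 0;
}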

And same comment about code style.


> +			return false;
> +
> +		data->nr_pages_head = count;
> +	}
> +
> +	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
> +}
> +

Patch

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 0d6cda92ba46..bba7d2913aa9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -849,6 +849,103 @@  static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data, int nr_folios)
+{
+	struct page **page_array = *pages, **new_array = NULL;
+	int nr_pages_left = *nr_pages, i, j;
+
+	/* Store head pages only */
+	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+					GFP_KERNEL);
+	if (!new_array)
+		return false;
+
+	new_array[0] = compound_head(page_array[0]);
+	/*
+	 * The pages are bound to the folio, it doesn't
+	 * actually unpin them but drops all but one reference,
+	 * which is usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	if (data->nr_pages_head > 1)
+		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+	j = data->nr_pages_head;
+	nr_pages_left -= data->nr_pages_head;
+	for (i = 1; i < nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		new_array[i] = page_array[j];
+		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+					data->nr_pages_mid - 1);
+		if (nr_unpin)
+			unpin_user_pages(&page_array[j+1], nr_unpin);
+		j += data->nr_pages_mid;
+		nr_pages_left -= data->nr_pages_mid;
+	}
+	kvfree(page_array);
+	*pages = new_array;
+	*nr_pages = nr_folios;
+	return true;
+}
+
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+					 struct io_imu_folio_data *data)
+{
+	struct page **page_array = *pages;
+	struct folio *folio = page_folio(page_array[0]);
+	unsigned int count = 1, nr_folios = 1;
+	int i;
+
+	if (*nr_pages <= 1)
+		return false;
+
+	data->nr_pages_mid = folio_nr_pages(folio);
+	if (data->nr_pages_mid == 1)
+		return false;
+
+	data->folio_shift = folio_shift(folio);
+	/*
+	 * Check if pages are contiguous inside a folio, and all folios have
+	 * the same page count except for the head and tail.
+	 */
+	for (i = 1; i < *nr_pages; i++) {
+		if (page_folio(page_array[i]) == folio &&
+			page_array[i] == page_array[i-1] + 1) {
+			count++;
+			continue;
+		}
+
+		if (nr_folios == 1) {
+			if (folio_page_idx(folio, page_array[i-1])
+				!= data->nr_pages_mid - 1)
+				return false;
+
+			data->nr_pages_head = count;
+		} else if (count != data->nr_pages_mid) {
+			return false;
+		}
+
+		folio = page_folio(page_array[i]);
+		if (folio_size(folio) != (1UL << data->folio_shift) ||
+			folio_page_idx(folio, page_array[i]) != 0)
+			return false;
+
+		count = 1;
+		nr_folios++;
+	}
+	if (nr_folios == 1) {
+		if (folio_page_idx(folio, page_array[i-1])
+			!= data->nr_pages_mid - 1)
+			return false;
+
+		data->nr_pages_head = count;
+	}
+
+	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
@@ -858,7 +955,8 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	struct io_imu_folio_data data;
+	bool coalesced;
 
 	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
 	if (!iov->iov_base)
@@ -873,31 +971,8 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
+	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
+	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
 
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
@@ -909,7 +984,6 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
@@ -917,17 +991,18 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
 	imu->folio_mask = PAGE_MASK;
+	if (coalesced) {
+		imu->folio_shift = data.folio_shift;
+		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+	}
+	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
 	*pimu = imu;
 	ret = 0;
 
-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
-		goto done;
-	}
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;
 
-		vec_len = min_t(size_t, size, PAGE_SIZE - off);
+		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
 		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
 		off = 0;
 		size -= vec_len;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 8b029c53d325..048fd6959f7a 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -52,6 +52,14 @@  struct io_mapped_ubuf {
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
+struct io_imu_folio_data {
+	/* Head folio can be partially included in the fixed buf */
+	unsigned int	nr_pages_head;
+	/* For non-head/tail folios, has to be fully included */
+	unsigned int	nr_pages_mid;
+	unsigned int	folio_shift;
+};
+
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
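
To make the new bookkeeping concrete, consider a hypothetical buffer
that starts 1MB into a 2MB folio and spans three folios in total; the
fields would come out as below (illustrative numbers, not taken from
the patch or thread):

#include <stdio.h>

/* Local mirror of struct io_imu_folio_data from io_uring/rsrc.h,
 * filled in for the hypothetical layout described above. */
struct io_imu_folio_data {
	unsigned int nr_pages_head;	/* 4KB pages from the partial head folio */
	unsigned int nr_pages_mid;	/* 4KB pages per fully covered folio */
	unsigned int folio_shift;	/* log2 of the (uniform) folio size */
};

int main(void)
{
	struct io_imu_folio_data d = {
		.nr_pages_head = (1U << 20) >> 12,	/* 256: last 1MB of the head folio */
		.nr_pages_mid  = (2U << 20) >> 12,	/* 512: one full 2MB folio */
		.folio_shift   = 21,			/* 2MB folios */
	};

	printf("head=%u mid=%u shift=%u\n",
	       d.nr_pages_head, d.nr_pages_mid, d.folio_shift);
	return 0;
}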