diff mbox series

io_uring/rsrc: Add support for multi-folio buffer coalescing

Message ID 20240506075303.25630-1-cliang01.li@samsung.com (mailing list archive)
State New
Headers show
Series io_uring/rsrc: Add support for multi-folio buffer coalescing | expand

Commit Message

Chenliang Li May 6, 2024, 7:53 a.m. UTC
Currently, fixed buffers consisting of pages within a single folio (huge page)
can be coalesced into a single bvec entry at registration.
This patch extends that to support coalescing fixed buffers
that span multiple folios, by:
1. Adding a helper function and a helper struct to do the coalescing work
at buffer registration;
2. Adding the bvec setup procedure for the coalesced path;
3. Storing page_mask and page_shift in io_mapped_ubuf for
later use in io_import_fixed.

Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
---
 io_uring/rsrc.c | 156 +++++++++++++++++++++++++++++++++++-------------
 io_uring/rsrc.h |   9 +++
 2 files changed, 124 insertions(+), 41 deletions(-)

Comments

Jens Axboe May 6, 2024, 12:57 p.m. UTC | #1
On 5/6/24 1:53 AM, Chenliang Li wrote:
> Currently fixed buffers consisting of pages in one same folio(huge page)
> can be coalesced into a single bvec entry at registration.
> This patch expands it to support coalescing fixed buffers
> with multiple folios, by:
> 1. Add a helper function and a helper struct to do the coalescing work
> at buffer registration;
> 2. Add the bvec setup procedure of the coalsced path;

coalesced

> 3. store page_mask and page_shift into io_mapped_ubuf for
> later use in io_import_fixed.

Can you add some justification to this commit message? A good commit
message should basically explain WHY this commit exists in the
first place. Your commit message just explains what the patch does,
which I can just read the code to see for myself.

As it stands, it's not clear to me or anyone casually reading this
commit message why the change is being done in the first place.

Outside of that, you probably want to split this into two parts - one
that adds the helper for the existing code, then one that modifies it
for your change. We need this to be as simple as possible to review, as
we've had a security issue with page coalescing in this code in the
past.

Minor comments below, will wait with a full review until this is split
to be more easily reviewable.

> +/*
> + * For coalesce to work, a buffer must be one or multiple
> + * folios, all the folios except the first and last one
> + * should be of the same size.
> + */
> +static bool io_sqe_buffer_try_coalesce(struct page **pages,
> +				       unsigned int nr_pages,
> +				       struct io_imu_folio_stats *stats)
> +{
> +	struct folio	*folio = NULL, *first_folio = NULL;
> +	unsigned int	page_cnt;
> +	int		i, j;

Please don't make up your own style, follow the style that's already in
the file to begin with.

> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index c032ca3436ca..4c655e446150 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -47,9 +47,18 @@ struct io_mapped_ubuf {
>  	u64		ubuf_end;
>  	unsigned int	nr_bvecs;
>  	unsigned long	acct_pages;
> +	unsigned int	page_shift;
> +	unsigned long	page_mask;
>  	struct bio_vec	bvec[] __counted_by(nr_bvecs);
>  };

When adding members to a struct, please be cognizant of how it packs.
I'd suggest making the above:

  	u64		ubuf_end;
  	unsigned int	nr_bvecs;
	unsigned int	page_shift;
	unsigned long	page_mask;
  	unsigned long	acct_pages;
	struct bio_vec	bvec[] __counted_by(nr_bvecs);

which should pack much nicer and actually save memory.
Chenliang Li May 7, 2024, 5:22 a.m. UTC | #2
On 5/6/24 6:57 AM, Jens Axboe wrote:
> Can you add some justification to this commit message? A good commit
> message should basically be the WHY of why this commit exists in the
> first place. Your commit message just explains what the patch does,
> which I can just read the code to see for myself.
> 
> As it stands, it's not clear to me or anyone casually reading this
> commit message why the change is being done in the first place.

Thank you for the guidance. I'll submit a V2 patchset with a better
commit message.

> Outside of that, you probably want to split this into two parts - one
> that adds the helper for the existing code, then one that modifies it
> for your change. We need this to be as simple as possible to review, as
> we've had a security issue with page coalescing in this code in the
> past.

Will split this in V2.

> Minor comments below, will wait with a full review until this is split
> to be more easily reviewable.

Thank you for the comments. Will address them in V2.
diff mbox series

Patch

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 65417c9553b1..f9e11131c9a5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -871,6 +871,80 @@  static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+/*
+ * For coalesce to work, a buffer must be one or multiple
+ * folios, all the folios except the first and last one
+ * should be of the same size.
+ */
+static bool io_sqe_buffer_try_coalesce(struct page **pages,
+				       unsigned int nr_pages,
+				       struct io_imu_folio_stats *stats)
+{
+	struct folio	*folio = NULL, *first_folio = NULL;
+	unsigned int	page_cnt;
+	int		i, j;
+
+	if (nr_pages <= 1)
+		return false;
+
+	first_folio = page_folio(pages[0]);
+	stats->full_folio_pcnt = folio_nr_pages(first_folio);
+	if (stats->full_folio_pcnt == 1)
+		return false;
+
+	stats->folio_shift = folio_shift(first_folio);
+
+	folio = first_folio;
+	page_cnt = 1;
+	stats->nr_folios = 1;
+	/*
+	 * Check:
+	 * 1. Pages must be contiguous;
+	 * 2. All folios should have the same page count
+	 *    except the first and last one
+	 */
+	for (i = 1; i < nr_pages; i++) {
+		if (page_folio(pages[i]) != folio ||
+		   pages[i] != pages[i-1] + 1) {
+			if (folio == first_folio)
+				stats->first_folio_pcnt = page_cnt;
+			else if (page_cnt != stats->full_folio_pcnt)
+				return false;
+			folio = page_folio(pages[i]);
+			page_cnt = 1;
+			stats->nr_folios++;
+			continue;
+		}
+		page_cnt++;
+	}
+	if (folio == first_folio)
+		stats->first_folio_pcnt = page_cnt;
+
+	if (stats->first_folio_pcnt > 1)
+		/*
+		 * The pages are bound to the folio, it doesn't
+		 * actually unpin them but drops all but one reference,
+		 * which is usually put down by io_buffer_unmap().
+		 * Note, needs a better helper.
+		 */
+		unpin_user_pages(&pages[1], stats->first_folio_pcnt - 1);
+	j = stats->first_folio_pcnt;
+	nr_pages -= stats->first_folio_pcnt;
+	for (i = 1; i < stats->nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		nr_unpin = min_t(unsigned int, nr_pages - 1,
+				stats->full_folio_pcnt - 1);
+		if (nr_unpin <= 1)
+			continue;
+		unpin_user_pages(&pages[j+1], nr_unpin);
+		j += stats->full_folio_pcnt;
+		nr_pages -= stats->full_folio_pcnt;
+	}
+
+	return true;
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
@@ -879,8 +953,9 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	struct page **pages = NULL;
 	unsigned long off;
 	size_t size;
-	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	int ret, nr_pages, nr_bvecs, i, j;
+	bool coalesced;
+	struct io_imu_folio_stats stats;
 
 	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
 	if (!iov->iov_base)
@@ -895,39 +970,26 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
-
-	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+	/* If it's multiple huge pages, try to coalesce them into fewer bvec entries */
+	coalesced = io_sqe_buffer_try_coalesce(pages, nr_pages, &stats);
+	nr_bvecs = nr_pages;
+	if (coalesced)
+		nr_bvecs = stats.nr_folios;
+	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
 	if (!imu)
 		goto done;
 
 	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
 	if (ret) {
-		unpin_user_pages(pages, nr_pages);
+		if (coalesced) {
+			unpin_user_page(pages[0]);
+			j = stats.first_folio_pcnt;
+			for (i = 1; i < stats.nr_folios; i++) {
+				unpin_user_page(pages[j]);
+				j += stats.full_folio_pcnt;
+			}
+		} else
+			unpin_user_pages(pages, nr_pages);
 		goto done;
 	}
 
@@ -936,12 +998,29 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
-	imu->nr_bvecs = nr_pages;
+	imu->nr_bvecs = nr_bvecs;
+	imu->page_shift = PAGE_SHIFT;
+	imu->page_mask = PAGE_MASK;
+	if (coalesced) {
+		imu->page_shift = stats.folio_shift;
+		imu->page_mask = ~((1UL << stats.folio_shift) - 1);
+	}
 	*pimu = imu;
 	ret = 0;
 
-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
+	if (coalesced) {
+		size_t vec_len;
+
+		vec_len = min_t(size_t, size, PAGE_SIZE * stats.first_folio_pcnt - off);
+		bvec_set_page(&imu->bvec[0], pages[0], vec_len, off);
+		size -= vec_len;
+		j = stats.first_folio_pcnt;
+		for (i = 1; i < nr_bvecs; i++) {
+			vec_len = min_t(size_t, size, PAGE_SIZE * stats.full_folio_pcnt);
+			bvec_set_page(&imu->bvec[i], pages[j], vec_len, 0);
+			size -= vec_len;
+			j += stats.full_folio_pcnt;
+		}
 		goto done;
 	}
 	for (i = 0; i < nr_pages; i++) {
@@ -1049,7 +1128,7 @@  int io_import_fixed(int ddir, struct iov_iter *iter,
 		 * we know that:
 		 *
 		 * 1) it's a BVEC iter, we set it up
-		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 * 2) all bvecs are the same in size, except potentially the
 		 *    first and last bvec
 		 *
 		 * So just find our index, and adjust the iterator afterwards.
@@ -1061,11 +1140,6 @@  int io_import_fixed(int ddir, struct iov_iter *iter,
 		const struct bio_vec *bvec = imu->bvec;
 
 		if (offset < bvec->bv_len) {
-			/*
-			 * Note, huge pages buffers consists of one large
-			 * bvec entry and should always go this way. The other
-			 * branch doesn't expect non PAGE_SIZE'd chunks.
-			 */
 			iter->bvec = bvec;
 			iter->nr_segs = bvec->bv_len;
 			iter->count -= offset;
@@ -1075,12 +1149,12 @@  int io_import_fixed(int ddir, struct iov_iter *iter,
 
 			/* skip first vec */
 			offset -= bvec->bv_len;
-			seg_skip = 1 + (offset >> PAGE_SHIFT);
+			seg_skip = 1 + (offset >> imu->page_shift);
 
 			iter->bvec = bvec + seg_skip;
 			iter->nr_segs -= seg_skip;
 			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ~PAGE_MASK;
+			iter->iov_offset = offset & ~(imu->page_mask);
 		}
 	}
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c032ca3436ca..4c655e446150 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -47,9 +47,18 @@  struct io_mapped_ubuf {
 	u64		ubuf_end;
 	unsigned int	nr_bvecs;
 	unsigned long	acct_pages;
+	unsigned int	page_shift;
+	unsigned long	page_mask;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
+struct io_imu_folio_stats {
+	unsigned int	first_folio_pcnt;
+	unsigned int	full_folio_pcnt;
+	unsigned int	nr_folios;
+	unsigned int	folio_shift;
+};
+
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);