
[v4,3/4] io_uring/rsrc: add init and account functions for coalesced imus

Message ID 20240514075444.590910-4-cliang01.li@samsung.com (mailing list archive)
State New
Series io_uring/rsrc: coalescing multi-hugepage registered buffers

Commit Message

Chenliang Li May 14, 2024, 7:54 a.m. UTC
Introduce two functions to separate the coalesced imu alloc and
accounting path from the original one. This helps to keep the original
code path clean.

Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
---
 io_uring/rsrc.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

Comments

Pavel Begunkov June 16, 2024, 5:43 p.m. UTC | #1
On 5/14/24 08:54, Chenliang Li wrote:
> Introduce two functions to separate the coalesced imu alloc and
> accounting path from the original one. This helps to keep the original
> code path clean.
> 
> Signed-off-by: Chenliang Li <cliang01.li@samsung.com>
> ---
>   io_uring/rsrc.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 89 insertions(+)
> 
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 578d382ca9bc..53fac5f27bbf 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -871,6 +871,45 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>   	return ret;
>   }
>   
> +static int io_coalesced_buffer_account_pin(struct io_ring_ctx *ctx,
> +					   struct page **pages,
> +					   struct io_mapped_ubuf *imu,
> +					   struct page **last_hpage,
> +					   struct io_imu_folio_data *data)
> +{
> +	int i, j, ret;
> +
> +	imu->acct_pages = 0;
> +	j = 0;
> +	for (i = 0; i < data->nr_folios; i++) {
> +		struct page *hpage = pages[j];
> +
> +		if (hpage == *last_hpage)
> +			continue;
> +		*last_hpage = hpage;
> +		/*
> +		 * Already checked the page array in try coalesce,
> +		 * so pass in nr_pages=0 here to waive that.
> +		 */
> +		if (headpage_already_acct(ctx, pages, 0, hpage))
> +			continue;
> +		imu->acct_pages += data->nr_pages_mid;
> +		if (i)
> +			j += data->nr_pages_mid;
> +		else
> +			j = data->nr_pages_head;

You should account an entire folio here, i.e. ->nr_pages_mid
in either case. Let's say the first page in the registration
is the last page of a huge page, you'd account 4K while it
actually pins the entire huge page size.
It seems like you can just call io_buffer_account_pin()
instead.

On that note, you shouldn't duplicate code in either case,
just treat the normal discontig pages case as folios of
shift=PAGE_SHIFT.

Either just plain reuse or adjust io_buffer_account_pin()
instead of io_coalesced_buffer_account_pin().
io_coalesced_imu_alloc() should also go away.

io_sqe_buffer_register() {
	struct io_imu_folio_data data;

	if (!io_sqe_buffer_try_coalesce(pages, folio_data)) {
		folio_data.shift = PAGE_SHIFT;
		...
	}
	
	io_buffer_account_pin(pages, &data);
	imu->data = uaddr;
	...
}
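
As a rough, self-contained illustration of this shape (userspace C with invented struct and function names, not the kernel code): once a non-coalescable registration is described as per-page folios of shift = PAGE_SHIFT, a single accounting rule covers both the huge-page and the plain-page case.

/*
 * Sketch only: models the "treat plain pages as folios of
 * shift = PAGE_SHIFT" idea.  Names and values are made up.
 */
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12U

struct folio_data {
	unsigned int nr_folios;      /* folios backing the registration */
	unsigned int nr_pages_head;  /* pages of folio 0 inside the buffer */
	unsigned int nr_pages_mid;   /* pages per full folio */
	unsigned int folio_shift;    /* log2(folio size) */
};

/* Fallback when coalescing fails: every page is its own folio. */
static void folio_data_from_plain_pages(struct folio_data *d,
					unsigned int nr_pages)
{
	d->nr_folios = nr_pages;
	d->nr_pages_head = 1;
	d->nr_pages_mid = 1;
	d->folio_shift = SKETCH_PAGE_SHIFT;
}

/*
 * One accounting rule for both cases: every folio pins a full folio's
 * worth of pages.  (The real code additionally skips folios whose head
 * page was already accounted by an earlier registration.)
 */
static unsigned long account_pages(const struct folio_data *d)
{
	return (unsigned long)d->nr_folios * d->nr_pages_mid;
}

int main(void)
{
	struct folio_data huge = {
		.nr_folios = 3, .nr_pages_head = 200,
		.nr_pages_mid = 512, .folio_shift = 21, /* 2MB folios */
	};
	struct folio_data plain;

	folio_data_from_plain_pages(&plain, 8); /* 8 discontiguous 4K pages */

	printf("coalesced: %lu pages accounted\n", account_pages(&huge));
	printf("plain:     %lu pages accounted\n", account_pages(&plain));
	return 0;
}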

> +	}
> +
> +	if (!imu->acct_pages)
> +		return 0;
> +
> +	ret = io_account_mem(ctx, imu->acct_pages);
> +	if (!ret)
> +		return 0;
> +	imu->acct_pages = 0;
> +	return ret;
> +}
> +
>   static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>   					 struct io_imu_folio_data *data)
>   {
> @@ -949,6 +988,56 @@ static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
>   	return true;
>   }
>   
> +static int io_coalesced_imu_alloc(struct io_ring_ctx *ctx, struct iovec *iov,
> +				  struct io_mapped_ubuf **pimu,
> +				  struct page **last_hpage, struct page **pages,
> +				  struct io_imu_folio_data *data)
> +{
> +	struct io_mapped_ubuf *imu = NULL;
> +	unsigned long off;
> +	size_t size, vec_len;
> +	int ret, i, j;
> +
> +	ret = -ENOMEM;
> +	imu = kvmalloc(struct_size(imu, bvec, data->nr_folios), GFP_KERNEL);
> +	if (!imu)
> +		return ret;
> +
> +	ret = io_coalesced_buffer_account_pin(ctx, pages, imu, last_hpage,
> +						data);
> +	if (ret) {
> +		unpin_user_page(pages[0]);
> +		j = data->nr_pages_head;
> +		for (i = 1; i < data->nr_folios; i++) {
> +			unpin_user_page(pages[j]);
> +			j += data->nr_pages_mid;
> +		}
> +		return ret;
> +	}
> +	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
> +	size = iov->iov_len;
> +	/* store original address for later verification */
> +	imu->ubuf = (unsigned long) iov->iov_base;
> +	imu->ubuf_end = imu->ubuf + iov->iov_len;
> +	imu->nr_bvecs = data->nr_folios;
> +	imu->folio_shift = data->folio_shift;
> +	imu->folio_mask = ~((1UL << data->folio_shift) - 1);
> +	*pimu = imu;
> +	ret = 0;
> +
> +	vec_len = min_t(size_t, size, PAGE_SIZE * data->nr_pages_head - off);
> +	bvec_set_page(&imu->bvec[0], pages[0], vec_len, off);
> +	size -= vec_len;
> +	j = data->nr_pages_head;
> +	for (i = 1; i < data->nr_folios; i++) {
> +		vec_len = min_t(size_t, size, data->folio_size);
> +		bvec_set_page(&imu->bvec[i], pages[j], vec_len, 0);
> +		size -= vec_len;
> +		j += data->nr_pages_mid;
> +	}
> +	return ret;
> +}
> +
>   static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
>   				  struct io_mapped_ubuf **pimu,
>   				  struct page **last_hpage)
Chenliang Li June 17, 2024, 3:16 a.m. UTC | #2
Actually, it does account an entire folio here. j is just
the array index.
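
A small standalone sketch of that index walk (made-up values, not kernel code): j only picks one representative page per folio out of pages[], while each folio still accounts for a full nr_pages_mid pages, regardless of how much of the first folio lies inside the buffer.

#include <stdio.h>

int main(void)
{
	unsigned int nr_folios = 4;
	unsigned int nr_pages_head = 200; /* pages of folio 0 inside the buffer */
	unsigned int nr_pages_mid = 512;  /* pages per full 2MB folio */
	unsigned int j = 0;               /* index into the pinned pages[] array */

	for (unsigned int i = 0; i < nr_folios; i++) {
		printf("folio %u: check pages[%u], account %u pages\n",
		       i, j, nr_pages_mid);
		/* step to the first page of the next folio */
		j += i ? nr_pages_mid : nr_pages_head;
	}
	return 0;
}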

> It seems like you can just call io_buffer_account_pin()
> instead.
>
> On that note, you shouldn't duplicate code in either case,
> just treat the normal discontig pages case as folios of
> shift=PAGE_SHIFT.
>
> Either just plain reuse or adjust io_buffer_account_pin()
> instead of io_coalesced_buffer_account_pin().
> io_coalesced_imu_alloc() should also go away.
>
> io_sqe_buffer_register() {
> 	struct io_imu_folio_data data;
>
>	if (!io_sqe_buffer_try_coalesce(pages, folio_data)) {
>		folio_data.shift = PAGE_SHIFT;
>		...
>	}
>	
>	io_buffer_account_pin(pages, &data);
>	imu->data = uaddr;
>	...
> }

Will remove them.

Thanks,
Chenliang Li
Pavel Begunkov June 17, 2024, 12:22 p.m. UTC | #3
On 6/17/24 04:16, Chenliang Li wrote:
> Actually, it does account an entire folio here. j is just
> the array index.

Hmm, indeed. Is iteration the only difference here?

>> It seems like you can just call io_buffer_account_pin()
>> instead.
>>
>> On that note, you shouldn't duplicate code in either case,
>> just treat the normal discontig pages case as folios of
>> shift=PAGE_SHIFT.
>>
>> Either just plain reuse or adjust io_buffer_account_pin()
>> instead of io_coalesced_buffer_account_pin().
>> io_coalesced_imu_alloc() should also go away.
>>
>> io_sqe_buffer_register() {
>> 	struct io_imu_folio_data data;
>>
>> 	if (!io_sqe_buffer_try_coalesce(pages, folio_data)) {
>> 		folio_data.shift = PAGE_SHIFT;
>> 		...
>> 	}
>> 	
>> 	io_buffer_account_pin(pages, &data);
>> 	imu->data = uaddr;
>> 	...
>> }
> 
> Will remove them.
Chenliang Li June 18, 2024, 3:24 a.m. UTC | #4
On Mon, 17 Jun 2024 13:22:43 +0100, Pavel Begunkov wrote:
> On 6/17/24 04:16, Chenliang Li wrote:
>> Actually, it does account an entire folio here. j is just
>> the array index.
>
> Hmm, indeed. Is iteration the only difference here?

Yes, but I'll try to make this less confusing.

Thanks,
Chenliang Li

Patch

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 578d382ca9bc..53fac5f27bbf 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -871,6 +871,45 @@  static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+static int io_coalesced_buffer_account_pin(struct io_ring_ctx *ctx,
+					   struct page **pages,
+					   struct io_mapped_ubuf *imu,
+					   struct page **last_hpage,
+					   struct io_imu_folio_data *data)
+{
+	int i, j, ret;
+
+	imu->acct_pages = 0;
+	j = 0;
+	for (i = 0; i < data->nr_folios; i++) {
+		struct page *hpage = pages[j];
+
+		if (hpage == *last_hpage)
+			continue;
+		*last_hpage = hpage;
+		/*
+		 * Already checked the page array in try coalesce,
+		 * so pass in nr_pages=0 here to waive that.
+		 */
+		if (headpage_already_acct(ctx, pages, 0, hpage))
+			continue;
+		imu->acct_pages += data->nr_pages_mid;
+		if (i)
+			j += data->nr_pages_mid;
+		else
+			j = data->nr_pages_head;
+	}
+
+	if (!imu->acct_pages)
+		return 0;
+
+	ret = io_account_mem(ctx, imu->acct_pages);
+	if (!ret)
+		return 0;
+	imu->acct_pages = 0;
+	return ret;
+}
+
 static bool __io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
 					 struct io_imu_folio_data *data)
 {
@@ -949,6 +988,56 @@  static bool io_sqe_buffer_try_coalesce(struct page **pages, int nr_pages,
 	return true;
 }
 
+static int io_coalesced_imu_alloc(struct io_ring_ctx *ctx, struct iovec *iov,
+				  struct io_mapped_ubuf **pimu,
+				  struct page **last_hpage, struct page **pages,
+				  struct io_imu_folio_data *data)
+{
+	struct io_mapped_ubuf *imu = NULL;
+	unsigned long off;
+	size_t size, vec_len;
+	int ret, i, j;
+
+	ret = -ENOMEM;
+	imu = kvmalloc(struct_size(imu, bvec, data->nr_folios), GFP_KERNEL);
+	if (!imu)
+		return ret;
+
+	ret = io_coalesced_buffer_account_pin(ctx, pages, imu, last_hpage,
+						data);
+	if (ret) {
+		unpin_user_page(pages[0]);
+		j = data->nr_pages_head;
+		for (i = 1; i < data->nr_folios; i++) {
+			unpin_user_page(pages[j]);
+			j += data->nr_pages_mid;
+		}
+		return ret;
+	}
+	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
+	size = iov->iov_len;
+	/* store original address for later verification */
+	imu->ubuf = (unsigned long) iov->iov_base;
+	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->nr_bvecs = data->nr_folios;
+	imu->folio_shift = data->folio_shift;
+	imu->folio_mask = ~((1UL << data->folio_shift) - 1);
+	*pimu = imu;
+	ret = 0;
+
+	vec_len = min_t(size_t, size, PAGE_SIZE * data->nr_pages_head - off);
+	bvec_set_page(&imu->bvec[0], pages[0], vec_len, off);
+	size -= vec_len;
+	j = data->nr_pages_head;
+	for (i = 1; i < data->nr_folios; i++) {
+		vec_len = min_t(size_t, size, data->folio_size);
+		bvec_set_page(&imu->bvec[i], pages[j], vec_len, 0);
+		size -= vec_len;
+		j += data->nr_pages_mid;
+	}
+	return ret;
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)