
[02/11] io_uring: get rid of remap_pfn_range() for mapping rings/sqes

Message ID 20240328233443.797828-3-axboe@kernel.dk (mailing list archive)
State New
Series Move away from remap_pfn_range()

Commit Message

Jens Axboe March 28, 2024, 11:31 p.m. UTC
Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.

If possible, allocate a single compound page that covers the range that
is needed. If that works, then we can just use page_address() on that
page. If we fail to get a compound page, allocate single pages and use
vmap() to map them into the kernel virtual address space.

This just covers the rings/sqes; the other remaining user of
remap_pfn_range() in the mmap path will be converted separately. Once
that is done, we can kill the old alloc/free code.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 136 +++++++++++++++++++++++++++++++++++++++++---
 io_uring/io_uring.h |   2 +
 2 files changed, 130 insertions(+), 8 deletions(-)
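
For reference, the allocation strategy described above boils down to the
following condensed sketch (a simplification of the io_pages_map(),
io_mem_alloc_compound() and io_mem_alloc_single() helpers added below;
illustrative only, not the literal implementation):

static void *pages_map_sketch(struct page ***out_pages, unsigned short *npages,
			      size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
	int i, order, nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct page **pages, *page;
	void *ptr;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	/* First choice: one physically contiguous (compound) allocation. */
	order = get_order(size);
	if (order <= MAX_PAGE_ORDER) {
		page = alloc_pages(order ? gfp | __GFP_COMP : gfp, order);
		if (page) {
			for (i = 0; i < nr_pages; i++)
				pages[i] = page + i;
			/* Direct mapped, no vmap() needed. */
			ptr = page_address(page);
			goto done;
		}
	}

	/* Fallback: order-0 pages stitched together with vmap(). */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(gfp);
		if (!pages[i])
			goto err;
	}
	ptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!ptr)
		goto err;
done:
	*out_pages = pages;
	*npages = nr_pages;
	return ptr;
err:
	while (i--)
		put_page(pages[i]);
	kvfree(pages);
	return ERR_PTR(-ENOMEM);
}

Either way, the mmap side can then hand the saved pages array to
vm_insert_pages(), which is what removes the need for remap_pfn_range().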

Comments

Gabriel Krisman Bertazi March 30, 2024, 3:50 a.m. UTC | #1
Jens Axboe <axboe@kernel.dk> writes:

> Rather than use remap_pfn_range() for this and manually free later,
> switch to using vm_insert_pages() and have it Just Work.
>
> If possible, allocate a single compound page that covers the range that
> is needed. If that works, then we can just use page_address() on that
> page. If we fail to get a compound page, allocate single pages and use
> vmap() to map them into the kernel virtual address space.
>
> This just covers the rings/sqes; the other remaining user of
> remap_pfn_range() in the mmap path will be converted separately. Once
> that is done, we can kill the old alloc/free code.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  io_uring/io_uring.c | 136 +++++++++++++++++++++++++++++++++++++++++---
>  io_uring/io_uring.h |   2 +
>  2 files changed, 130 insertions(+), 8 deletions(-)
>
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 104899522bc5..982545ca23f9 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2594,6 +2594,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
>  }
>  
> +static void io_pages_unmap(void *ptr, struct page ***pages,
> +			   unsigned short *npages)
> +{
> +	bool do_vunmap = false;
> +
> +	if (*npages) {
> +		struct page **to_free = *pages;
> +		int i;
> +
> +		/*
> +		 * Only did vmap for the non-compound multiple page case.
> +		 * For the compound page, we just need to put the head.
> +		 */
> +		if (PageCompound(to_free[0]))
> +			*npages = 1;
> +		else if (*npages > 1)
> +			do_vunmap = true;
> +		for (i = 0; i < *npages; i++)
> +			put_page(to_free[i]);
> +	}

Hi Jens,

Wouldn't it be simpler to handle the compound case separately as a
folio? Then you folio_put() the compound page here and just handle the
non-contiguous case after.

Jens Axboe March 30, 2024, 3:14 p.m. UTC | #2
On 3/29/24 9:50 PM, Gabriel Krisman Bertazi wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> Rather than use remap_pfn_range() for this and manually free later,
>> switch to using vm_insert_pages() and have it Just Work.
>>
>> If possible, allocate a single compound page that covers the range that
>> is needed. If that works, then we can just use page_address() on that
>> page. If we fail to get a compound page, allocate single pages and use
>> vmap() to map them into the kernel virtual address space.
>>
>> This just covers the rings/sqes; the other remaining user of
>> remap_pfn_range() in the mmap path will be converted separately. Once
>> that is done, we can kill the old alloc/free code.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  io_uring/io_uring.c | 136 +++++++++++++++++++++++++++++++++++++++++---
>>  io_uring/io_uring.h |   2 +
>>  2 files changed, 130 insertions(+), 8 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 104899522bc5..982545ca23f9 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2594,6 +2594,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>>  	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
>>  }
>>  
>> +static void io_pages_unmap(void *ptr, struct page ***pages,
>> +			   unsigned short *npages)
>> +{
>> +	bool do_vunmap = false;
>> +
>> +	if (*npages) {
>> +		struct page **to_free = *pages;
>> +		int i;
>> +
>> +		/*
>> +		 * Only did vmap for the non-compound multiple page case.
>> +		 * For the compound page, we just need to put the head.
>> +		 */
>> +		if (PageCompound(to_free[0]))
>> +			*npages = 1;
>> +		else if (*npages > 1)
>> +			do_vunmap = true;
>> +		for (i = 0; i < *npages; i++)
>> +			put_page(to_free[i]);
>> +	}
> 
> Hi Jens,
> 
> Wouldn't it be simpler to handle the compound case separately as a
> folio? Then you folio_put() the compound page here and just handle the
> non-contiguous case after.

I don't think it makes sense, as we're still dealing with pages for
insertion. Once there's some folio variant of inserting pages, then yeah
I think it'd make sense to unify it. If not, we're doing the page <->
folio transition in one spot anyway.
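
For illustration only, the folio-based free path Gabriel has in mind
might look roughly like this (hypothetical sketch, not part of the
patch):

static void pages_unmap_folio_sketch(void *ptr, struct page ***pages,
				     unsigned short *npages)
{
	struct page **to_free = *pages;

	if (*npages) {
		if (PageCompound(to_free[0])) {
			/* Compound allocation: drop the single folio reference. */
			folio_put(page_folio(to_free[0]));
		} else {
			int i;

			/* Discontiguous pages were stitched together with vmap(). */
			if (*npages > 1)
				vunmap(ptr);
			for (i = 0; i < *npages; i++)
				put_page(to_free[i]);
		}
	}
	kvfree(*pages);
	*pages = NULL;
	*npages = 0;
}

The behaviour is the same as io_pages_unmap() below; the difference is
purely whether the compound head is released via put_page() or via the
folio API, which is the trade-off being discussed above.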

Patch

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 104899522bc5..982545ca23f9 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2594,6 +2594,33 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
+static void io_pages_unmap(void *ptr, struct page ***pages,
+			   unsigned short *npages)
+{
+	bool do_vunmap = false;
+
+	if (*npages) {
+		struct page **to_free = *pages;
+		int i;
+
+		/*
+		 * Only did vmap for the non-compound multiple page case.
+		 * For the compound page, we just need to put the head.
+		 */
+		if (PageCompound(to_free[0]))
+			*npages = 1;
+		else if (*npages > 1)
+			do_vunmap = true;
+		for (i = 0; i < *npages; i++)
+			put_page(to_free[i]);
+	}
+	if (do_vunmap)
+		vunmap(ptr);
+	kvfree(*pages);
+	*pages = NULL;
+	*npages = 0;
+}
+
 void io_mem_free(void *ptr)
 {
 	if (!ptr)
@@ -2694,8 +2721,8 @@  static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
-		io_mem_free(ctx->rings);
-		io_mem_free(ctx->sq_sqes);
+		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
+		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
 	} else {
 		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
 		ctx->n_ring_pages = 0;
@@ -2707,6 +2734,80 @@  static void io_rings_free(struct io_ring_ctx *ctx)
 	ctx->sq_sqes = NULL;
 }
 
+static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
+				   size_t size, gfp_t gfp)
+{
+	struct page *page;
+	int i, order;
+
+	order = get_order(size);
+	if (order > MAX_PAGE_ORDER)
+		return NULL;
+	else if (order)
+		gfp |= __GFP_COMP;
+
+	page = alloc_pages(gfp, order);
+	if (!page)
+		return NULL;
+
+	for (i = 0; i < nr_pages; i++)
+		pages[i] = page + i;
+
+	return page_address(page);
+}
+
+static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
+				 gfp_t gfp)
+{
+	void *ret;
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		pages[i] = alloc_page(gfp);
+		if (!pages[i])
+			goto err;
+	}
+
+	ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (ret)
+		return ret;
+err:
+	while (i--)
+		put_page(pages[i]);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+			  size_t size)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+	struct page **pages;
+	int nr_pages;
+	void *ret;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
+	if (ret)
+		goto done;
+
+	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
+	if (ret) {
+done:
+		*out_pages = pages;
+		*npages = nr_pages;
+		return ret;
+	}
+
+	kvfree(pages);
+	*out_pages = NULL;
+	*npages = 0;
+	return ERR_PTR(-ENOMEM);
+}
+
 void *io_mem_alloc(size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
@@ -3294,14 +3395,12 @@  static void *io_uring_validate_mmap_request(struct file *file,
 		/* Don't allow mmap if the ring was setup without it */
 		if (ctx->flags & IORING_SETUP_NO_MMAP)
 			return ERR_PTR(-EINVAL);
-		ptr = ctx->rings;
-		break;
+		return ctx->rings;
 	case IORING_OFF_SQES:
 		/* Don't allow mmap if the ring was setup without it */
 		if (ctx->flags & IORING_SETUP_NO_MMAP)
 			return ERR_PTR(-EINVAL);
-		ptr = ctx->sq_sqes;
-		break;
+		return ctx->sq_sqes;
 	case IORING_OFF_PBUF_RING: {
 		unsigned int bgid;
 
@@ -3324,11 +3423,22 @@  static void *io_uring_validate_mmap_request(struct file *file,
 	return ptr;
 }
 
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+			struct page **pages, int npages)
+{
+	unsigned long nr_pages = npages;
+
+	vm_flags_set(vma, VM_DONTEXPAND);
+	return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+}
+
 #ifdef CONFIG_MMU
 
 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct io_ring_ctx *ctx = file->private_data;
 	size_t sz = vma->vm_end - vma->vm_start;
+	long offset = vma->vm_pgoff << PAGE_SHIFT;
 	unsigned long pfn;
 	void *ptr;
 
@@ -3336,6 +3446,16 @@  static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
 
+	switch (offset & IORING_OFF_MMAP_MASK) {
+	case IORING_OFF_SQ_RING:
+	case IORING_OFF_CQ_RING:
+		return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
+						ctx->n_ring_pages);
+	case IORING_OFF_SQES:
+		return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+						ctx->n_sqe_pages);
+	}
+
 	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 }
@@ -3625,7 +3745,7 @@  static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 		return -EOVERFLOW;
 
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
-		rings = io_mem_alloc(size);
+		rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
 	else
 		rings = io_rings_map(ctx, p->cq_off.user_addr, size);
 
@@ -3650,7 +3770,7 @@  static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	}
 
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
-		ptr = io_mem_alloc(size);
+		ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
 	else
 		ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index dbd9a2b870eb..75230d914007 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -70,6 +70,8 @@  bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
 
 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+			struct page **pages, int npages);
 
 struct file *io_file_get_normal(struct io_kiocb *req, int fd);
 struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
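
None of this changes the userspace ABI: the rings are still mapped
through the fixed mmap offsets, and only the kernel side behind those
offsets moves from remap_pfn_range() to vm_insert_pages(). A rough
userspace sketch of the requests that io_uring_mmap() above now serves
(error handling trimmed; this mirrors what liburing does rather than
being lifted from it):

#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_and_map_rings(unsigned entries, struct io_uring_params *p,
			       void **sq_ring, void **sqes)
{
	size_t sq_sz, sqes_sz;
	int fd;

	fd = syscall(__NR_io_uring_setup, entries, p);
	if (fd < 0)
		return -1;

	sq_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	sqes_sz = p->sq_entries * sizeof(struct io_uring_sqe);

	/* SQ ring; with IORING_FEAT_SINGLE_MMAP the CQ ring shares this mapping. */
	*sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);

	/* SQE array, handled by the IORING_OFF_SQES case in io_uring_mmap(). */
	*sqes = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
		     MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);

	return fd;
}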