--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -75,6 +75,12 @@ struct io_hash_table {
unsigned hash_bits;
};
+struct io_mapped_region {
+ struct page **pages;
+ void *vmap_ptr;
+ size_t nr_pages;
+};
+
/*
* Arbitrary limit, can be raised if need be
*/
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -647,6 +647,19 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
+enum {
+	/* initialise with user memory pointed to by user_addr */
+ IORING_REGION_USER_MEM = 1,
+};
+
+struct io_uring_region_desc {
+ __u64 user_addr;
+ __u64 size;
+ __u64 flags;
+ __u64 mmap_offset;
+ __u64 __resv[4];
+};
+
/*
* Register a fully sparse file space, rather than pass in an array of all
* -1 file descriptors.
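To make the new uapi concrete, here is an editor's sketch (not part of the patch) of how userspace could fill the descriptor for a user memory region; the register call that would eventually consume it is omitted, since it only arrives in follow-up patches:

#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <linux/io_uring.h>

/* Sketch: describe `size` bytes of freshly mapped user memory as a region. */
static int fill_user_region(struct io_uring_region_desc *rd, size_t size)
{
	/* io_create_region() rejects unaligned addresses and sizes */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (mem == MAP_FAILED)
		return -1;

	memset(rd, 0, sizeof(*rd));	/* __resv must be all zeroes */
	rd->user_addr = (__u64)(uintptr_t)mem;
	rd->size = size;		/* must be a multiple of the page size */
	rd->flags = IORING_REGION_USER_MEM;
	/* mmap_offset stays 0: only meaningful for kernel allocated memory */
	return 0;
}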
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -12,6 +12,7 @@
#include "memmap.h"
#include "kbuf.h"
+#include "rsrc.h"
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
size_t size, gfp_t gfp)
@@ -194,6 +195,74 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
return ERR_PTR(-ENOMEM);
}
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+{
+	if (mr->pages) {
+		unpin_user_pages(mr->pages, mr->nr_pages);
+		kvfree(mr->pages);
+	}
+ if (mr->vmap_ptr)
+ vunmap(mr->vmap_ptr);
+ if (mr->nr_pages && ctx->user)
+ __io_unaccount_mem(ctx->user, mr->nr_pages);
+
+ memset(mr, 0, sizeof(*mr));
+}
+
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg)
+{
+ int pages_accounted = 0;
+ struct page **pages;
+ int nr_pages, ret;
+ void *vptr;
+ u64 end;
+
+ if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
+ return -EFAULT;
+	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
+ return -EINVAL;
+ if (reg->flags != IORING_REGION_USER_MEM)
+ return -EINVAL;
+ if (!reg->user_addr)
+ return -EFAULT;
+ if (!reg->size || reg->mmap_offset)
+ return -EINVAL;
+ if ((reg->size >> PAGE_SHIFT) > INT_MAX)
+		return -E2BIG;
+ if ((reg->user_addr | reg->size) & ~PAGE_MASK)
+ return -EINVAL;
+ if (check_add_overflow(reg->user_addr, reg->size, &end))
+ return -EOVERFLOW;
+
+ pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ if (ctx->user) {
+ ret = __io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ goto out_free;
+ pages_accounted = nr_pages;
+ }
+
+	vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!vptr) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+ mr->pages = pages;
+ mr->vmap_ptr = vptr;
+ mr->nr_pages = nr_pages;
+ return 0;
+out_free:
+ if (pages_accounted)
+ __io_unaccount_mem(ctx->user, pages_accounted);
+ io_pages_free(&pages, nr_pages);
+ return ret;
+}
+
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
size_t sz)
{
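For orientation, here is a sketch of how a future in-kernel caller might drive this pair of helpers. It is illustrative only: the param_region field and the function itself are hypothetical, while io_create_region() and io_region_get_ptr() come from this patch:

static int example_register_region(struct io_ring_ctx *ctx,
				   struct io_uring_region_desc __user *uarg)
{
	struct io_uring_region_desc reg;
	int ret;

	if (copy_from_user(&reg, uarg, sizeof(reg)))
		return -EFAULT;

	/* pins the user pages, accounts them and sets up a kernel mapping */
	ret = io_create_region(ctx, &ctx->param_region, &reg);
	if (ret)
		return ret;

	/* the memory is now kernel-addressable through the vmap'ed pointer */
	memset(io_region_get_ptr(&ctx->param_region), 0, reg.size);
	return 0;
}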
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -22,4 +22,18 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long flags);
int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg);
+
+static inline void *io_region_get_ptr(struct io_mapped_region *mr)
+{
+ return mr->vmap_ptr;
+}
+
+static inline bool io_region_is_set(struct io_mapped_region *mr)
+{
+ return !!mr->nr_pages;
+}
+
#endif
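And the matching teardown side, again as an illustrative sketch with the same hypothetical param_region field:

static void example_free_region(struct io_ring_ctx *ctx)
{
	/* io_region_is_set() distinguishes a created region from a zeroed one */
	if (io_region_is_set(&ctx->param_region))
		io_free_region(ctx, &ctx->param_region);
}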
We've got a good number of mappings we share with userspace, including the main rings, provided buffer rings and at least a couple more types, and all of them duplicate some of the code for page pinning, mmap'ing and attempts to optimise it with huge pages.

Introduce the notion of regions. For userspace it's just a new structure, struct io_uring_region_desc, which is supposed to parameterise the creation of all such mappings / queues. It either represents user provided memory, in which case the user_addr field should point to it, or a request for the kernel to allocate the memory, in which case the user is supposed to mmap it using the offset returned in the mmap_offset field. With a uniform userspace API we avoid additional boilerplate code, and any optimisation we add later will be applied to all mapping types at once.

Internally, a new structure, struct io_mapped_region, holds all relevant runtime information, along with some helpers to work with it. This patch limits regions to user provided memory; kernel allocated regions will be added in follow-up work.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/io_uring_types.h |  6 ++++
 include/uapi/linux/io_uring.h  | 13 +++++++
 io_uring/memmap.c              | 69 ++++++++++++++++++++++++++++++++++
 io_uring/memmap.h              | 14 ++++++++
 4 files changed, 102 insertions(+)