diff mbox series

[12/18] io_uring: add support for pre-mapped user IO buffers

Message ID 20190207195552.22770-13-axboe@kernel.dk (mailing list archive)
State New, archived
Headers show
Series [01/18] fs: add an iopoll method to struct file_operations | expand

Commit Message

Jens Axboe Feb. 7, 2019, 7:55 p.m. UTC
If we have fixed user buffers, we can map them into the kernel when we
setup the io_context. That avoids the need to do get_user_pages() for
each and every IO.

To utilize this feature, the application must call io_uring_register()
after having setup an io_uring context, passing in
IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer
to an iovec array, and the nr_args should contain how many iovecs the
application wishes to map.

If successful, these buffers are now mapped into the kernel, eligible
for IO. To use these fixed buffers, the application must use the
IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then
set sqe->buf_index to the desired buffer index. sqe->addr..sqe->addr+sqe->len
must point to somewhere inside the indexed buffer.

The application may register buffers throughout the lifetime of the
io_uring context. It can call io_uring_register() with
IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of
buffers, and then register a new set. The application need not
unregister buffers explicitly before shutting down the io_uring context.

It's perfectly valid to setup a larger buffer, and then sometimes only
use parts of it for an IO. As long as the range is within the originally
mapped region, it will work just fine.

For now, buffers must not be file backed. If file backed buffers are
passed in, the registration will fail with -1/EOPNOTSUPP. This
restriction may be relaxed in the future.

RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat
arbitrary 1G per buffer size is also imposed.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/io_uring.c                          | 356 ++++++++++++++++++++++++-
 include/linux/sched/user.h             |   2 +-
 include/linux/syscalls.h               |   2 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 include/uapi/linux/io_uring.h          |  13 +-
 kernel/sys_ni.c                        |   1 +
 8 files changed, 363 insertions(+), 17 deletions(-)

Comments

Jeff Moyer Feb. 7, 2019, 8:57 p.m. UTC | #1
Hi, Jens,

Jens Axboe <axboe@kernel.dk> writes:

> +static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
> +{
> +	int i, j;
> +
> +	if (!ctx->user_bufs)
> +		return -ENXIO;
> +
> +	for (i = 0; i < ctx->sq_entries; i++) {
> +		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
> +
> +		for (j = 0; j < imu->nr_bvecs; j++)
> +			put_page(imu->bvec[j].bv_page);
> +
> +		io_unaccount_mem(ctx->user, imu->nr_bvecs);
> +		kfree(imu->bvec);
> +		imu->nr_bvecs = 0;
> +	}
> +
> +	kfree(ctx->user_bufs);
> +	ctx->user_bufs = NULL;
> +	free_uid(ctx->user);
        ^^^^^^^^^^^^^^^^^^^
> +	ctx->user = NULL;
        ^^^^^^^^^^^^^^^^^

I don't think you want to do that here.  If you do an
IORING_REGISTER_BUFFERS, followed by IORING_UNREGISTER_BUFFERS, and then
follow that up with IORING_REGISTER_FILES, you'll get a null pointer
dereference trying to bump the reference count of the (now NULL)
ctx->user (io_uring.c:1944):

[  216.927990] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[  216.935825] #PF error: [WRITE]
[  216.938883] PGD 5f39244067 P4D 5f39244067 PUD 5f043ca067 PMD 0 
[  216.944803] Oops: 0002 [#1] SMP
[  216.947949] CPU: 79 PID: 3371 Comm: io_uring_regist Not tainted 5.0.0-rc5.io_uring.4+ #26
[  216.956119] Hardware name: Intel Corporation S2600WFD/S2600WFD, BIOS SE5C620.86B.0D.01.0108.091420182119 09/14/2018
[  216.966553] RIP: 0010:__io_uring_register+0x1c2/0x7c0
[  216.971606] Code: 49 89 c6 48 85 c0 0f 84 9b 05 00 00 48 8b 83 20 02 00 00 48 8b 40 20 49 c7 46 60 60 89 1d 96 49 89 46 18 48 8b 83 18 01 00 00 <f0> ff 00 0f 88 1a a0 52 00 45 31 e4 66 83 7d 00 00 48 89 45 08 7e
[  216.990355] RSP: 0018:ffffb296087e3e70 EFLAGS: 00010286
[  216.995578] RAX: 0000000000000000 RBX: ffff9aacbbff3800 RCX: 0000000000000000
[  217.002711] RDX: ffff9aacbbaf1ac0 RSI: 00000000ffffffff RDI: ffff9aacb9a8f6b0
[  217.009842] RBP: ffff9aacbb45e800 R08: 00000000000000c0 R09: ffff9a4e87c07000
[  217.016977] R10: 0000000000000006 R11: ffff9aac97da9b00 R12: 00007efdc3dbd1fc
[  217.024107] R13: ffff9aacbb45ec08 R14: ffff9aacb9a8f600 R15: ffff9aac97da9a00
[  217.031241] FS:  00007f01c439e500(0000) GS:ffff9aacbf7c0000(0000) knlGS:0000000000000000
[  217.039326] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  217.045075] CR2: 0000000000000000 CR3: 0000005f08d85002 CR4: 00000000007606e0
[  217.052207] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  217.059340] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  217.066472] PKRU: 55555554
[  217.069183] Call Trace:
[  217.071638]  __x64_sys_io_uring_register+0x91/0xb0
[  217.076433]  do_syscall_64+0x4f/0x190
[  217.080110]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  217.085167] RIP: 0033:0x7f01c3eb42bd
[  217.088743] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 9b 6b 2c 00 f7 d8 64 89 01 48

I'd expect ctx->user to live as long as the io_uring context itself,
right?

-Jeff
Jens Axboe Feb. 7, 2019, 9:02 p.m. UTC | #2
On 2/7/19 1:57 PM, Jeff Moyer wrote:
> Hi, Jens,
> 
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> +static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
>> +{
>> +	int i, j;
>> +
>> +	if (!ctx->user_bufs)
>> +		return -ENXIO;
>> +
>> +	for (i = 0; i < ctx->sq_entries; i++) {
>> +		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
>> +
>> +		for (j = 0; j < imu->nr_bvecs; j++)
>> +			put_page(imu->bvec[j].bv_page);
>> +
>> +		io_unaccount_mem(ctx->user, imu->nr_bvecs);
>> +		kfree(imu->bvec);
>> +		imu->nr_bvecs = 0;
>> +	}
>> +
>> +	kfree(ctx->user_bufs);
>> +	ctx->user_bufs = NULL;
>> +	free_uid(ctx->user);
>         ^^^^^^^^^^^^^^^^^^^
>> +	ctx->user = NULL;
>         ^^^^^^^^^^^^^^^^^
> 
> I don't think you want to do that here.  If you do an
> IORING_REGISTER_BUFFERS, followed by IORING_UNREGISTER_BUFFERS, and then
> follow that up with IORING_REGISTER_FILES, you'll get a null pointer
> dereference trying to bump the reference count of the (now NULL)
> ctx->user (io_uring.c:1944):
> 
> [  216.927990] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
> [  216.935825] #PF error: [WRITE]
> [  216.938883] PGD 5f39244067 P4D 5f39244067 PUD 5f043ca067 PMD 0 
> [  216.944803] Oops: 0002 [#1] SMP
> [  216.947949] CPU: 79 PID: 3371 Comm: io_uring_regist Not tainted 5.0.0-rc5.io_uring.4+ #26
> [  216.956119] Hardware name: Intel Corporation S2600WFD/S2600WFD, BIOS SE5C620.86B.0D.01.0108.091420182119 09/14/2018
> [  216.966553] RIP: 0010:__io_uring_register+0x1c2/0x7c0
> [  216.971606] Code: 49 89 c6 48 85 c0 0f 84 9b 05 00 00 48 8b 83 20 02 00 00 48 8b 40 20 49 c7 46 60 60 89 1d 96 49 89 46 18 48 8b 83 18 01 00 00 <f0> ff 00 0f 88 1a a0 52 00 45 31 e4 66 83 7d 00 00 48 89 45 08 7e
> [  216.990355] RSP: 0018:ffffb296087e3e70 EFLAGS: 00010286
> [  216.995578] RAX: 0000000000000000 RBX: ffff9aacbbff3800 RCX: 0000000000000000
> [  217.002711] RDX: ffff9aacbbaf1ac0 RSI: 00000000ffffffff RDI: ffff9aacb9a8f6b0
> [  217.009842] RBP: ffff9aacbb45e800 R08: 00000000000000c0 R09: ffff9a4e87c07000
> [  217.016977] R10: 0000000000000006 R11: ffff9aac97da9b00 R12: 00007efdc3dbd1fc
> [  217.024107] R13: ffff9aacbb45ec08 R14: ffff9aacb9a8f600 R15: ffff9aac97da9a00
> [  217.031241] FS:  00007f01c439e500(0000) GS:ffff9aacbf7c0000(0000) knlGS:0000000000000000
> [  217.039326] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  217.045075] CR2: 0000000000000000 CR3: 0000005f08d85002 CR4: 00000000007606e0
> [  217.052207] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  217.059340] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  217.066472] PKRU: 55555554
> [  217.069183] Call Trace:
> [  217.071638]  __x64_sys_io_uring_register+0x91/0xb0
> [  217.076433]  do_syscall_64+0x4f/0x190
> [  217.080110]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  217.085167] RIP: 0033:0x7f01c3eb42bd
> [  217.088743] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 9b 6b 2c 00 f7 d8 64 89 01 48
> 
> I'd expect ctx->user to live as long as the io_uring context itself,
> right?

Yes, it used to just be used for the buffers, now we use it generally. I've
fixed that up, thanks Jeff!
Jeff Moyer Feb. 7, 2019, 10:38 p.m. UTC | #3
Hi, Jens,

Jens Axboe <axboe@kernel.dk> writes:

> For now, buffers must not be file backed. If file backed buffers are
> passed in, the registration will fail with -1/EOPNOTSUPP. This
> restriction may be relaxed in the future.

[...]

> +		down_write(&current->mm->mmap_sem);
> +		pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
> +						pages, vmas);
> +		if (pret == nr_pages) {
> +			/* don't support file backed memory */
> +			for (j = 0; j < nr_pages; j++) {
> +				struct vm_area_struct *vma = vmas[j];
> +
> +				if (vma->vm_file) {
> +					ret = -EOPNOTSUPP;
> +					break;
> +				}
> +			}

Unfortunately, this suffers the same problem as FOLL_ANON.  Huge pages
are backed by hugetlbfs, and vma->vm_file will be filled in.

I guess you could check is_file_hugepages(vma->vm_file):

        if (vma->vm_file &&
            !is_file_hugepages(vma->vm_file)) {
                ret = -EOPNOTSUPP;
                break;
       }

That works for me.

-Jeff
Jens Axboe Feb. 7, 2019, 10:47 p.m. UTC | #4
On 2/7/19 3:38 PM, Jeff Moyer wrote:
> Hi, Jens,
> 
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> For now, buffers must not be file backed. If file backed buffers are
>> passed in, the registration will fail with -1/EOPNOTSUPP. This
>> restriction may be relaxed in the future.
> 
> [...]
> 
>> +		down_write(&current->mm->mmap_sem);
>> +		pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
>> +						pages, vmas);
>> +		if (pret == nr_pages) {
>> +			/* don't support file backed memory */
>> +			for (j = 0; j < nr_pages; j++) {
>> +				struct vm_area_struct *vma = vmas[j];
>> +
>> +				if (vma->vm_file) {
>> +					ret = -EOPNOTSUPP;
>> +					break;
>> +				}
>> +			}
> 
> Unfortunately, this suffers the same problem as FOLL_ANON.  Huge pages
> are backed by hugetlbfs, and vma->vm_file will be filled in.
> 
> I guess you could check is_file_hugepages(vma->vm_file):
> 
>         if (vma->vm_file &&
>             !is_file_hugepages(vma->vm_file)) {
>                 ret = -EOPNOTSUPP;
>                 break;
>        }
> 
> That works for me.

Thanks, that looks better. Fixed!
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 481c126259e9..2eefd2a7c1ce 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -400,3 +400,4 @@ 
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
 425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
 426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
+427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 6a32a430c8e0..65c026185e61 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@ 
 334	common	rseq			__x64_sys_rseq
 425	common	io_uring_setup		__x64_sys_io_uring_setup
 426	common	io_uring_enter		__x64_sys_io_uring_enter
+427	common	io_uring_register	__x64_sys_io_uring_register
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1369cb95e1b5..9d6233dc35ca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -25,6 +25,7 @@ 
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/blkdev.h>
+#include <linux/bvec.h>
 #include <linux/net.h>
 #include <net/sock.h>
 #include <net/af_unix.h>
@@ -32,6 +33,7 @@ 
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
 #include <linux/nospec.h>
+#include <linux/sizes.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -61,6 +63,13 @@  struct io_cq_ring {
 	struct io_uring_cqe	cqes[];
 };
 
+struct io_mapped_ubuf {
+	u64		ubuf;
+	size_t		len;
+	struct		bio_vec *bvec;
+	unsigned int	nr_bvecs;
+};
+
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref	refs;
@@ -92,6 +101,10 @@  struct io_ring_ctx {
 		struct fasync_struct	*cq_fasync;
 	} ____cacheline_aligned_in_smp;
 
+	/* if used, fixed mapped user buffers */
+	unsigned		nr_user_bufs;
+	struct io_mapped_ubuf	*user_bufs;
+
 	struct user_struct	*user;
 
 	struct completion	ctx_done;
@@ -703,6 +716,44 @@  static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 	}
 }
 
+static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
+			   const struct io_uring_sqe *sqe,
+			   struct iov_iter *iter)
+{
+	size_t len = READ_ONCE(sqe->len);
+	struct io_mapped_ubuf *imu;
+	unsigned index, buf_index;
+	size_t offset;
+	u64 buf_addr;
+
+	/* attempt to use fixed buffers without having provided iovecs */
+	if (unlikely(!ctx->user_bufs))
+		return -EFAULT;
+
+	buf_index = READ_ONCE(sqe->buf_index);
+	if (unlikely(buf_index >= ctx->nr_user_bufs))
+		return -EFAULT;
+
+	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+	imu = &ctx->user_bufs[index];
+	buf_addr = READ_ONCE(sqe->addr);
+
+	if (buf_addr + len < buf_addr)
+		return -EFAULT;
+	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+		return -EFAULT;
+
+	/*
+	 * May not be a start of buffer, set size appropriately
+	 * and advance us to the beginning.
+	 */
+	offset = buf_addr - imu->ubuf;
+	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+	if (offset)
+		iov_iter_advance(iter, offset);
+	return 0;
+}
+
 static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 			   const struct sqe_submit *s, struct iovec **iovec,
 			   struct iov_iter *iter)
@@ -710,6 +761,15 @@  static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	const struct io_uring_sqe *sqe = s->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	size_t sqe_len = READ_ONCE(sqe->len);
+	u8 opcode;
+
+	opcode = READ_ONCE(sqe->opcode);
+	if (opcode == IORING_OP_READ_FIXED ||
+	    opcode == IORING_OP_WRITE_FIXED) {
+		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		*iovec = NULL;
+		return ret;
+	}
 
 	if (!s->has_user)
 		return -EFAULT;
@@ -853,7 +913,7 @@  static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	if (unlikely(sqe->addr || sqe->ioprio))
+	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
 	fsync_flags = READ_ONCE(sqe->fsync_flags);
@@ -891,9 +951,19 @@  static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		ret = io_nop(req, req->user_data);
 		break;
 	case IORING_OP_READV:
+		if (unlikely(s->sqe->buf_index))
+			return -EINVAL;
 		ret = io_read(req, s, force_nonblock, state);
 		break;
 	case IORING_OP_WRITEV:
+		if (unlikely(s->sqe->buf_index))
+			return -EINVAL;
+		ret = io_write(req, s, force_nonblock, state);
+		break;
+	case IORING_OP_READ_FIXED:
+		ret = io_read(req, s, force_nonblock, state);
+		break;
+	case IORING_OP_WRITE_FIXED:
 		ret = io_write(req, s, force_nonblock, state);
 		break;
 	case IORING_OP_FSYNC:
@@ -922,28 +992,47 @@  static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	return 0;
 }
 
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+{
+	u8 opcode = READ_ONCE(sqe->opcode);
+
+	return !(opcode == IORING_OP_READ_FIXED ||
+		 opcode == IORING_OP_WRITE_FIXED);
+}
+
 static void io_sq_wq_submit_work(struct work_struct *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct sqe_submit *s = &req->submit;
-	u64 user_data = READ_ONCE(s->sqe->user_data);
 	struct io_ring_ctx *ctx = req->ctx;
-	mm_segment_t old_fs = get_fs();
+	mm_segment_t old_fs;
+	bool needs_user;
+	u64 user_data;
 	int ret;
 
 	 /* Ensure we clear previously set forced non-block flag */
 	req->flags &= ~REQ_F_FORCE_NONBLOCK;
 	req->rw.ki_flags &= ~IOCB_NOWAIT;
 
-	if (!mmget_not_zero(ctx->sqo_mm)) {
-		ret = -EFAULT;
-		goto err;
-	}
-
-	use_mm(ctx->sqo_mm);
-	set_fs(USER_DS);
-	s->has_user = true;
+	user_data = READ_ONCE(s->sqe->user_data);
 	s->needs_lock = true;
+	s->has_user = false;
+
+	/*
+	 * If we're doing IO to fixed buffers, we don't need to get/set
+	 * user context
+	 */
+	needs_user = io_sqe_needs_user(s->sqe);
+	if (needs_user) {
+		if (!mmget_not_zero(ctx->sqo_mm)) {
+			ret = -EFAULT;
+			goto err;
+		}
+		use_mm(ctx->sqo_mm);
+		old_fs = get_fs();
+		set_fs(USER_DS);
+		s->has_user = true;
+	}
 
 	do {
 		ret = __io_submit_sqe(ctx, req, s, false, NULL);
@@ -957,9 +1046,11 @@  static void io_sq_wq_submit_work(struct work_struct *work)
 		cond_resched();
 	} while (1);
 
-	set_fs(old_fs);
-	unuse_mm(ctx->sqo_mm);
-	mmput(ctx->sqo_mm);
+	if (needs_user) {
+		set_fs(old_fs);
+		unuse_mm(ctx->sqo_mm);
+		mmput(ctx->sqo_mm);
+	}
 err:
 	if (ret) {
 		io_cqring_add_event(ctx, user_data, ret, 0);
@@ -1241,6 +1332,188 @@  static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
 	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
 }
 
+static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+{
+	int i, j;
+
+	if (!ctx->user_bufs)
+		return -ENXIO;
+
+	for (i = 0; i < ctx->sq_entries; i++) {
+		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+		for (j = 0; j < imu->nr_bvecs; j++)
+			put_page(imu->bvec[j].bv_page);
+
+		io_unaccount_mem(ctx->user, imu->nr_bvecs);
+		kfree(imu->bvec);
+		imu->nr_bvecs = 0;
+	}
+
+	kfree(ctx->user_bufs);
+	ctx->user_bufs = NULL;
+	free_uid(ctx->user);
+	ctx->user = NULL;
+	return 0;
+}
+
+static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
+		       void __user *arg, unsigned index)
+{
+	struct iovec __user *src;
+
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		struct compat_iovec __user *ciovs;
+		struct compat_iovec ciov;
+
+		ciovs = (struct compat_iovec __user *) arg;
+		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
+			return -EFAULT;
+
+		dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+		dst->iov_len = ciov.iov_len;
+		return 0;
+	}
+#endif
+	src = (struct iovec __user *) arg;
+	if (copy_from_user(dst, &src[index], sizeof(*dst)))
+		return -EFAULT;
+	return 0;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
+				  unsigned nr_args)
+{
+	struct vm_area_struct **vmas = NULL;
+	struct page **pages = NULL;
+	int i, j, got_pages = 0;
+	int ret = -EINVAL;
+
+	if (ctx->user_bufs)
+		return -EBUSY;
+	if (!nr_args || nr_args > UIO_MAXIOV)
+		return -EINVAL;
+
+	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
+					GFP_KERNEL);
+	if (!ctx->user_bufs)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_args; i++) {
+		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+		unsigned long off, start, end, ubuf;
+		int pret, nr_pages;
+		struct iovec iov;
+		size_t size;
+
+		ret = io_copy_iov(ctx, &iov, arg, i);
+		if (ret)
+			break;
+
+		/*
+		 * Don't impose further limits on the size and buffer
+		 * constraints here, we'll -EINVAL later when IO is
+		 * submitted if they are wrong.
+		 */
+		ret = -EFAULT;
+		if (!iov.iov_base || !iov.iov_len)
+			goto err;
+
+		/* arbitrary limit, but we need something */
+		if (iov.iov_len > SZ_1G)
+			goto err;
+
+		ubuf = (unsigned long) iov.iov_base;
+		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		start = ubuf >> PAGE_SHIFT;
+		nr_pages = end - start;
+
+		ret = io_account_mem(ctx->user, nr_pages);
+		if (ret)
+			goto err;
+
+		if (!pages || nr_pages > got_pages) {
+			kfree(vmas);
+			kfree(pages);
+			pages = kmalloc_array(nr_pages, sizeof(struct page *),
+						GFP_KERNEL);
+			vmas = kmalloc_array(nr_pages,
+					sizeof(struct vm_area_struct *),
+					GFP_KERNEL);
+			if (!pages || !vmas) {
+				ret = -ENOMEM;
+				io_unaccount_mem(ctx->user, nr_pages);
+				goto err;
+			}
+			got_pages = nr_pages;
+		}
+
+		imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+						GFP_KERNEL);
+		if (!imu->bvec) {
+			io_unaccount_mem(ctx->user, nr_pages);
+			goto err;
+		}
+
+		down_write(&current->mm->mmap_sem);
+		pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
+						pages, vmas);
+		if (pret == nr_pages) {
+			/* don't support file backed memory */
+			for (j = 0; j < nr_pages; j++) {
+				struct vm_area_struct *vma = vmas[j];
+
+				if (vma->vm_file) {
+					ret = -EOPNOTSUPP;
+					break;
+				}
+			}
+		} else {
+			ret = pret < 0 ? pret : -EFAULT;
+		}
+		up_write(&current->mm->mmap_sem);
+		if (ret) {
+			/*
+			 * if we did partial map, or found file backed vmas,
+			 * release any pages we did get
+			 */
+			if (pret > 0) {
+				for (j = 0; j < pret; j++)
+					put_page(pages[j]);
+			}
+			io_unaccount_mem(ctx->user, nr_pages);
+			goto err;
+		}
+
+		off = ubuf & ~PAGE_MASK;
+		size = iov.iov_len;
+		for (j = 0; j < nr_pages; j++) {
+			size_t vec_len;
+
+			vec_len = min_t(size_t, size, PAGE_SIZE - off);
+			imu->bvec[j].bv_page = pages[j];
+			imu->bvec[j].bv_len = vec_len;
+			imu->bvec[j].bv_offset = off;
+			off = 0;
+			size -= vec_len;
+		}
+		/* store original address for later verification */
+		imu->ubuf = ubuf;
+		imu->len = iov.iov_len;
+		imu->nr_bvecs = nr_pages;
+	}
+	kfree(pages);
+	kfree(vmas);
+	ctx->nr_user_bufs = nr_args;
+	return 0;
+err:
+	kfree(pages);
+	kfree(vmas);
+	io_sqe_buffer_unregister(ctx);
+	return ret;
+}
+
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_wq)
@@ -1253,6 +1526,7 @@  static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 #endif
 
 	io_iopoll_reap_events(ctx);
+	io_sqe_buffer_unregister(ctx);
 
 	io_mem_free(ctx->sq_ring);
 	io_mem_free(ctx->sq_sqes);
@@ -1593,6 +1867,60 @@  SYSCALL_DEFINE2(io_uring_setup, u32, entries,
 	return io_uring_setup(entries, params);
 }
 
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+			       void __user *arg, unsigned nr_args)
+{
+	int ret;
+
+	percpu_ref_kill(&ctx->refs);
+	wait_for_completion(&ctx->ctx_done);
+
+	switch (opcode) {
+	case IORING_REGISTER_BUFFERS:
+		ret = io_sqe_buffer_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_BUFFERS:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_sqe_buffer_unregister(ctx);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	/* bring the ctx back to life */
+	reinit_completion(&ctx->ctx_done);
+	percpu_ref_reinit(&ctx->refs);
+	return ret;
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+		void __user *, arg, unsigned int, nr_args)
+{
+	struct io_ring_ctx *ctx;
+	long ret = -EBADF;
+	struct fd f;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EOPNOTSUPP;
+	if (f.file->f_op != &io_uring_fops)
+		goto out_fput;
+
+	ctx = f.file->private_data;
+
+	mutex_lock(&ctx->uring_lock);
+	ret = __io_uring_register(ctx, opcode, arg, nr_args);
+	mutex_unlock(&ctx->uring_lock);
+out_fput:
+	fdput(f);
+	return ret;
+}
+
 static int __init io_uring_init(void)
 {
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 39ad98c09c58..c7b5f86b91a1 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -40,7 +40,7 @@  struct user_struct {
 	kuid_t uid;
 
 #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
-    defined(CONFIG_NET)
+    defined(CONFIG_NET) || defined(CONFIG_IO_URING)
 	atomic_long_t locked_vm;
 #endif
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3072dbaa7869..3681c05ac538 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -315,6 +315,8 @@  asmlinkage long sys_io_uring_setup(u32 entries,
 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
 				u32 min_complete, u32 flags,
 				const sigset_t __user *sig, size_t sigsz);
+asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
+				void __user *arg, unsigned int nr_args);
 
 /* fs/xattr.c */
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 87871e7b7ea7..d346229a1eb0 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -744,9 +744,11 @@  __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
 __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
 #define __NR_io_uring_enter 426
 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
+#define __NR_io_uring_register 427
+__SYSCALL(__NR_io_uring_register, sys_io_uring_register)
 
 #undef __NR_syscalls
-#define __NR_syscalls 427
+#define __NR_syscalls 428
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5c457ea396e6..cf28f7a11f12 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -27,7 +27,10 @@  struct io_uring_sqe {
 		__u32		fsync_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
-	__u64	__pad2[3];
+	union {
+		__u16	buf_index;	/* index into fixed buffers, if used */
+		__u64	__pad2[3];
+	};
 };
 
 /*
@@ -39,6 +42,8 @@  struct io_uring_sqe {
 #define IORING_OP_READV		1
 #define IORING_OP_WRITEV	2
 #define IORING_OP_FSYNC		3
+#define IORING_OP_READ_FIXED	4
+#define IORING_OP_WRITE_FIXED	5
 
 /*
  * sqe->fsync_flags
@@ -103,4 +108,10 @@  struct io_uring_params {
 	struct io_cqring_offsets cq_off;
 };
 
+/*
+ * io_uring_register(2) opcodes and arguments
+ */
+#define IORING_REGISTER_BUFFERS		0
+#define IORING_UNREGISTER_BUFFERS	1
+
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ee5e523564bb..1bb6604dc19f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,6 +48,7 @@  COND_SYSCALL_COMPAT(io_getevents);
 COND_SYSCALL_COMPAT(io_pgetevents);
 COND_SYSCALL(io_uring_setup);
 COND_SYSCALL(io_uring_enter);
+COND_SYSCALL(io_uring_register);
 
 /* fs/xattr.c */