[15/23] io_uring: enable BPF to submit SQEs

Message ID: 8ec8373d406d1fcb41719e641799dcc5c0455db3.1621424513.git.asml.silence@gmail.com (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: io_uring BPF requests

Checks

netdev/tree_selection: success (Not a local patch)

Commit Message

Pavel Begunkov May 19, 2021, 2:13 p.m. UTC
Add a BPF_FUNC_iouring_queue_sqe BPF function as a demonstration of
submitting a new request from a BPF request.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c            | 51 ++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/bpf.h |  1 +
 2 files changed, 48 insertions(+), 4 deletions(-)
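
For illustration, a minimal sketch of what the program side of this
helper could look like (the section name and context layout are
assumptions; the helper ID, name, and signature follow this patch):

	#include <linux/bpf.h>
	#include <linux/io_uring.h>
	#include <bpf/bpf_helpers.h>

	/* manual declaration; a regenerated bpf_helper_defs.h would
	 * emit this automatically from the FN() list in bpf.h */
	static long (*bpf_iouring_queue_sqe)(void *ctx,
					     const struct io_uring_sqe *sqe,
					     __u32 sqe_len) =
		(void *)BPF_FUNC_iouring_queue_sqe;

	SEC("iouring.s")	/* hypothetical section name; sleepable */
	int submit_nop(void *ctx)
	{
		struct io_uring_sqe sqe = {};	/* SQE on the BPF stack */

		sqe.opcode = IORING_OP_NOP;

		/* per this patch: returns 1 if the SQE was consumed,
		 * 0 otherwise */
		return bpf_iouring_queue_sqe(ctx, &sqe, sizeof(sqe));
	}

	char LICENSE[] SEC("license") = "GPL";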

Comments

Song Liu May 21, 2021, 12:06 a.m. UTC | #1
> On May 19, 2021, at 7:13 AM, Pavel Begunkov <asml.silence@gmail.com> wrote:
> 
> Add a BPF_FUNC_iouring_queue_sqe BPF function as a demonstration of
> submitting a new request from a BPF request.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
> fs/io_uring.c            | 51 ++++++++++++++++++++++++++++++++++++----
> include/uapi/linux/bpf.h |  1 +
> 2 files changed, 48 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 20fddc5945f2..aae786291c57 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -882,6 +882,7 @@ struct io_defer_entry {
> };
> 
> struct io_bpf_ctx {
> +	struct io_ring_ctx	*ctx;
> };
> 
> struct io_op_def {
> @@ -6681,7 +6682,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> 			ret = -EBADF;
> 	}
> 
> -	state->ios_left--;
> +	if (state->ios_left > 1)
> +		state->ios_left--;
> 	return ret;
> }
> 
> @@ -10345,10 +10347,50 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
> 	return ret;
> }
> 
> +BPF_CALL_3(io_bpf_queue_sqe, struct io_bpf_ctx *,		bpf_ctx,
> +			     const struct io_uring_sqe *,	sqe,
> +			     u32,				sqe_len)
> +{
> +	struct io_ring_ctx *ctx = bpf_ctx->ctx;
> +	struct io_kiocb *req;
> +
> +	if (sqe_len != sizeof(struct io_uring_sqe))
> +		return -EINVAL;
> +
> +	req = io_alloc_req(ctx);
> +	if (unlikely(!req))
> +		return -ENOMEM;
> +	if (!percpu_ref_tryget_many(&ctx->refs, 1)) {
> +		kmem_cache_free(req_cachep, req);
> +		return -EAGAIN;
> +	}
> +	percpu_counter_add(&current->io_uring->inflight, 1);
> +	refcount_add(1, &current->usage);
> +
> +	/* returns number of submitted SQEs or an error */
> +	return !io_submit_sqe(ctx, req, sqe);
> +}
> +
> +const struct bpf_func_proto io_bpf_queue_sqe_proto = {
> +	.func = io_bpf_queue_sqe,
> +	.gpl_only = false,
> +	.ret_type = RET_INTEGER,
> +	.arg1_type = ARG_PTR_TO_CTX,
> +	.arg2_type = ARG_PTR_TO_MEM,
> +	.arg3_type = ARG_CONST_SIZE,
> +};
> +
> static const struct bpf_func_proto *
> io_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> {
> -	return bpf_base_func_proto(func_id);
> +	switch (func_id) {
> +	case BPF_FUNC_copy_from_user:
> +		return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
> +	case BPF_FUNC_iouring_queue_sqe:
> +		return prog->aux->sleepable ? &io_bpf_queue_sqe_proto : NULL;
> +	default:
> +		return bpf_base_func_proto(func_id);
> +	}
> }
> 
> static bool io_bpf_is_valid_access(int off, int size,
> @@ -10379,9 +10421,10 @@ static void io_bpf_run(struct io_kiocb *req, unsigned int issue_flags)
> 		     atomic_read(&req->task->io_uring->in_idle)))
> 		goto done;
> 
> -	memset(&bpf_ctx, 0, sizeof(bpf_ctx));
> +	bpf_ctx.ctx = ctx;
> 	prog = req->bpf.prog;
> 
> +	io_submit_state_start(&ctx->submit_state, 1);
> 	if (prog->aux->sleepable) {
> 		rcu_read_lock();
> 		bpf_prog_run_pin_on_cpu(req->bpf.prog, &bpf_ctx);
> @@ -10389,7 +10432,7 @@ static void io_bpf_run(struct io_kiocb *req, unsigned int issue_flags)
> 	} else {
> 		bpf_prog_run_pin_on_cpu(req->bpf.prog, &bpf_ctx);
> 	}
> -
> +	io_submit_state_end(&ctx->submit_state, ctx);
> 	ret = 0;
> done:
> 	__io_req_complete(req, issue_flags, ret, 0);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index de544f0fbeef..cc268f749a7d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -4082,6 +4082,7 @@ union bpf_attr {
> 	FN(ima_inode_hash),		\
> 	FN(sock_from_file),		\
> 	FN(check_mtu),			\
> +	FN(iouring_queue_sqe),		\

We need to describe this function in the comment above, just like 20/23 does. 
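
For reference, helper descriptions in include/uapi/linux/bpf.h follow a
structured comment format; a sketch of what an entry for this helper
might look like (wording illustrative, not taken from the series):

 * long bpf_iouring_queue_sqe(void *ctx, const struct io_uring_sqe *sqe, u32 sqe_len)
 *	Description
 *		Queue *sqe* of size *sqe_len* on the io_uring instance
 *		that is executing the program. Only available to
 *		sleepable io_uring BPF programs.
 *	Return
 *		1 if the SQE was queued, 0 otherwise.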

> 	/* */
> 
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> -- 
> 2.31.1
>
Alexei Starovoitov May 21, 2021, 1:07 a.m. UTC | #2
On Wed, May 19, 2021 at 03:13:26PM +0100, Pavel Begunkov wrote:
>  
> +BPF_CALL_3(io_bpf_queue_sqe, struct io_bpf_ctx *,		bpf_ctx,
> +			     const struct io_uring_sqe *,	sqe,
> +			     u32,				sqe_len)
> +{
> +	struct io_ring_ctx *ctx = bpf_ctx->ctx;
> +	struct io_kiocb *req;
> +
> +	if (sqe_len != sizeof(struct io_uring_sqe))
> +		return -EINVAL;
> +
> +	req = io_alloc_req(ctx);

that is GFP_KERNEL allocation.
It's only allowed from sleepable bpf progs and further down
there is a correct check for it, so all good.
But submitting an sqe is a fundamental io_uring operation,
so what is the use case for non-sleepable?
In other words why bother? Allow sleepable only and simplify the code?

> +	if (unlikely(!req))
> +		return -ENOMEM;
> +	if (!percpu_ref_tryget_many(&ctx->refs, 1)) {
> +		kmem_cache_free(req_cachep, req);
> +		return -EAGAIN;
> +	}
> +	percpu_counter_add(&current->io_uring->inflight, 1);
> +	refcount_add(1, &current->usage);
> +
> +	/* returns number of submitted SQEs or an error */
> +	return !io_submit_sqe(ctx, req, sqe);

A buggy bpf prog will be able to pass sizeof(struct io_uring_sqe)
bytes of junk as 'sqe' here.
What kind of validation io_submit_sqe() does to avoid crashing the kernel?

General comments that apply to all patches:
- commit logs are way too terse. Pls expand with details.
- describe new bpf helpers in comments in bpf.h. Just adding them to an enum is not enough.
- selftest/bpf are mandatory for all new bpf features.
- consider bpf_link style of attaching bpf progs. We had enough issues with progs
  that get stuck due to application bugs. Auto-detach saves the day more often than not.
Pavel Begunkov May 21, 2021, 9:33 a.m. UTC | #3
On 5/21/21 2:07 AM, Alexei Starovoitov wrote:
> On Wed, May 19, 2021 at 03:13:26PM +0100, Pavel Begunkov wrote:
>>  
>> +BPF_CALL_3(io_bpf_queue_sqe, struct io_bpf_ctx *,		bpf_ctx,
>> +			     const struct io_uring_sqe *,	sqe,
>> +			     u32,				sqe_len)
>> +{
>> +	struct io_ring_ctx *ctx = bpf_ctx->ctx;
>> +	struct io_kiocb *req;
>> +
>> +	if (sqe_len != sizeof(struct io_uring_sqe))
>> +		return -EINVAL;
>> +
>> +	req = io_alloc_req(ctx);
> 
> that is GFP_KERNEL allocation.
> It's only allowed from sleepable bpf progs and further down
> there is a correct check for it, so all good.
> But submitting an sqe is a fundamental io_uring operation,
> so what is the use case for non-sleepable?
> In other words why bother? Allow sleepable only and simplify the code?

Actual submission may be moved out of BPF later, so I'm enabling it for
both. But the question I wonder about is: what are the plans for
sleepable programs? If it stays a marginal feature, much limited in
functionality (e.g., IIRC it's not allowed to use some BPF data types),
it may not be worth doing.

> 
>> +	if (unlikely(!req))
>> +		return -ENOMEM;
>> +	if (!percpu_ref_tryget_many(&ctx->refs, 1)) {
>> +		kmem_cache_free(req_cachep, req);
>> +		return -EAGAIN;
>> +	}
>> +	percpu_counter_add(&current->io_uring->inflight, 1);
>> +	refcount_add(1, &current->usage);
>> +
>> +	/* returns number of submitted SQEs or an error */
>> +	return !io_submit_sqe(ctx, req, sqe);
> 
> A buggy bpf prog will be able to pass sizeof(struct io_uring_sqe)
> bytes of junk as 'sqe' here.
> What kind of validation io_submit_sqe() does to avoid crashing the kernel?

It works on memory that is read-write shared with userspace, so it
already assumes the worst.
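
For context, a condensed paraphrase of the validation io_init_req()
applies to every SQE before anything else happens (not a verbatim
excerpt; names per fs/io_uring.c of this era):

	/* SQE memory is userspace-writable, so fields are read once
	 * and range-checked before use */
	req->opcode = READ_ONCE(sqe->opcode);
	if (unlikely(req->opcode >= IORING_OP_LAST))
		return -EINVAL;
	sqe_flags = READ_ONCE(sqe->flags);
	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
		return -EINVAL;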
 
> General comments that apply to all patches:
> - commit logs are way too terse. Pls expand with details.
> - describe new bpf helpers in comments in bpf.h. Just adding them to an enum is not enough.
> - selftest/bpf are mandatory for all new bpf features.
> - consider bpf_link style of attaching bpf progs. We had enough issues with progs
>   that get stuck due to application bugs. Auto-detach saves the day more often than not.

Thanks for taking a look! I have no idea what bpf_link is; I'll need
to check it out.
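
For context, the bpf_link model ties an attachment to a file
descriptor, so the kernel auto-detaches the program once the last FD to
the link is released (e.g. when the application crashes). A minimal
libbpf-side sketch of the pattern; an io_uring attach point does not
exist yet and is assumed here:

	#include <bpf/libbpf.h>

	static int run_attached(struct bpf_program *prog)
	{
		struct bpf_link *link;

		/* the attachment lives as long as the link (an
		 * FD-backed kernel object), not the program itself */
		link = bpf_program__attach(prog);
		if (libbpf_get_error(link))
			return -1;

		/* ... submit io_uring requests that run the prog ... */

		/* explicit detach; if the process dies before this,
		 * closing the link FD detaches automatically */
		bpf_link__destroy(link);
		return 0;
	}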

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 20fddc5945f2..aae786291c57 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -882,6 +882,7 @@  struct io_defer_entry {
 };
 
 struct io_bpf_ctx {
+	struct io_ring_ctx	*ctx;
 };
 
 struct io_op_def {
@@ -6681,7 +6682,8 @@  static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			ret = -EBADF;
 	}
 
-	state->ios_left--;
+	if (state->ios_left > 1)
+		state->ios_left--;
 	return ret;
 }
 
@@ -10345,10 +10347,50 @@  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	return ret;
 }
 
+BPF_CALL_3(io_bpf_queue_sqe, struct io_bpf_ctx *,		bpf_ctx,
+			     const struct io_uring_sqe *,	sqe,
+			     u32,				sqe_len)
+{
+	struct io_ring_ctx *ctx = bpf_ctx->ctx;
+	struct io_kiocb *req;
+
+	if (sqe_len != sizeof(struct io_uring_sqe))
+		return -EINVAL;
+
+	req = io_alloc_req(ctx);
+	if (unlikely(!req))
+		return -ENOMEM;
+	if (!percpu_ref_tryget_many(&ctx->refs, 1)) {
+		kmem_cache_free(req_cachep, req);
+		return -EAGAIN;
+	}
+	percpu_counter_add(&current->io_uring->inflight, 1);
+	refcount_add(1, &current->usage);
+
+	/* returns number of submitted SQEs or an error */
+	return !io_submit_sqe(ctx, req, sqe);
+}
+
+const struct bpf_func_proto io_bpf_queue_sqe_proto = {
+	.func = io_bpf_queue_sqe,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_PTR_TO_MEM,
+	.arg3_type = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *
 io_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	switch (func_id) {
+	case BPF_FUNC_copy_from_user:
+		return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+	case BPF_FUNC_iouring_queue_sqe:
+		return prog->aux->sleepable ? &io_bpf_queue_sqe_proto : NULL;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
 }
 
 static bool io_bpf_is_valid_access(int off, int size,
@@ -10379,9 +10421,10 @@  static void io_bpf_run(struct io_kiocb *req, unsigned int issue_flags)
 		     atomic_read(&req->task->io_uring->in_idle)))
 		goto done;
 
-	memset(&bpf_ctx, 0, sizeof(bpf_ctx));
+	bpf_ctx.ctx = ctx;
 	prog = req->bpf.prog;
 
+	io_submit_state_start(&ctx->submit_state, 1);
 	if (prog->aux->sleepable) {
 		rcu_read_lock();
 		bpf_prog_run_pin_on_cpu(req->bpf.prog, &bpf_ctx);
@@ -10389,7 +10432,7 @@  static void io_bpf_run(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 		bpf_prog_run_pin_on_cpu(req->bpf.prog, &bpf_ctx);
 	}
-
+	io_submit_state_end(&ctx->submit_state, ctx);
 	ret = 0;
 done:
 	__io_req_complete(req, issue_flags, ret, 0);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index de544f0fbeef..cc268f749a7d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4082,6 +4082,7 @@  union bpf_attr {
 	FN(ima_inode_hash),		\
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
+	FN(iouring_queue_sqe),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper