Message ID: 20200827134044.82821-3-sgarzare@redhat.com
State:      New, archived
Series:     io_uring: add restrictions to support untrusted applications and guests
On 8/27/20 7:40 AM, Stefano Garzarella wrote:
> @@ -6414,6 +6425,19 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> 	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
> 		return -EINVAL;
> 
> +	if (unlikely(ctx->restricted)) {
> +		if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
> +			return -EACCES;
> +
> +		if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
> +		    ctx->restrictions.sqe_flags_required)
> +			return -EACCES;
> +
> +		if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
> +				  ctx->restrictions.sqe_flags_required))
> +			return -EACCES;
> +	}
> +

This should be a separate function, ala:

if (unlikely(ctx->restricted)) {
	ret = io_check_restriction(ctx, req);
	if (ret)
		return ret;
}

to move it totally out of the (very) hot path.

> 	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
> 	    !io_op_defs[req->opcode].buffer_select)
> 		return -EOPNOTSUPP;
> @@ -8714,6 +8738,71 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
> 	return -EINVAL;
> }
> 
> +static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
> +				    unsigned int nr_args)
> +{
> +	struct io_uring_restriction *res;
> +	size_t size;
> +	int i, ret;
> +
> +	/* We allow only a single restrictions registration */
> +	if (ctx->restricted)
> +		return -EBUSY;
> +
> +	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
> +		return -EINVAL;
> +
> +	size = array_size(nr_args, sizeof(*res));
> +	if (size == SIZE_MAX)
> +		return -EOVERFLOW;
> +
> +	res = memdup_user(arg, size);
> +	if (IS_ERR(res))
> +		return PTR_ERR(res);
> +
> +	for (i = 0; i < nr_args; i++) {
> +		switch (res[i].opcode) {
> +		case IORING_RESTRICTION_REGISTER_OP:
> +			if (res[i].register_op >= IORING_REGISTER_LAST) {
> +				ret = -EINVAL;
> +				goto out;
> +			}
> +
> +			__set_bit(res[i].register_op,
> +				  ctx->restrictions.register_op);
> +			break;
> +		case IORING_RESTRICTION_SQE_OP:
> +			if (res[i].sqe_op >= IORING_OP_LAST) {
> +				ret = -EINVAL;
> +				goto out;
> +			}
> +
> +			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
> +			break;
> +		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
> +			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
> +			break;
> +		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
> +			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
> +			break;
> +		default:
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +	}
> +
> +	ctx->restricted = 1;
> +
> +	ret = 0;

I'd set ret = 0 above the switch, that's the usual idiom - start at
zero, have someone set it to -ERROR if something fails.
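A minimal sketch of that helper, for reference (the name io_check_restriction
comes from Jens' snippet above; passing sqe_flags explicitly is an assumption,
since it is a local variable computed in io_init_req(), and the body is just
the three checks lifted out of the hot path unchanged):

static int io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req,
				u8 sqe_flags)
{
	/* The submitted opcode must be in the allowed set */
	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
		return -EACCES;

	/* All required flags must be present */
	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
	    ctx->restrictions.sqe_flags_required)
		return -EACCES;

	/* No flag outside the allowed/required sets may be set */
	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
			  ctx->restrictions.sqe_flags_required))
		return -EACCES;

	return 0;
}

With that, io_init_req() only pays for the unlikely(ctx->restricted) branch
in the unrestricted common case.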
On Thu, Aug 27, 2020 at 07:49:45AM -0600, Jens Axboe wrote:
> On 8/27/20 7:40 AM, Stefano Garzarella wrote:
> > @@ -6414,6 +6425,19 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> > 	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
> > 		return -EINVAL;
> > 
> > +	if (unlikely(ctx->restricted)) {
> > +		if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
> > +			return -EACCES;
> > +
> > +		if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
> > +		    ctx->restrictions.sqe_flags_required)
> > +			return -EACCES;
> > +
> > +		if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
> > +				  ctx->restrictions.sqe_flags_required))
> > +			return -EACCES;
> > +	}
> > +
> 
> This should be a separate function, ala:
> 
> if (unlikely(ctx->restricted)) {
> 	ret = io_check_restriction(ctx, req);
> 	if (ret)
> 		return ret;
> }
> 
> to move it totally out of the (very) hot path.

I'll fix.

> 
> > 	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
> > 	    !io_op_defs[req->opcode].buffer_select)
> > 		return -EOPNOTSUPP;
> > @@ -8714,6 +8738,71 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
> > 	return -EINVAL;
> > }
> > 
> > +static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
> > +				    unsigned int nr_args)
> > +{
> > +	struct io_uring_restriction *res;
> > +	size_t size;
> > +	int i, ret;
> > +
> > +	/* We allow only a single restrictions registration */
> > +	if (ctx->restricted)
> > +		return -EBUSY;
> > +
> > +	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
> > +		return -EINVAL;
> > +
> > +	size = array_size(nr_args, sizeof(*res));
> > +	if (size == SIZE_MAX)
> > +		return -EOVERFLOW;
> > +
> > +	res = memdup_user(arg, size);
> > +	if (IS_ERR(res))
> > +		return PTR_ERR(res);
> > +
> > +	for (i = 0; i < nr_args; i++) {
> > +		switch (res[i].opcode) {
> > +		case IORING_RESTRICTION_REGISTER_OP:
> > +			if (res[i].register_op >= IORING_REGISTER_LAST) {
> > +				ret = -EINVAL;
> > +				goto out;
> > +			}
> > +
> > +			__set_bit(res[i].register_op,
> > +				  ctx->restrictions.register_op);
> > +			break;
> > +		case IORING_RESTRICTION_SQE_OP:
> > +			if (res[i].sqe_op >= IORING_OP_LAST) {
> > +				ret = -EINVAL;
> > +				goto out;
> > +			}
> > +
> > +			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
> > +			break;
> > +		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
> > +			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
> > +			break;
> > +		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
> > +			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
> > +			break;
> > +		default:
> > +			ret = -EINVAL;
> > +			goto out;
> > +		}
> > +	}
> > +
> > +	ctx->restricted = 1;
> > +
> > +	ret = 0;
> 
> I'd set ret = 0 above the switch, that's the usual idiom - start at
> zero, have someone set it to -ERROR if something fails.

Yes, it is better. I'll fix it.

Thanks,
Stefano
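Applied to io_register_restrictions(), the tail of the function would then
look roughly like this (a sketch of the idiom only, not the respun patch;
the elided cases are unchanged from the version quoted above):

	ret = 0;
	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		/* ... SQE_OP and SQE_FLAGS_* cases unchanged ... */
		default:
			ret = -EINVAL;
			goto out;
		}
	}

	ctx->restricted = 1;
out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));

	kfree(res);
	return ret;

The trailing "ret = 0;" assignment disappears: ret starts at zero and is
only written on failure.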
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6df08287c59e..93b023930b0b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -98,6 +98,8 @@
 #define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
 #define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
 #define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
+#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
+				 IORING_REGISTER_LAST + IORING_OP_LAST)
 
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
@@ -219,6 +221,13 @@ struct io_buffer {
 	__u16 bid;
 };
 
+struct io_restriction {
+	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
+	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+	u8 sqe_flags_allowed;
+	u8 sqe_flags_required;
+};
+
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref refs;
@@ -231,6 +240,7 @@ struct io_ring_ctx {
 	unsigned int		cq_overflow_flushed: 1;
 	unsigned int		drain_next: 1;
 	unsigned int		eventfd_async: 1;
+	unsigned int		restricted: 1;
 
 	/*
 	 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -338,6 +348,7 @@ struct io_ring_ctx {
 	struct llist_head		file_put_llist;
 
 	struct work_struct		exit_work;
+	struct io_restriction		restrictions;
 };
 
 /*
@@ -6414,6 +6425,19 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
 		return -EINVAL;
 
+	if (unlikely(ctx->restricted)) {
+		if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
+			return -EACCES;
+
+		if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
+		    ctx->restrictions.sqe_flags_required)
+			return -EACCES;
+
+		if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
+				  ctx->restrictions.sqe_flags_required))
+			return -EACCES;
+	}
+
 	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
 	    !io_op_defs[req->opcode].buffer_select)
 		return -EOPNOTSUPP;
@@ -8714,6 +8738,71 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 	return -EINVAL;
 }
 
+static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
+				    unsigned int nr_args)
+{
+	struct io_uring_restriction *res;
+	size_t size;
+	int i, ret;
+
+	/* We allow only a single restrictions registration */
+	if (ctx->restricted)
+		return -EBUSY;
+
+	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
+		return -EINVAL;
+
+	size = array_size(nr_args, sizeof(*res));
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	res = memdup_user(arg, size);
+	if (IS_ERR(res))
+		return PTR_ERR(res);
+
+	for (i = 0; i < nr_args; i++) {
+		switch (res[i].opcode) {
+		case IORING_RESTRICTION_REGISTER_OP:
+			if (res[i].register_op >= IORING_REGISTER_LAST) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			__set_bit(res[i].register_op,
+				  ctx->restrictions.register_op);
+			break;
+		case IORING_RESTRICTION_SQE_OP:
+			if (res[i].sqe_op >= IORING_OP_LAST) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+			break;
+		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
+			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+			break;
+		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
+			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	ctx->restricted = 1;
+
+	ret = 0;
+out:
+	/* Reset all restrictions if an error happened */
+	if (ret != 0)
+		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
+
+	kfree(res);
+	return ret;
+}
+
 static bool io_register_op_must_quiesce(int op)
 {
 	switch (op) {
@@ -8760,6 +8849,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 		if (ret) {
 			percpu_ref_resurrect(&ctx->refs);
 			ret = -EINTR;
+			goto out_quiesce;
+		}
+	}
+
+	if (ctx->restricted) {
+		if (opcode >= IORING_REGISTER_LAST) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (!test_bit(opcode, ctx->restrictions.register_op)) {
+			ret = -EACCES;
 			goto out;
 		}
 	}
@@ -8823,15 +8924,19 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_unregister_personality(ctx, nr_args);
 		break;
+	case IORING_REGISTER_RESTRICTIONS:
+		ret = io_register_restrictions(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
 	}
 
+out:
 	if (io_register_op_must_quiesce(opcode)) {
 		/* bring the ctx back to life */
 		percpu_ref_reinit(&ctx->refs);
-out:
+out_quiesce:
 		reinit_completion(&ctx->ref_comp);
 	}
 	return ret;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5f12ae6a415c..6e7f2e5e917b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -267,6 +267,7 @@ enum {
 	IORING_REGISTER_PROBE			= 8,
 	IORING_REGISTER_PERSONALITY		= 9,
 	IORING_UNREGISTER_PERSONALITY		= 10,
+	IORING_REGISTER_RESTRICTIONS		= 11,
 
 	/* this goes last */
 	IORING_REGISTER_LAST
@@ -295,4 +296,34 @@ struct io_uring_probe {
 	struct io_uring_probe_op ops[0];
 };
 
+struct io_uring_restriction {
+	__u16 opcode;
+	union {
+		__u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
+		__u8 sqe_op;      /* IORING_RESTRICTION_SQE_OP */
+		__u8 sqe_flags;   /* IORING_RESTRICTION_SQE_FLAGS_* */
+	};
+	__u8 resv;
+	__u32 resv2[3];
+};
+
+/*
+ * io_uring_restriction->opcode values
+ */
+enum {
+	/* Allow an io_uring_register(2) opcode */
+	IORING_RESTRICTION_REGISTER_OP		= 0,
+
+	/* Allow an sqe opcode */
+	IORING_RESTRICTION_SQE_OP		= 1,
+
+	/* Allow sqe flags */
+	IORING_RESTRICTION_SQE_FLAGS_ALLOWED	= 2,
+
+	/* Require sqe flags (these flags must be set on each submission) */
+	IORING_RESTRICTION_SQE_FLAGS_REQUIRED	= 3,
+
+	IORING_RESTRICTION_LAST
+};
+
 #endif
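For a sense of how userspace would drive the new uapi, a hypothetical
snippet (not part of this series; the function name, the chosen opcodes,
and the use of the raw syscall rather than a liburing wrapper are all
illustrative) that locks a ring down to fixed-file, fixed-buffer I/O
before handing it to an untrusted component could look like this:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int restrict_ring(int ring_fd)
{
	struct io_uring_restriction res[4];

	/* Zero everything, including the resv fields */
	memset(res, 0, sizeof(res));

	/* Allow only buffer registration via io_uring_register(2) */
	res[0].opcode = IORING_RESTRICTION_REGISTER_OP;
	res[0].register_op = IORING_REGISTER_BUFFERS;

	/* Allow only fixed-buffer reads and writes */
	res[1].opcode = IORING_RESTRICTION_SQE_OP;
	res[1].sqe_op = IORING_OP_READ_FIXED;

	res[2].opcode = IORING_RESTRICTION_SQE_OP;
	res[2].sqe_op = IORING_OP_WRITE_FIXED;

	/* Require every sqe to target a pre-registered (fixed) file */
	res[3].opcode = IORING_RESTRICTION_SQE_FLAGS_REQUIRED;
	res[3].sqe_flags = IOSQE_FIXED_FILE;

	/* Raw io_uring_register(2); __NR_io_uring_register needs a
	 * libc recent enough to define it. */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_RESTRICTIONS, res, 4);
}

After this call succeeds, further restriction registrations fail with
-EBUSY, submissions of any other opcode fail with -EACCES, and only the
register opcodes marked above remain usable on the ring.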