Message ID | 20240626100700.3629-9-anuj20.g@samsung.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v2,01/10] block: change rq_integrity_vec to respect the iterator | expand |
Anuj Gupta <anuj20.g@samsung.com> writes: > This patch adds the capability of sending meta along with read/write. > This meta is represented by a newly introduced 'struct io_uring_meta' > which specifies information such as meta type/flags/buffer/length and > apptag. > Application sets up a SQE128 ring, prepares io_uring_meta within the SQE > at offset pointed by sqe->cmd. > The patch processes the user-passed information to prepare uio_meta > descriptor and passes it down using kiocb->private. > > Meta exchange is supported only for direct IO. > Also vectored read/write operations with meta are not supported > currently. > > Signed-off-by: Anuj Gupta <anuj20.g@samsung.com> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com> > --- > include/linux/fs.h | 1 + > include/uapi/linux/io_uring.h | 30 +++++++++++++++- > io_uring/io_uring.c | 7 ++++ > io_uring/rw.c | 68 +++++++++++++++++++++++++++++++++-- > io_uring/rw.h | 9 ++++- > 5 files changed, 110 insertions(+), 5 deletions(-) > > diff --git a/include/linux/fs.h b/include/linux/fs.h > index db26b4a70c62..0132565288c2 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -330,6 +330,7 @@ struct readahead_control; > #define IOCB_NOIO (1 << 20) > /* can use bio alloc cache */ > #define IOCB_ALLOC_CACHE (1 << 21) > +#define IOCB_HAS_META (1 << 22) > /* > * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the > * iocb completion can be passed back to the owner for execution from a safe > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index 2aaf7ee256ac..9140c66b315b 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -101,12 +101,40 @@ struct io_uring_sqe { > __u64 optval; > /* > * If the ring is initialized with IORING_SETUP_SQE128, then > - * this field is used for 80 bytes of arbitrary command data > + * this field is starting offset for 80 bytes of data. > + * This data is opaque for uring command op. 
And for meta io, > + * this contains 'struct io_uring_meta'. > */ > __u8 cmd[0]; > }; > }; > > +enum io_uring_sqe_meta_type_bits { > + META_TYPE_INTEGRITY_BIT, > + /* not a real meta type; just to make sure that we don't overflow */ > + META_TYPE_LAST_BIT, > +}; > + > +/* meta type flags */ > +#define META_TYPE_INTEGRITY (1U << META_TYPE_INTEGRITY_BIT) > + > +struct io_uring_meta { > + __u16 meta_type; > + __u16 meta_flags; > + __u32 meta_len; > + __u64 meta_addr; > + /* the next 64 bytes goes to SQE128 */ > + __u16 apptag; > + __u8 pad[62]; > +}; > + > +/* > + * flags for integrity meta > + */ > +#define INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */ > +#define INTEGRITY_CHK_APPTAG (1U << 1) /* enforce app tag check */ > +#define INTEGRITY_CHK_REFTAG (1U << 2) /* enforce ref tag check */ > + > /* > * If sqe->file_index is set to this for opcodes that instantiate a new > * direct descriptor (like openat/openat2/accept), then io_uring will allocate > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c > index 7ed1e009aaec..0d26ee1193ca 100644 > --- a/io_uring/io_uring.c > +++ b/io_uring/io_uring.c > @@ -3704,6 +3704,13 @@ static int __init io_uring_init(void) > /* top 8bits are for internal use */ > BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0); > > + BUILD_BUG_ON(sizeof(struct io_uring_meta) > > + 2 * sizeof(struct io_uring_sqe) - > + offsetof(struct io_uring_sqe, cmd)); > + > + BUILD_BUG_ON(META_TYPE_LAST_BIT > > + 8 * sizeof_field(struct io_uring_meta, meta_type)); > + > io_uring_optable_init(); > > /* > diff --git a/io_uring/rw.c b/io_uring/rw.c > index c004d21e2f12..e8f5b5af4d2f 100644 > --- a/io_uring/rw.c > +++ b/io_uring/rw.c > @@ -23,6 +23,8 @@ > #include "poll.h" > #include "rw.h" > > +#define INTEGRITY_VALID_FLAGS (INTEGRITY_CHK_GUARD | INTEGRITY_CHK_APPTAG | \ > + INTEGRITY_CHK_REFTAG) > struct io_rw { > /* NOTE: kiocb has the file as the first member, so don't do it here */ > struct kiocb kiocb; > @@ -247,6 +249,42 @@ static int 
io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) > return 0; > } > > +static int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe, > + struct io_rw *rw, int ddir) > +{ > + const struct io_uring_meta *md = (struct io_uring_meta *)sqe->cmd; > + u16 meta_type = READ_ONCE(md->meta_type); > + const struct io_issue_def *def; > + struct io_async_rw *io; > + int ret; > + > + if (!meta_type) > + return 0; > + if (!(meta_type & META_TYPE_INTEGRITY)) > + return -EINVAL; > + > + /* should fit into two bytes */ > + BUILD_BUG_ON(INTEGRITY_VALID_FLAGS >= (1 << 16)); > + > + def = &io_issue_defs[req->opcode]; > + if (def->vectored) > + return -EOPNOTSUPP; > + > + io = req->async_data; > + io->meta.flags = READ_ONCE(md->meta_flags); > + if (io->meta.flags & ~INTEGRITY_VALID_FLAGS) > + return -EINVAL; > + > + io->meta.apptag = READ_ONCE(md->apptag); > + ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(md->meta_addr)), > + READ_ONCE(md->meta_len), &io->meta.iter); > + if (unlikely(ret < 0)) > + return ret; > + rw->kiocb.ki_flags |= IOCB_HAS_META; > + iov_iter_save_state(&io->meta.iter, &io->iter_meta_state); > + return ret; > +} > + > static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, > int ddir, bool do_import) > { > @@ -269,11 +307,16 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, > rw->kiocb.ki_ioprio = get_current_ioprio(); > } > rw->kiocb.dio_complete = NULL; > + rw->kiocb.ki_flags = 0; > > rw->addr = READ_ONCE(sqe->addr); > rw->len = READ_ONCE(sqe->len); > rw->flags = READ_ONCE(sqe->rw_flags); > - return io_prep_rw_setup(req, ddir, do_import); > + ret = io_prep_rw_setup(req, ddir, do_import); > + > + if (unlikely(req->ctx->flags & IORING_SETUP_SQE128 && !ret)) > + ret = io_prep_rw_meta(req, sqe, rw, ddir); > + return ret; Would it be useful to have a flag to differentiate a malformed SQE from a SQE with io_uring_meta, instead of assuming sqe->cmd has it? 
We don't check for addr3 at the moment and differently from uring_cmd, userspace will be mixing write commands with and without metadata to different files, so it would be useful to catch that. Also, just styling, but can you turn that !ret into a separate if leg? We are bound to add more code here eventually, and the next patch to touch it will end up doing it anyway. I mean: ret = io_prep_rw_setup(req, ddir, do_import); if (unlikely(ret)) return ret; if (unlikely(req->ctx->flags & IORING_SETUP_SQE128)) ret = io_prep_rw_meta(req, sqe, rw, ddir); return ret;
On Wed, Jun 26, 2024 at 10:47 PM Gabriel Krisman Bertazi <krisman@suse.de> wrote: > > Anuj Gupta <anuj20.g@samsung.com> writes: > > > static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, > > int ddir, bool do_import) > > { > > @@ -269,11 +307,16 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, > > rw->kiocb.ki_ioprio = get_current_ioprio(); > > } > > rw->kiocb.dio_complete = NULL; > > + rw->kiocb.ki_flags = 0; > > > > rw->addr = READ_ONCE(sqe->addr); > > rw->len = READ_ONCE(sqe->len); > > rw->flags = READ_ONCE(sqe->rw_flags); > > - return io_prep_rw_setup(req, ddir, do_import); > > + ret = io_prep_rw_setup(req, ddir, do_import); > > + > > + if (unlikely(req->ctx->flags & IORING_SETUP_SQE128 && !ret)) > > + ret = io_prep_rw_meta(req, sqe, rw, ddir); > > + return ret; > > Would it be useful to have a flag to differentiate a malformed SQE from > a SQE with io_uring_meta, instead of assuming sqe->cmd has it? We don't > check for addr3 at the moment and differently from uring_cmd, userspace > will be mixing writes commands with and without metadata to different > files, so it would be useful to catch that. > Yes, but I couldn't find a good place to keep that flag. sqe->rw_flags are RWF flags and are meant for generic read/write. sqe->flags are generic io_uring flags and are not opcode specific. Do you see a place where this flag could fit in? > -- > Gabriel Krisman Bertazi > -- Anuj Gupta
diff --git a/include/linux/fs.h b/include/linux/fs.h index db26b4a70c62..0132565288c2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -330,6 +330,7 @@ struct readahead_control; #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) +#define IOCB_HAS_META (1 << 22) /* * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the * iocb completion can be passed back to the owner for execution from a safe diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2aaf7ee256ac..9140c66b315b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -101,12 +101,40 @@ struct io_uring_sqe { __u64 optval; /* * If the ring is initialized with IORING_SETUP_SQE128, then - * this field is used for 80 bytes of arbitrary command data + * this field is starting offset for 80 bytes of data. + * This data is opaque for uring command op. And for meta io, + * this contains 'struct io_uring_meta'. */ __u8 cmd[0]; }; }; +enum io_uring_sqe_meta_type_bits { + META_TYPE_INTEGRITY_BIT, + /* not a real meta type; just to make sure that we don't overflow */ + META_TYPE_LAST_BIT, +}; + +/* meta type flags */ +#define META_TYPE_INTEGRITY (1U << META_TYPE_INTEGRITY_BIT) + +struct io_uring_meta { + __u16 meta_type; + __u16 meta_flags; + __u32 meta_len; + __u64 meta_addr; + /* the next 64 bytes goes to SQE128 */ + __u16 apptag; + __u8 pad[62]; +}; + +/* + * flags for integrity meta + */ +#define INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */ +#define INTEGRITY_CHK_APPTAG (1U << 1) /* enforce app tag check */ +#define INTEGRITY_CHK_REFTAG (1U << 2) /* enforce ref tag check */ + /* * If sqe->file_index is set to this for opcodes that instantiate a new * direct descriptor (like openat/openat2/accept), then io_uring will allocate diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7ed1e009aaec..0d26ee1193ca 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3704,6 
+3704,13 @@ static int __init io_uring_init(void) /* top 8bits are for internal use */ BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0); + BUILD_BUG_ON(sizeof(struct io_uring_meta) > + 2 * sizeof(struct io_uring_sqe) - + offsetof(struct io_uring_sqe, cmd)); + + BUILD_BUG_ON(META_TYPE_LAST_BIT > + 8 * sizeof_field(struct io_uring_meta, meta_type)); + io_uring_optable_init(); /* diff --git a/io_uring/rw.c b/io_uring/rw.c index c004d21e2f12..e8f5b5af4d2f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -23,6 +23,8 @@ #include "poll.h" #include "rw.h" +#define INTEGRITY_VALID_FLAGS (INTEGRITY_CHK_GUARD | INTEGRITY_CHK_APPTAG | \ + INTEGRITY_CHK_REFTAG) struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -247,6 +249,42 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) return 0; } +static int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_rw *rw, int ddir) +{ + const struct io_uring_meta *md = (struct io_uring_meta *)sqe->cmd; + u16 meta_type = READ_ONCE(md->meta_type); + const struct io_issue_def *def; + struct io_async_rw *io; + int ret; + + if (!meta_type) + return 0; + if (!(meta_type & META_TYPE_INTEGRITY)) + return -EINVAL; + + /* should fit into two bytes */ + BUILD_BUG_ON(INTEGRITY_VALID_FLAGS >= (1 << 16)); + + def = &io_issue_defs[req->opcode]; + if (def->vectored) + return -EOPNOTSUPP; + + io = req->async_data; + io->meta.flags = READ_ONCE(md->meta_flags); + if (io->meta.flags & ~INTEGRITY_VALID_FLAGS) + return -EINVAL; + + io->meta.apptag = READ_ONCE(md->apptag); + ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(md->meta_addr)), + READ_ONCE(md->meta_len), &io->meta.iter); + if (unlikely(ret < 0)) + return ret; + rw->kiocb.ki_flags |= IOCB_HAS_META; + iov_iter_save_state(&io->meta.iter, &io->iter_meta_state); + return ret; +} + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir, bool do_import) { 
@@ -269,11 +307,16 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->kiocb.ki_ioprio = get_current_ioprio(); } rw->kiocb.dio_complete = NULL; + rw->kiocb.ki_flags = 0; rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return io_prep_rw_setup(req, ddir, do_import); + ret = io_prep_rw_setup(req, ddir, do_import); + + if (unlikely(req->ctx->flags & IORING_SETUP_SQE128 && !ret)) + ret = io_prep_rw_meta(req, sqe, rw, ddir); + return ret; } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -400,7 +443,10 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + if (unlikely(rw->kiocb.ki_flags & IOCB_HAS_META)) + iov_iter_restore(&io->meta.iter, &io->iter_meta_state); iov_iter_restore(&io->iter, &io->iter_state); } @@ -768,8 +814,12 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) static bool need_complete_io(struct io_kiocb *req) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + + /* Exclude meta IO as we don't support partial completion for that */ return req->flags & REQ_F_ISREG || - S_ISBLK(file_inode(req->file)->i_mode); + S_ISBLK(file_inode(req->file)->i_mode) || + !(rw->kiocb.ki_flags & IOCB_HAS_META); } static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) @@ -786,7 +836,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); - kiocb->ki_flags = file->f_iocb_flags; + kiocb->ki_flags |= file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; @@ -815,6 +865,14 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) kiocb->ki_complete = io_complete_rw; } + if 
(unlikely(kiocb->ki_flags & IOCB_HAS_META)) { + struct io_async_rw *io = req->async_data; + + if (!(req->file->f_flags & O_DIRECT)) + return -EOPNOTSUPP; + kiocb->private = &io->meta; + } + return 0; } @@ -881,6 +939,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * manually if we need to. */ iov_iter_restore(&io->iter, &io->iter_state); + if (unlikely(kiocb->ki_flags & IOCB_HAS_META)) + iov_iter_restore(&io->meta.iter, &io->iter_meta_state); do { /* @@ -1091,6 +1151,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); + if (unlikely(kiocb->ki_flags & IOCB_HAS_META)) + iov_iter_restore(&io->meta.iter, &io->iter_meta_state); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; diff --git a/io_uring/rw.h b/io_uring/rw.h index 3f432dc75441..49944b539c51 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -9,7 +9,14 @@ struct io_async_rw { struct iovec fast_iov; struct iovec *free_iovec; int free_iov_nr; - struct wait_page_queue wpq; + /* wpq is for buffered io, while meta fields are used with direct io*/ + union { + struct wait_page_queue wpq; + struct { + struct uio_meta meta; + struct iov_iter_state iter_meta_state; + }; + }; }; int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);