diff mbox series

[v9,06/11] io_uring: introduce attributes for read/write and PI support

Message ID 20241114104517.51726-7-anuj20.g@samsung.com (mailing list archive)
State New
Headers show
Series Read/Write with meta/integrity | expand

Commit Message

Anuj Gupta Nov. 14, 2024, 10:45 a.m. UTC
Add the ability to pass additional attributes along with read/write.
Application can populate an array of 'struct io_uring_attr_vec' and pass
its address using the SQE field:
	__u64	attr_vec_addr;

Along with number of attributes using:
	__u8	nr_attr_indirect;

Overall 16 attributes are allowed and currently one attribute
'ATTR_TYPE_PI' is supported.

With PI attribute, userspace can pass following information:
- flags: integrity check flags IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- len: length of PI/metadata buffer
- addr: address of metadata buffer
- seed: seed value for reftag remapping
- app_tag: application defined 16b value

Process this information to prepare uio_meta_descriptor and pass it down
using kiocb->private.

PI attribute is supported only for direct IO. Also, vectored read/write
operations are not supported with PI currently.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 include/uapi/linux/io_uring.h |  29 ++++++++
 io_uring/io_uring.c           |   1 +
 io_uring/rw.c                 | 128 +++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 |  14 +++-
 4 files changed, 169 insertions(+), 3 deletions(-)

Comments

Christoph Hellwig Nov. 14, 2024, 12:16 p.m. UTC | #1
On Thu, Nov 14, 2024 at 04:15:12PM +0530, Anuj Gupta wrote:
> PI attribute is supported only for direct IO. Also, vectored read/write
> operations are not supported with PI currently.

Eww.  I know it's frustrating for you if maintainers give contradicting
guidance, but this is really an awful interface.  Not only the pointless
indirection which makes the interface hard to use, but limiting it to
not support vectored I/O makes it pretty useless.

I guess I need to do a little read-up on why Pavel wants this, but
from the block/fs perspective the previous interface made so much
more sense.
Pavel Begunkov Nov. 14, 2024, 1:09 p.m. UTC | #2
On 11/14/24 12:16, Christoph Hellwig wrote:
> On Thu, Nov 14, 2024 at 04:15:12PM +0530, Anuj Gupta wrote:
>> PI attribute is supported only for direct IO. Also, vectored read/write
>> operations are not supported with PI currently.

And my apologies Anuj, I've been busy, I hope to take a look
at this series today / tomorrow.

> Eww.  I know it's frustrating for you if maintainers give contradicting
> guidance, but this is really an awful interface.  Not only the pointless

Because once you placed it at a fixed location nothing realistically
will be able to reuse it. Not everyone will need PI, but the assumption
is that there will be more additional types of attributes / parameters.

With SQE128 it's also a problem that now all SQEs are 128 bytes regardless
of whether a particular request needs it or not, and the user will need
to zero them for each request.

The discussion continued in the v6 thread, here

https://lore.kernel.org/all/20241031065535.GA26299@lst.de/T/#m12beca2ede2bd2017796adb391bedec9c95d85c3

and a little bit more here:

https://lore.kernel.org/all/20241031065535.GA26299@lst.de/T/#mc3f7a95915a64551e061d37b33a643676c5d87b2

> indirection which makes the interface hard to use, but limiting it to
> not support vectored I/O makes it pretty useless.

I'm not sure why that's the case (and need to take a look), but I
don't immediately see how it's relevant to that part of the API. It
shouldn't really matter where the main PI structure is located, you
get an iovec pointer and code from there wouldn't be any different.

> I guess I need to do a little read-up on why Pavel wants this, but
> from the block/fs perspective the previous interface made so much
> more sense.
Christoph Hellwig Nov. 14, 2024, 3:19 p.m. UTC | #3
On Thu, Nov 14, 2024 at 01:09:44PM +0000, Pavel Begunkov wrote:
>> Eww.  I know it's frustrating for you if maintainers give contradicting
>> guidance, but this is really an awful interface.  Not only the pointless
>
> Because once you placed it at a fixed location nothing realistically
> will be able to reuse it. Not everyone will need PI, but the assumption
> is that there will be more additional types of attributes / parameters.

So?  If we have a strong enough requirement for something else we
can trivially add another opcode.  Maybe we should just add different
opcodes for read/write with metadata so that folks don't freak out
about this?

> With SQE128 it's also a problem that now all SQEs are 128 bytes regardless
> of whether a particular request needs it or not, and the user will need
> to zero them for each request.

The user is not going to create a SQE128 ring unless they need to,
so this seems like a bit of an odd objection.
diff mbox series

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5d08435b95a8..2e6808f6ba28 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,12 +92,18 @@  struct io_uring_sqe {
 			__u16	addr_len;
 			__u16	__pad3[1];
 		};
+		struct {
+			/* number of elements in the attribute vector */
+			__u8	nr_attr_indirect;
+			__u8	__pad4[3];
+		};
 	};
 	union {
 		struct {
 			__u64	addr3;
 			__u64	__pad2[1];
 		};
+		__u64	attr_vec_addr;
 		__u64	optval;
 		/*
 		 * If the ring is initialized with IORING_SETUP_SQE128, then
@@ -107,6 +113,29 @@  struct io_uring_sqe {
 	};
 };
 
+
+/*
+ * Attributes to be passed with read/write.
+ *
+ * ATTR_TYPE_LAST is not an attribute itself: it bounds both the number of
+ * entries accepted in an attribute vector and the valid range of the
+ * 'type' field of struct io_uring_attr_vec.
+ */
+enum io_uring_attr_type {
+	ATTR_TYPE_PI,	/* protection information, see struct io_uring_attr_pi */
+	/* max supported attributes */
+	ATTR_TYPE_LAST = 16,
+};
+
+struct io_uring_attr_vec {
+	enum io_uring_attr_type	type;	/*
+					 * kind of attribute found at 'addr'.
+					 * NOTE(review): the width of an enum is
+					 * chosen by the compiler and the gap
+					 * before 'addr' is implicit padding --
+					 * confirm this is acceptable for uapi.
+					 */
+	__u64			addr;	/*
+					 * user address of the attribute payload,
+					 * e.g. a struct io_uring_attr_pi when
+					 * type is ATTR_TYPE_PI
+					 */
+};
+
+/*
+ * PI attribute information: payload referenced by an io_uring_attr_vec
+ * entry of type ATTR_TYPE_PI.
+ */
+struct io_uring_attr_pi {
+		__u16	flags;		/* integrity check flags, IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG} */
+		__u16	app_tag;	/* application defined 16b value */
+		__u32	len;		/* length of PI/metadata buffer */
+		__u64	addr;		/* address of metadata buffer */
+		__u64	seed;		/* seed value for reftag remapping */
+		__u64	rsvd;		/* must be zero, otherwise the request fails with -EINVAL */
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index bd71782057de..e32dd118d7c8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3867,6 +3867,7 @@  static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
 	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
+	BUILD_BUG_SQE_ELEM(44, __u8,   nr_attr_indirect);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index cce8bc2ecd3f..93d7451b9370 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,98 @@  static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	return 0;
 }
 
+/*
+ * Snapshot the metadata iterator position and the PI seed into
+ * io->meta_state so that io_meta_restore() can later rewind them.
+ */
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+	struct io_meta_state *saved = &io->meta_state;
+
+	iov_iter_save_state(&io->meta.iter, &saved->iter_meta);
+	saved->seed = io->meta.seed;
+}
+
+/*
+ * Rewind the metadata iterator and the PI seed to the values captured by
+ * io_meta_save_state().
+ */
+static inline void io_meta_restore(struct io_async_rw *io)
+{
+	struct io_meta_state *saved = &io->meta_state;
+
+	iov_iter_restore(&io->meta.iter, &saved->iter_meta);
+	io->meta.seed = saved->seed;
+}
+
+/*
+ * Populate io->meta from a PI attribute and flag the kiocb as carrying
+ * metadata.
+ *
+ * @pi_attr points at a kernel copy of the user's struct io_uring_attr_pi.
+ *
+ * Returns -EINVAL if the reserved field is set, -EOPNOTSUPP for vectored
+ * opcodes (PI is not supported with them), a negative error from
+ * import_ubuf(), or import_ubuf()'s non-negative result on success.
+ */
+static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
+			 const struct io_uring_attr_pi *pi_attr)
+{
+	struct io_async_rw *io = req->async_data;
+	void __user *meta_buf;
+	int ret;
+
+	if (READ_ONCE(pi_attr->rsvd))
+		return -EINVAL;
+	if (io_issue_defs[req->opcode].vectored)
+		return -EOPNOTSUPP;
+
+	io->meta.flags = READ_ONCE(pi_attr->flags);
+	io->meta.app_tag = READ_ONCE(pi_attr->app_tag);
+	io->meta.seed = READ_ONCE(pi_attr->seed);
+
+	meta_buf = u64_to_user_ptr(READ_ONCE(pi_attr->addr));
+	ret = import_ubuf(ddir, meta_buf, READ_ONCE(pi_attr->len),
+			  &io->meta.iter);
+	if (unlikely(ret < 0))
+		return ret;
+
+	/* remember the pristine seed/iterator so a retry can rewind them */
+	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+	io_meta_save_state(io);
+	return ret;
+}
+
+
+/*
+ * Fetch a struct io_uring_attr_pi from user memory at @pi_attr_addr and
+ * pass the kernel copy on to io_prep_rw_pi().
+ *
+ * Returns -EFAULT if the copy fails, otherwise io_prep_rw_pi()'s result.
+ */
+static inline int io_prep_pi_indirect(struct io_kiocb *req, struct io_rw *rw,
+				      int ddir, u64 pi_attr_addr)
+{
+	struct io_uring_attr_pi pi_attr;
+
+	/*
+	 * Use u64_to_user_ptr() rather than a raw cast: casting a u64
+	 * straight to a pointer is a size-mismatched conversion on 32-bit.
+	 */
+	if (copy_from_user(&pi_attr, u64_to_user_ptr(pi_attr_addr),
+			   sizeof(pi_attr)))
+		return -EFAULT;
+	return io_prep_rw_pi(req, rw, ddir, &pi_attr);
+}
+
+/*
+ * Copy in the attribute vector referenced by the SQE and prepare every
+ * attribute it lists.
+ *
+ * @attr_addr: user address of an array of struct io_uring_attr_vec
+ * @nr_attr:   number of entries in that array, at most ATTR_TYPE_LAST
+ *
+ * Returns 0 on success, -EINVAL for too many entries or an out-of-range
+ * type, -EFAULT if the vector cannot be copied, -EBUSY if an attribute
+ * type is listed more than once, -EOPNOTSUPP for a type without a handler,
+ * or the error returned by the attribute's own prep routine.
+ */
+static int io_prep_attr_vec(struct io_kiocb *req, struct io_rw *rw, int ddir,
+			      u64 attr_addr, u8 nr_attr)
+{
+	struct io_uring_attr_vec attr_vec[ATTR_TYPE_LAST];
+	u8 seen[ATTR_TYPE_LAST] = {0};
+	unsigned int t;
+	int i, ret;
+
+	if (nr_attr > ATTR_TYPE_LAST)
+		return -EINVAL;
+	if (copy_from_user(attr_vec, u64_to_user_ptr(attr_addr),
+			   sizeof(attr_vec[0]) * nr_attr))
+		return -EFAULT;
+
+	for (i = 0; i < nr_attr; i++) {
+		/*
+		 * The type comes from userspace; compare it as unsigned so no
+		 * value can slip past the range check and index seen[].
+		 */
+		t = attr_vec[i].type;
+		if (t >= ATTR_TYPE_LAST)
+			return -EINVAL;
+		/* allow each attribute only once, tracked per type */
+		if (seen[t])
+			return -EBUSY;
+		seen[t] = 1;
+
+		switch (t) {
+		case ATTR_TYPE_PI:
+			ret = io_prep_pi_indirect(req, rw, ddir, attr_vec[i].addr);
+			break;
+		default:
+			ret = -EOPNOTSUPP;
+			break;
+		}
+		if (unlikely(ret))
+			return ret;
+	}
+	return 0;
+}
+
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
+	u8 nr_attr_indirect;
 	int ret;
 
 	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +366,29 @@  static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
 	rw->kiocb.dio_complete = NULL;
+	rw->kiocb.ki_flags = 0;
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return io_prep_rw_setup(req, ddir, do_import);
+	ret = io_prep_rw_setup(req, ddir, do_import);
+
+	if (unlikely(ret))
+		return ret;
+
+	nr_attr_indirect = READ_ONCE(sqe->nr_attr_indirect);
+	if (nr_attr_indirect) {
+		u64 attr_vec_usr_addr = READ_ONCE(sqe->attr_vec_addr);
+
+		if (READ_ONCE(sqe->__pad4[0]) || READ_ONCE(sqe->__pad4[1]) ||
+		    READ_ONCE(sqe->__pad4[2]))
+			return -EINVAL;
+
+		ret = io_prep_attr_vec(req, rw, ddir, attr_vec_usr_addr,
+					 nr_attr_indirect);
+	}
+
+	return ret;
 }
 
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -409,7 +514,10 @@  static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
+	if (rw->kiocb.ki_flags & IOCB_HAS_METADATA)	/* request carries PI metadata */
+		io_meta_restore(io);	/* rewind the meta seed and iterator as well */
 	iov_iter_restore(&io->iter, &io->iter_state);
 }
 
@@ -794,7 +902,7 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (!(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(file);
 
-	kiocb->ki_flags = file->f_iocb_flags;
+	kiocb->ki_flags |= file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
 	if (unlikely(ret))
 		return ret;
@@ -828,6 +936,18 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		struct io_async_rw *io = req->async_data;
+
+		/*
+		 * We have a union of meta fields with wpq used for buffered-io
+		 * in io_async_rw, so fail it here.
+		 */
+		if (!(req->file->f_flags & O_DIRECT))
+			return -EOPNOTSUPP;
+		kiocb->private = &io->meta;
+	}
+
 	return 0;
 }
 
@@ -902,6 +1022,8 @@  static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&io->iter, &io->iter_state);
+	if (kiocb->ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 
 	do {
 		/*
@@ -1125,6 +1247,8 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 ret_eagain:
 		iov_iter_restore(&io->iter, &io->iter_state);
+		if (kiocb->ki_flags & IOCB_HAS_METADATA)
+			io_meta_restore(io);
 		if (kiocb->ki_flags & IOCB_WRITE)
 			io_req_end_write(req);
 		return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@ 
 
 #include <linux/pagemap.h>
 
+struct io_meta_state {
+	u32			seed;		/*
+						 * saved copy of io->meta.seed.
+						 * NOTE(review): the uapi seed in
+						 * struct io_uring_attr_pi is __u64;
+						 * confirm 32 bits is enough here.
+						 */
+	struct iov_iter_state	iter_meta;	/* saved position of io->meta.iter */
+};
+
 struct io_async_rw {
 	size_t				bytes_done;
 	struct iov_iter			iter;
@@ -9,7 +14,14 @@  struct io_async_rw {
 	struct iovec			fast_iov;
 	struct iovec			*free_iovec;
 	int				free_iov_nr;
-	struct wait_page_queue		wpq;
+	/* wpq is for buffered io, while meta fields are used with direct io */
+	union {
+		struct wait_page_queue		wpq;
+		struct {
+			struct uio_meta			meta;
+			struct io_meta_state		meta_state;
+		};
+	};
 };
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);