diff mbox series

[v10,06/10] io_uring: introduce attributes for read/write and PI support

Message ID 20241125070633.8042-7-anuj20.g@samsung.com (mailing list archive)
State New
Headers show
Series [v10,01/10] block: define set of integrity flags to be inherited by cloned bip | expand

Commit Message

Anuj Gupta Nov. 25, 2024, 7:06 a.m. UTC
Add the ability to pass additional attributes along with read/write.
Application can populate attribute type and attribute specific information
in 'struct io_uring_attr' and pass its address using the SQE field:
	__u64	attr_ptr;

Along with setting a mask indicating attributes being passed:
	__u64	attr_type_mask;

Overall 64 attributes are allowed and currently one attribute
'ATTR_TYPE_PI' is supported.

With PI attribute, userspace can pass following information:
- flags: integrity check flags IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- len: length of PI/metadata buffer
- addr: address of metadata buffer
- seed: seed value for reftag remapping
- app_tag: application defined 16b value

Process this information to prepare uio_meta_descriptor and pass it down
using kiocb->private.

PI attribute is supported only for direct IO.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 include/uapi/linux/io_uring.h | 31 +++++++++++++
 io_uring/io_uring.c           |  2 +
 io_uring/rw.c                 | 82 ++++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 | 14 +++++-
 4 files changed, 126 insertions(+), 3 deletions(-)

Comments

Pavel Begunkov Nov. 25, 2024, 2:58 p.m. UTC | #1
On 11/25/24 07:06, Anuj Gupta wrote:
> Add the ability to pass additional attributes along with read/write.
> Application can populate attribute type and attibute specific information
> in 'struct io_uring_attr' and pass its address using the SQE field:
> 	__u64	attr_ptr;
> 
> Along with setting a mask indicating attributes being passed:
> 	__u64	attr_type_mask;
> 
> Overall 64 attributes are allowed and currently one attribute
> 'ATTR_TYPE_PI' is supported.
> 
> With PI attribute, userspace can pass following information:
> - flags: integrity check flags IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
> - len: length of PI/metadata buffer
> - addr: address of metadata buffer
> - seed: seed value for reftag remapping
> - app_tag: application defined 16b value

The API and io_uring parts look good, I'll ask to address the
ATTR_TYPE comment below, the rest are nits, which can be
ignored and/or delayed.

> Process this information to prepare uio_meta_descriptor and pass it down
> using kiocb->private.

I'm not sure using ->private is a good thing, but I assume it
was discussed, so I'll leave it to Jens and other folks.


> PI attribute is supported only for direct IO.
> 
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> ---
>   include/uapi/linux/io_uring.h | 31 +++++++++++++
>   io_uring/io_uring.c           |  2 +
>   io_uring/rw.c                 | 82 ++++++++++++++++++++++++++++++++++-
>   io_uring/rw.h                 | 14 +++++-
>   4 files changed, 126 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index aac9a4f8fa9a..bf28d49583ad 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -98,6 +98,10 @@ struct io_uring_sqe {
>   			__u64	addr3;
>   			__u64	__pad2[1];
>   		};
> +		struct {
> +			__u64	attr_ptr; /* pointer to attribute information */
> +			__u64	attr_type_mask; /* bit mask of attributes */
> +		};
>   		__u64	optval;
>   		/*
>   		 * If the ring is initialized with IORING_SETUP_SQE128, then
> @@ -107,6 +111,33 @@ struct io_uring_sqe {
>   	};
>   };
>   
> +
> +/* Attributes to be passed with read/write */
> +enum io_uring_attr_type {
> +	ATTR_TYPE_PI,
> +	/* max supported attributes */
> +	ATTR_TYPE_LAST = 64,

ATTR_TYPE sounds too generic, too easy to get a symbol collision
including with user space code.

Some options: IORING_ATTR_TYPE_PI, IORING_RW_ATTR_TYPE_PI.
If it's not supposed to be io_uring specific can be
IO_RW_ATTR_TYPE_PI

> +};
> +
> +/* sqe->attr_type_mask flags */
> +#define ATTR_FLAG_PI	(1U << ATTR_TYPE_PI)
> +/* PI attribute information */
> +struct io_uring_attr_pi {
> +		__u16	flags;
> +		__u16	app_tag;
> +		__u32	len;
> +		__u64	addr;
> +		__u64	seed;
> +		__u64	rsvd;
> +};
> +
> +/* attribute information along with type */
> +struct io_uring_attr {
> +	enum io_uring_attr_type	attr_type;

I'm not against it, but adding a type field to each attribute is not
strictly needed, you can already derive where each attr placed purely
from the mask. Are there some upsides? But again I'm not against it.

> +	/* type specific struct here */
> +	struct io_uring_attr_pi	pi;
> +};
> +
>   /*
>    * If sqe->file_index is set to this for opcodes that instantiate a new
>    * direct descriptor (like openat/openat2/accept), then io_uring will allocate
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index c3a7d0197636..02291ea679fb 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -3889,6 +3889,8 @@ static int __init io_uring_init(void)
>   	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
>   	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
>   	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
> +	BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
> +	BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
>   	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
>   
>   	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 0bcb83e4ce3c..71bfb74fef96 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -257,11 +257,54 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
>   	return 0;
>   }
...
> @@ -902,6 +976,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
>   	 * manually if we need to.
>   	 */
>   	iov_iter_restore(&io->iter, &io->iter_state);
> +	if (kiocb->ki_flags & IOCB_HAS_METADATA)
> +		io_meta_restore(io);

That can be turned into a helper, but that can be done as a follow up.

I'd also add a IOCB_HAS_METADATA into or around of
io_rw_should_retry(). You're relying on O_DIRECT checks, but that
sounds a bit fragile in the long run.
Anuj Gupta Nov. 26, 2024, 10:40 a.m. UTC | #2
On Mon, Nov 25, 2024 at 02:58:19PM +0000, Pavel Begunkov wrote:
> On 11/25/24 07:06, Anuj Gupta wrote:
> 
> ATTR_TYPE sounds too generic, too easy to get a symbol collision
> including with user space code.
> 
> Some options: IORING_ATTR_TYPE_PI, IORING_RW_ATTR_TYPE_PI.
> If it's not supposed to be io_uring specific can be
> IO_RW_ATTR_TYPE_PI
> 

Sure, will change to a different name in the next iteration.

> > +
> > +/* attribute information along with type */
> > +struct io_uring_attr {
> > +	enum io_uring_attr_type	attr_type;
> 
> I'm not against it, but adding a type field to each attribute is not
> strictly needed, you can already derive where each attr placed purely
> from the mask. Are there some upsides? But again I'm not against it.
> 

The mask indicates what all attributes have been passed. But while
processing we would need to know where exactly the attributes have been
placed, as attributes are of different sizes (each attribute is of
fixed size though) and they could be placed in any order. Processing when
multiple attributes are passed would look something like this:

attr_ptr = READ_ONCE(sqe->attr_ptr);
attr_mask = READ_ONCE(sqe->attr_type_mask);
size = total_size_of_attributes_passed_from_attr_mask;

copy_from_user(attr_buf, attr_ptr, size);

while (size > 0) {
	if (sizeof(io_uring_attr_type) > size)
		break;

	attr_type = get_type(attr_buf);
	attr_size = get_size(attr_type);

	process_attr(attr_type, attr_buf);
	attr_buf += attr_size;
	size -= attr_size;
}

We cannot derive where exactly the attribute information is placed
purely from the mask, so we need the type field. Do you see it
differently?
Pavel Begunkov Nov. 26, 2024, 12:53 p.m. UTC | #3
On 11/26/24 10:40, Anuj Gupta wrote:
> On Mon, Nov 25, 2024 at 02:58:19PM +0000, Pavel Begunkov wrote:
>> On 11/25/24 07:06, Anuj Gupta wrote:
>>
>> ATTR_TYPE sounds too generic, too easy to get a symbol collision
>> including with user space code.
>>
>> Some options: IORING_ATTR_TYPE_PI, IORING_RW_ATTR_TYPE_PI.
>> If it's not supposed to be io_uring specific can be
>> IO_RW_ATTR_TYPE_PI
>>
> 
> Sure, will change to a different name in the next iteration.
> 
>>> +
>>> +/* attribute information along with type */
>>> +struct io_uring_attr {
>>> +	enum io_uring_attr_type	attr_type;
>>
>> I'm not against it, but adding a type field to each attribute is not
>> strictly needed, you can already derive where each attr placed purely
>> from the mask. Are there some upsides? But again I'm not against it.
>>
> 
> The mask indicates what all attributes have been passed. But while
> processing we would need to know where exactly the attributes have been
> placed, as attributes are of different sizes (each attribute is of
> fixed size though) and they could be placed in any order. Processing when
> multiple attributes are passed would look something like this:
> 
> attr_ptr = READ_ONCE(sqe->attr_ptr);
> attr_mask = READ_ONCE(sqe->attr_type_mask);
> size = total_size_of_attributes_passed_from_attr_mask;
> 
> copy_from_user(attr_buf, attr_ptr, size);
> 
> while (size > 0) {
> 	if (sizeof(io_uring_attr_type) > size)
> 		break;
> 
> 	attr_type = get_type(attr_buf);
> 	attr_size = get_size(attr_type);
> 
> 	process_attr(attr_type, attr_buf);
> 	attr_buf += attr_size;
> 	size -= attr_size;
> }
> 
> We cannot derive where exactly the attribute information is placed
> purely from the mask, so we need the type field. Do you see it
> differently?

In the mask version I outlined attributes go in the array in order
of their types, max 1 attribute of each type, in which case the
mask fully describes the placement.

static attr_param_sizes[] = {[TYPE_PI] = sizeof(pi), ...};
mask = READ_ONCE(sqe->mask);
off = 0;

for (type = 0; type < NR_TYPE; type++) { // or find_next_bit trick
	if (!(mask & BIT(type)))
		continue;
	process(type, pointer = base + off);
	off += attr_param_sizes[type];
}


Maybe it's a good idea to have a type field for double checking
though, and with it we don't have to commit to one version or
another yet.
Pavel Begunkov Nov. 26, 2024, 1:01 p.m. UTC | #4
On 11/25/24 07:06, Anuj Gupta wrote:
...
> +/* sqe->attr_type_mask flags */
> +#define ATTR_FLAG_PI	(1U << ATTR_TYPE_PI)
> +/* PI attribute information */
> +struct io_uring_attr_pi {
> +		__u16	flags;
> +		__u16	app_tag;
> +		__u32	len;
> +		__u64	addr;
> +		__u64	seed;
> +		__u64	rsvd;
> +};
> +
> +/* attribute information along with type */
> +struct io_uring_attr {
> +	enum io_uring_attr_type	attr_type;

Hmm, I think there will be implicit padding, we need to deal
with it.

> +	/* type specific struct here */
> +	struct io_uring_attr_pi	pi;
> +};

This also looks PI specific but with a generic name. Or are
attribute structures supposed to be unionised?
Pavel Begunkov Nov. 26, 2024, 1:04 p.m. UTC | #5
On 11/26/24 13:01, Pavel Begunkov wrote:
> On 11/25/24 07:06, Anuj Gupta wrote:
> ...
>> +/* sqe->attr_type_mask flags */
>> +#define ATTR_FLAG_PI    (1U << ATTR_TYPE_PI)
>> +/* PI attribute information */
>> +struct io_uring_attr_pi {
>> +        __u16    flags;
>> +        __u16    app_tag;
>> +        __u32    len;
>> +        __u64    addr;
>> +        __u64    seed;
>> +        __u64    rsvd;
>> +};
>> +
>> +/* attribute information along with type */
>> +struct io_uring_attr {
>> +    enum io_uring_attr_type    attr_type;
> 
> Hmm, I think there will be implicit padding, we need to deal
> with it.

And it's better to be explicitly sized, e.g.
s/enum io_uring_attr_type/__u16/

>> +    /* type specific struct here */
>> +    struct io_uring_attr_pi    pi;
>> +};
> 
> This also looks PI specific but with a generic name. Or are
> attribute structures are supposed to be unionised?
>
Anuj Gupta Nov. 26, 2024, 1:54 p.m. UTC | #6
On Tue, Nov 26, 2024 at 01:01:03PM +0000, Pavel Begunkov wrote:
> On 11/25/24 07:06, Anuj Gupta wrote:
> ...
> > +	/* type specific struct here */
> > +	struct io_uring_attr_pi	pi;
> > +};
> 
> This also looks PI specific but with a generic name. Or are
> attribute structures are supposed to be unionised?

Yes, attribute structures would be unionised here. This is done so that
"attr_type" always remains at the top. When there are multiple attributes
this structure would look something like this:

/* attribute information along with type */
struct io_uring_attr {
	enum io_uring_attr_type attr_type;
	/* type specific struct here */
	union {
		struct io_uring_attr_pi	pi;
		struct io_uring_attr_x	x;
		struct io_uring_attr_y	y;
	};
};

And then on the application side for sending attribute x, one would do:

io_uring_attr attr;
attr.type = TYPE_X;
prepare_attr(&attr.x);
Pavel Begunkov Nov. 26, 2024, 3:45 p.m. UTC | #7
On 11/26/24 13:54, Anuj Gupta wrote:
> On Tue, Nov 26, 2024 at 01:01:03PM +0000, Pavel Begunkov wrote:
>> On 11/25/24 07:06, Anuj Gupta wrote:
>> ...
>>> +	/* type specific struct here */
>>> +	struct io_uring_attr_pi	pi;
>>> +};
>>
>> This also looks PI specific but with a generic name. Or are
>> attribute structures are supposed to be unionised?
> 
> Yes, attribute structures would be unionised here. This is done so that
> "attr_type" always remains at the top. When there are multiple attributes
> this structure would look something like this:
> 
> /* attribute information along with type */
> struct io_uring_attr {
> 	enum io_uring_attr_type attr_type;
> 	/* type specific struct here */
> 	union {
> 		struct io_uring_attr_pi	pi;
> 		struct io_uring_attr_x	x;
> 		struct io_uring_attr_y	y;
> 	};
> };
> 
> And then on the application side for sending attribute x, one would do:
> 
> io_uring_attr attr;
> attr.type = TYPE_X;
> prepare_attr(&attr.x);

Hmm, I have doubts it's going to work well because the union
members have different sizes. Adding a new type could grow
struct io_uring_attr, which is already bad for uapi. And it
can't be stacked:

io_uring_attr attrs[2] = {..., ...}
sqe->attr_ptr = &attrs;
...

This example would be incorrect. Even if it's just one attribute
the user would be wasting space on stack. The only use for it I
see is having ephemeral pointers during parsing, ala

void parse(void *attributes, offset) {
	struct io_uring_attr *attr = attributes + offset;
	
	if (attr->type == PI) {
		process_pi(&attr->pi);
		// or potentially fill_pi() in userspace
	}
}

But I don't think it's worth it. I'd say, if you're leaving
the structure, let's rename it to struct io_uring_attr_type_pi
or something similar. We can always add a new one later, it
doesn't change the ABI.
Anuj gupta Nov. 26, 2024, 4:23 p.m. UTC | #8
On Tue, Nov 26, 2024 at 9:14 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 11/26/24 13:54, Anuj Gupta wrote:
> > On Tue, Nov 26, 2024 at 01:01:03PM +0000, Pavel Begunkov wrote:
> >> On 11/25/24 07:06, Anuj Gupta wrote:
> >> ...
> >>> +   /* type specific struct here */
> >>> +   struct io_uring_attr_pi pi;
> >>> +};
> >>
> >> This also looks PI specific but with a generic name. Or are
> >> attribute structures are supposed to be unionised?
> >
> > Yes, attribute structures would be unionised here. This is done so that
> > "attr_type" always remains at the top. When there are multiple attributes
> > this structure would look something like this:
> >
> > /* attribute information along with type */
> > struct io_uring_attr {
> >       enum io_uring_attr_type attr_type;
> >       /* type specific struct here */
> >       union {
> >               struct io_uring_attr_pi pi;
> >               struct io_uring_attr_x  x;
> >               struct io_uring_attr_y  y;
> >       };
> > };
> >
> > And then on the application side for sending attribute x, one would do:
> >
> > io_uring_attr attr;
> > attr.type = TYPE_X;
> > prepare_attr(&attr.x);
>
> Hmm, I have doubts it's going to work well because the union
> members have different sizes. Adding a new type could grow
> struct io_uring_attr, which is already bad for uapi. And it
> can't be stacked:
>
> io_uring_attr attrs[2] = {..., ...}
> sqe->attr_ptr = &attrs;
> ...
>
> This example would be incorrect. Even if it's just one attribute
> the user would be wasting space on stack. The only use for it I
> see is having ephemeral pointers during parsing, ala
>
> void parse(voud *attributes, offset) {
>         struct io_uring_attr *attr = attributes + offset;
>
>         if (attr->type == PI) {
>                 process_pi(&attr->pi);
>                 // or potentially fill_pi() in userspace
>         }
> }
>
> But I don't think it's worth it. I'd say, if you're leaving
> the structure, let's rename it to struct io_uring_attr_type_pi
> or something similar. We can always add a new one later, it
> doesn't change the ABI.
>

In that case I can just drop the io_uring_attr_pi structure then. We can
keep the mask version where we won't need the type and attributes would go
in the array in order of their types as you suggested here [1]. Does that
sound fine?

[1] https://lore.kernel.org/io-uring/37ba07f6-27a5-45bc-86c4-df9c63908ef9@gmail.com/
Anuj Gupta Nov. 27, 2024, 9:46 a.m. UTC | #9
On Tue, Nov 26, 2024 at 03:45:09PM +0000, Pavel Begunkov wrote:
> On 11/26/24 13:54, Anuj Gupta wrote:
> > On Tue, Nov 26, 2024 at 01:01:03PM +0000, Pavel Begunkov wrote:
> > > On 11/25/24 07:06, Anuj Gupta wrote:
> 
> Hmm, I have doubts it's going to work well because the union
> members have different sizes. Adding a new type could grow
> struct io_uring_attr, which is already bad for uapi. And it
> can't be stacked:
> 

How about something like this [1]. I have removed the io_uring_attr
structure, and with the mask scheme the user would pass attributes in
order of their types. Do you still see some cracks?

[1]

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index aac9a4f8fa9a..38f0d6b10eaf 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -98,6 +98,10 @@ struct io_uring_sqe {
 			__u64	addr3;
 			__u64	__pad2[1];
 		};
+		struct {
+			__u64	attr_ptr; /* pointer to attribute information */
+			__u64	attr_type_mask; /* bit mask of attributes */
+		};
 		__u64	optval;
 		/*
 		 * If the ring is initialized with IORING_SETUP_SQE128, then
@@ -107,6 +111,18 @@ struct io_uring_sqe {
 	};
 };
 
+/* sqe->attr_type_mask flags */
+#define IORING_RW_ATTR_FLAG_PI	(1U << 0)
+/* PI attribute information */
+struct io_uring_attr_pi {
+		__u16	flags;
+		__u16	app_tag;
+		__u32	len;
+		__u64	addr;
+		__u64	seed;
+		__u64	rsvd;
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c3a7d0197636..02291ea679fb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3889,6 +3889,8 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+	BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
+	BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 0bcb83e4ce3c..8d2ec89fd76b 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,53 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	return 0;
 }
 
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+	io->meta_state.seed = io->meta.seed;
+	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
+{
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		io->meta.seed = io->meta_state.seed;
+		iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+	}
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
+			 u64 attr_ptr, u64 attr_type_mask)
+{
+	struct io_uring_attr_pi pi_attr;
+	struct io_async_rw *io;
+	int ret;
+
+	if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
+	    sizeof(pi_attr)))
+		return -EFAULT;
+
+	if (pi_attr.rsvd)
+		return -EINVAL;
+
+	io = req->async_data;
+	io->meta.flags = pi_attr.flags;
+	io->meta.app_tag = pi_attr.app_tag;
+	io->meta.seed = READ_ONCE(pi_attr.seed);
+	ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr),
+			  pi_attr.len, &io->meta.iter);
+	if (unlikely(ret < 0))
+		return ret;
+	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+	io_meta_save_state(io);
+	return ret;
+}
+
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
+	u64 attr_type_mask;
 	int ret;
 
 	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +321,28 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
 	rw->kiocb.dio_complete = NULL;
+	rw->kiocb.ki_flags = 0;
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return io_prep_rw_setup(req, ddir, do_import);
+	ret = io_prep_rw_setup(req, ddir, do_import);
+
+	if (unlikely(ret))
+		return ret;
+
+	attr_type_mask = READ_ONCE(sqe->attr_type_mask);
+	if (attr_type_mask) {
+		u64 attr_ptr;
+
+		/* only PI attribute is supported currently */
+		if (attr_type_mask != IORING_RW_ATTR_FLAG_PI)
+			return -EINVAL;
+
+		attr_ptr = READ_ONCE(sqe->attr_ptr);
+		ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
+	}
+	return ret;
 }
 
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -409,7 +468,9 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
+	io_meta_restore(io, &rw->kiocb);
 	iov_iter_restore(&io->iter, &io->iter_state);
 }
 
@@ -744,6 +805,10 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
 		return false;
 
+	/* never retry for meta io */
+	if (kiocb->ki_flags & IOCB_HAS_METADATA)
+		return false;
+
 	/*
 	 * just use poll if we can, and don't attempt if the fs doesn't
 	 * support callback based unlocks
@@ -794,7 +859,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (!(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(file);
 
-	kiocb->ki_flags = file->f_iocb_flags;
+	kiocb->ki_flags |= file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
 	if (unlikely(ret))
 		return ret;
@@ -828,6 +893,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		struct io_async_rw *io = req->async_data;
+
+		/*
+		 * We have a union of meta fields with wpq used for buffered-io
+		 * in io_async_rw, so fail it here.
+		 */
+		if (!(req->file->f_flags & O_DIRECT))
+			return -EOPNOTSUPP;
+		kiocb->private = &io->meta;
+	}
+
 	return 0;
 }
 
@@ -902,6 +979,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&io->iter, &io->iter_state);
+	io_meta_restore(io, kiocb);
 
 	do {
 		/*
@@ -1125,6 +1203,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 ret_eagain:
 		iov_iter_restore(&io->iter, &io->iter_state);
+		io_meta_restore(io, kiocb);
 		if (kiocb->ki_flags & IOCB_WRITE)
 			io_req_end_write(req);
 		return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@
 
 #include <linux/pagemap.h>
 
+struct io_meta_state {
+	u32			seed;
+	struct iov_iter_state	iter_meta;
+};
+
 struct io_async_rw {
 	size_t				bytes_done;
 	struct iov_iter			iter;
@@ -9,7 +14,14 @@ struct io_async_rw {
 	struct iovec			fast_iov;
 	struct iovec			*free_iovec;
 	int				free_iov_nr;
-	struct wait_page_queue		wpq;
+	/* wpq is for buffered io, while meta fields are used with direct io */
+	union {
+		struct wait_page_queue		wpq;
+		struct {
+			struct uio_meta			meta;
+			struct io_meta_state		meta_state;
+		};
+	};
 };
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
Pavel Begunkov Nov. 27, 2024, 10:35 a.m. UTC | #10
On 11/26/24 16:23, Anuj gupta wrote:
> On Tue, Nov 26, 2024 at 9:14 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
...
>> This example would be incorrect. Even if it's just one attribute
>> the user would be wasting space on stack. The only use for it I
>> see is having ephemeral pointers during parsing, ala
>>
>> void parse(voud *attributes, offset) {
>>          struct io_uring_attr *attr = attributes + offset;
>>
>>          if (attr->type == PI) {
>>                  process_pi(&attr->pi);
>>                  // or potentially fill_pi() in userspace
>>          }
>> }
>>
>> But I don't think it's worth it. I'd say, if you're leaving
>> the structure, let's rename it to struct io_uring_attr_type_pi
>> or something similar. We can always add a new one later, it
>> doesn't change the ABI.
>>
> 
> In that case I can just drop the io_uring_attr_pi structure then. We can
> keep the mask version where we won't need the type and attributes would go
> in the array in order of their types as you suggested here [1]. Does that
> sound fine?

That should work, the approach in this patchset is fine as well.
I'll take a look at the path a bit later today.

> [1] https://lore.kernel.org/io-uring/37ba07f6-27a5-45bc-86c4-df9c63908ef9@gmail.com/
Pavel Begunkov Nov. 27, 2024, 11:24 a.m. UTC | #11
On 11/27/24 09:46, Anuj Gupta wrote:
> On Tue, Nov 26, 2024 at 03:45:09PM +0000, Pavel Begunkov wrote:
>> On 11/26/24 13:54, Anuj Gupta wrote:
>>> On Tue, Nov 26, 2024 at 01:01:03PM +0000, Pavel Begunkov wrote:
>>>> On 11/25/24 07:06, Anuj Gupta wrote:
>>
>> Hmm, I have doubts it's going to work well because the union
>> members have different sizes. Adding a new type could grow
>> struct io_uring_attr, which is already bad for uapi. And it
>> can't be stacked:
>>
> 
> How about something like this [1]. I have removed the io_uring_attr
> structure, and with the mask scheme the user would pass attributes in
> order of their types. Do you still see some cracks?

Looks good to me

> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
...
> +static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
> +			 u64 attr_ptr, u64 attr_type_mask)
> +{
> +	struct io_uring_attr_pi pi_attr;
> +	struct io_async_rw *io;
> +	int ret;
> +
> +	if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
> +	    sizeof(pi_attr)))
> +		return -EFAULT;
> +
> +	if (pi_attr.rsvd)
> +		return -EINVAL;
> +
> +	io = req->async_data;
> +	io->meta.flags = pi_attr.flags;
> +	io->meta.app_tag = pi_attr.app_tag;
> +	io->meta.seed = READ_ONCE(pi_attr.seed);

Seems an unnecessary READ_ONCE slipped here

> +	ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr),
> +			  pi_attr.len, &io->meta.iter);
> +	if (unlikely(ret < 0))
> +		return ret;
> +	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
> +	io_meta_save_state(io);
> +	return ret;
> +}
...
diff mbox series

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index aac9a4f8fa9a..bf28d49583ad 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -98,6 +98,10 @@  struct io_uring_sqe {
 			__u64	addr3;
 			__u64	__pad2[1];
 		};
+		struct {
+			__u64	attr_ptr; /* pointer to attribute information */
+			__u64	attr_type_mask; /* bit mask of attributes */
+		};
 		__u64	optval;
 		/*
 		 * If the ring is initialized with IORING_SETUP_SQE128, then
@@ -107,6 +111,33 @@  struct io_uring_sqe {
 	};
 };
 
+
+/* Attributes to be passed with read/write */
+enum io_uring_attr_type {
+	ATTR_TYPE_PI,
+	/* max supported attributes */
+	ATTR_TYPE_LAST = 64,
+};
+
+/* sqe->attr_type_mask flags */
+#define ATTR_FLAG_PI	(1U << ATTR_TYPE_PI)
+/* PI attribute information */
+struct io_uring_attr_pi {
+		__u16	flags;
+		__u16	app_tag;
+		__u32	len;
+		__u64	addr;
+		__u64	seed;
+		__u64	rsvd;
+};
+
+/* attribute information along with type */
+struct io_uring_attr {
+	enum io_uring_attr_type	attr_type;
+	/* type specific struct here */
+	struct io_uring_attr_pi	pi;
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c3a7d0197636..02291ea679fb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3889,6 +3889,8 @@  static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+	BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
+	BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 0bcb83e4ce3c..71bfb74fef96 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,54 @@  static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	return 0;
 }
 
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+	io->meta_state.seed = io->meta.seed;
+	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io)
+{
+	io->meta.seed = io->meta_state.seed;
+	iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
+			 u64 attr_ptr, u64 attr_type_mask)
+{
+	struct io_uring_attr pi_attr;
+	struct io_async_rw *io;
+	int ret;
+
+	if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
+	    sizeof(pi_attr)))
+		return -EFAULT;
+
+	if (pi_attr.attr_type != ATTR_TYPE_PI)
+		return -EINVAL;
+
+	if (pi_attr.pi.rsvd)
+		return -EINVAL;
+
+	io = req->async_data;
+	io->meta.flags = pi_attr.pi.flags;
+	io->meta.app_tag = pi_attr.pi.app_tag;
+	io->meta.seed = READ_ONCE(pi_attr.pi.seed);
+	ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.pi.addr),
+			  pi_attr.pi.len, &io->meta.iter);
+	if (unlikely(ret < 0))
+		return ret;
+	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+	io_meta_save_state(io);
+	return ret;
+}
+
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
+	u64 attr_type_mask;
 	int ret;
 
 	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +322,27 @@  static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
 	rw->kiocb.dio_complete = NULL;
+	rw->kiocb.ki_flags = 0;
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return io_prep_rw_setup(req, ddir, do_import);
+	ret = io_prep_rw_setup(req, ddir, do_import);
+
+	if (unlikely(ret))
+		return ret;
+
+	attr_type_mask = READ_ONCE(sqe->attr_type_mask);
+	if (attr_type_mask) {
+		u64 attr_ptr;
+
+		if (attr_type_mask != ATTR_FLAG_PI)
+			return -EINVAL;
+
+		attr_ptr = READ_ONCE(sqe->attr_ptr);
+		ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
+	}
+	return ret;
 }
 
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -409,7 +468,10 @@  static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
+	if (rw->kiocb.ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 	iov_iter_restore(&io->iter, &io->iter_state);
 }
 
@@ -794,7 +856,7 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (!(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(file);
 
-	kiocb->ki_flags = file->f_iocb_flags;
+	kiocb->ki_flags |= file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
 	if (unlikely(ret))
 		return ret;
@@ -828,6 +890,18 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		struct io_async_rw *io = req->async_data;
+
+		/*
+		 * We have a union of meta fields with wpq used for buffered-io
+		 * in io_async_rw, so fail it here.
+		 */
+		if (!(req->file->f_flags & O_DIRECT))
+			return -EOPNOTSUPP;
+		kiocb->private = &io->meta;
+	}
+
 	return 0;
 }
 
@@ -902,6 +976,8 @@  static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&io->iter, &io->iter_state);
+	if (kiocb->ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 
 	do {
 		/*
@@ -1125,6 +1201,8 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 ret_eagain:
 		iov_iter_restore(&io->iter, &io->iter_state);
+		if (kiocb->ki_flags & IOCB_HAS_METADATA)
+			io_meta_restore(io);
 		if (kiocb->ki_flags & IOCB_WRITE)
 			io_req_end_write(req);
 		return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@ 
 
 #include <linux/pagemap.h>
 
+struct io_meta_state {
+	u32			seed;
+	struct iov_iter_state	iter_meta;
+};
+
 struct io_async_rw {
 	size_t				bytes_done;
 	struct iov_iter			iter;
@@ -9,7 +14,14 @@  struct io_async_rw {
 	struct iovec			fast_iov;
 	struct iovec			*free_iovec;
 	int				free_iov_nr;
-	struct wait_page_queue		wpq;
+	/* wpq is for buffered io, while meta fields are used with direct io */
+	union {
+		struct wait_page_queue		wpq;
+		struct {
+			struct uio_meta			meta;
+			struct io_meta_state		meta_state;
+		};
+	};
 };
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);