
[for-next,3/4] io_uring: grow a field in struct io_uring_cmd

Message ID 20220711110155.649153-4-joshi.k@samsung.com (mailing list archive)
State New, archived
Series [for-next,1/4] io_uring, nvme: rename a function

Commit Message

Kanchan Joshi July 11, 2022, 11:01 a.m. UTC
Use the leftover space to carve out a 'next' field that enables linking of
io_uring_cmd structs. Also introduce a list head and a few helpers.

This is in preparation for supporting nvme-multipath, allowing multiple
uring passthrough commands to be queued.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 include/linux/io_uring.h | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

Comments

Sagi Grimberg July 11, 2022, 5 p.m. UTC | #1
> Use the leftover space to carve out a 'next' field that enables linking of
> io_uring_cmd structs. Also introduce a list head and a few helpers.
> 
> This is in preparation for supporting nvme-multipath, allowing multiple
> uring passthrough commands to be queued.
> 
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> ---
>   include/linux/io_uring.h | 38 ++++++++++++++++++++++++++++++++++++--
>   1 file changed, 36 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index 54063d67506b..d734599cbcd7 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -22,9 +22,14 @@ struct io_uring_cmd {
>   	const void	*cmd;
>   	/* callback to defer completions to task context */
>   	void (*task_work_cb)(struct io_uring_cmd *cmd);
> +	struct io_uring_cmd	*next;
>   	u32		cmd_op;
> -	u32		pad;
> -	u8		pdu[32]; /* available inline for free use */
> +	u8		pdu[28]; /* available inline for free use */
> +};

I think io_uring_cmd will at some point become two cachelines and it may
not be worth the effort to limit the pdu to 28 bytes...
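
For reference, a minimal compile-time guard along these lines (a sketch, not
part of this patch, assuming the usual 64-byte cacheline) is one way to make
the single-cacheline trade-off explicit:

/*
 * Sketch only (not from this series): fail the build if io_uring_cmd
 * grows past 64 bytes, i.e. past one cacheline on common configs.
 */
#include <linux/build_bug.h>
#include <linux/io_uring.h>

static_assert(sizeof(struct io_uring_cmd) <= 64,
	      "struct io_uring_cmd no longer fits in one cacheline");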
Jens Axboe July 11, 2022, 5:18 p.m. UTC | #2
On 7/11/22 5:01 AM, Kanchan Joshi wrote:
> Use the leftover space to carve out a 'next' field that enables linking of
> io_uring_cmd structs. Also introduce a list head and a few helpers.
> 
> This is in preparation for supporting nvme-multipath, allowing multiple
> uring passthrough commands to be queued.

It's not clear to me why we need linking at that level?
Jens Axboe July 11, 2022, 5:19 p.m. UTC | #3
On 7/11/22 11:00 AM, Sagi Grimberg wrote:
> 
>> Use the leftover space to carve out a 'next' field that enables linking of
>> io_uring_cmd structs. Also introduce a list head and a few helpers.
>>
>> This is in preparation for supporting nvme-multipath, allowing multiple
>> uring passthrough commands to be queued.
>>
>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
>> ---
>>   include/linux/io_uring.h | 38 ++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 36 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
>> index 54063d67506b..d734599cbcd7 100644
>> --- a/include/linux/io_uring.h
>> +++ b/include/linux/io_uring.h
>> @@ -22,9 +22,14 @@ struct io_uring_cmd {
>>       const void    *cmd;
>>       /* callback to defer completions to task context */
>>       void (*task_work_cb)(struct io_uring_cmd *cmd);
>> +    struct io_uring_cmd    *next;
>>       u32        cmd_op;
>> -    u32        pad;
>> -    u8        pdu[32]; /* available inline for free use */
>> +    u8        pdu[28]; /* available inline for free use */
>> +};
> 
> I think io_uring_cmd will at some point become two cachelines and it may
> not be worth the effort to limit the pdu to 28 bytes...
Sagi Grimberg July 11, 2022, 5:55 p.m. UTC | #4
>> Use the leftover space to carve out a 'next' field that enables linking of
>> io_uring_cmd structs. Also introduce a list head and a few helpers.
>>
>> This is in preparation for supporting nvme-multipath, allowing multiple
>> uring passthrough commands to be queued.
> 
> It's not clear to me why we need linking at that level?

I think the attempt is to allow something like blk_steal_bios that
nvme leverages for io_uring_cmd(s).

nvme failover steals all the bios from requests that fail (and should
failover) and puts them on a requeue list, and then schedules
a work that takes these bios one-by-one and submits them on a different
bottom namespace (see nvme_failover_req/nvme_requeue_work).
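
For context, a condensed sketch of that bio-stealing path, based on
nvme_failover_req() in drivers/nvme/host/multipath.c (status mangling and
the per-bio redirection loop are omitted here):

/*
 * Condensed sketch of the bio-based failover described above. The bios
 * are detached from the failed request and parked on the ns_head
 * requeue list; nvme_requeue_work() later resubmits them through the
 * multipath device, which picks an available path.
 */
static void nvme_failover_req_sketch(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	unsigned long flags;

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	/* The original request is finished; only its bios live on. */
	blk_mq_end_request(req, BLK_STS_OK);

	kblockd_schedule_work(&ns->head->requeue_work);
}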
Sagi Grimberg July 11, 2022, 6:22 p.m. UTC | #5
>>> Use the leftover space to carve out a 'next' field that enables linking of
>>> io_uring_cmd structs. Also introduce a list head and a few helpers.
>>>
>>> This is in preparation for supporting nvme-multipath, allowing multiple
>>> uring passthrough commands to be queued.
>>
>> It's not clear to me why we need linking at that level?
> 
> I think the attempt is to allow something like blk_steal_bios that
> nvme leverages for io_uring_cmd(s).

I'll rephrase because now that I read it, I think my phrasing is
confusing.

I think the attempt is to allow something like blk_steal_bios that
nvme leverages, but for io_uring_cmd(s). Essentially allow io_uring_cmd
to be linked in a requeue_list.

> 
> nvme failover steals all the bios from requests that fail (and should
> failover) and puts them on a requeue list, and then schedules
> a work that takes these bios one-by-one and submits them on a different
> bottom namespace (see nvme_failover_req/nvme_requeue_work).

Maybe if io_kiocb could be exposed to nvme, and it had some generic space
that nvme could use, that would work as well...
Jens Axboe July 11, 2022, 6:24 p.m. UTC | #6
On 7/11/22 12:22 PM, Sagi Grimberg wrote:
> 
>>>> Use the leftover space to carve out a 'next' field that enables linking of
>>>> io_uring_cmd structs. Also introduce a list head and a few helpers.
>>>>
>>>> This is in preparation for supporting nvme-multipath, allowing multiple
>>>> uring passthrough commands to be queued.
>>>
>>> It's not clear to me why we need linking at that level?
>>
>> I think the attempt is to allow something like blk_steal_bios that
>> nvme leverages for io_uring_cmd(s).
> 
> I'll rephrase because now that I read it, I think my phrasing is
> confusing.
> 
> I think the attempt is to allow something like blk_steal_bios that
> nvme leverages, but for io_uring_cmd(s). Essentially allow io_uring_cmd
> to be linked in a requeue_list.

I see. I wonder if there's some other way we can accomplish that, so we
don't have to shrink the current space. io_kiocb already supports
linking, so it seems like that should be workable.

>> nvme failover steals all the bios from requests that fail (and should
>> failover) and puts them on a requeue list, and then schedules
>> a work that takes these bios one-by-one and submits them on a different
>> bottom namespace (see nvme_failover_req/nvme_requeue_work).
> 
> Maybe if io_kiocb could be exposed to nvme, and it had some generic space
> that nvme could use, that would work as well...

It will be more exposed in 5.20, but passthrough is already using the
per-op allotted space in the io_kiocb. But as mentioned above, there's
already linking support between io_kiocbs, and that is likely what
should be used here too.
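
For readers less familiar with the io_uring internals being referenced, this
is roughly the shape involved (an abridged view around v5.19, most members
omitted, not compilable as-is): io_uring_cmd sits in io_kiocb's per-op union,
and io_kiocb already carries a 'link' pointer used for SQE chains.

struct io_kiocb {
	union {
		struct file		*file;
		struct io_uring_cmd	uring_cmd;
		/* ... other per-op payloads (rw, poll, ...) ... */
	};
	/* ... */
	struct io_kiocb			*link;	/* next request in an SQE chain */
	/* ... */
};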
Sagi Grimberg July 11, 2022, 6:58 p.m. UTC | #7
>>>>> Use the leftover space to carve out a 'next' field that enables linking of
>>>>> io_uring_cmd structs. Also introduce a list head and a few helpers.
>>>>>
>>>>> This is in preparation for supporting nvme-multipath, allowing multiple
>>>>> uring passthrough commands to be queued.
>>>>
>>>> It's not clear to me why we need linking at that level?
>>>
>>> I think the attempt is to allow something like blk_steal_bios that
>>> nvme leverages for io_uring_cmd(s).
>>
>> I'll rephrase because now that I read it, I think my phrasing is
>> confusing.
>>
>> I think the attempt is to allow something like blk_steal_bios that
>> nvme leverages, but for io_uring_cmd(s). Essentially allow io_uring_cmd
>> to be linked in a requeue_list.
> 
> I see. I wonder if there's some other way we can accomplish that, so we
> don't have to shrink the current space. io_kiocb already supports
> linking, so it seems like that should be workable.
> 
>>> nvme failover steals all the bios from requests that fail (and should
>>> failover) and puts them on a requeue list, and then schedules
>>> a work that takes these bios one-by-one and submits them on a different
>>> bottom namespace (see nvme_failover_req/nvme_requeue_work).
>>
>> Maybe if io_kiocb could be exposed to nvme, and it had some generic space
>> that nvme could use, that would work as well...
> 
> It will be more exposed in 5.20, but passthrough is already using the
> per-op allotted space in the io_kiocb. But as mentioned above, there's
> already linking support between io_kiocbs, and that is likely what
> should be used here too.

Agreed. I don't think there is an issue with passing uring_cmd() an
io_kiocb, which is less space-strict, instead of an io_uring_cmd.
Kanchan Joshi July 12, 2022, 11:40 a.m. UTC | #8
On Mon, Jul 11, 2022 at 12:24:42PM -0600, Jens Axboe wrote:
>On 7/11/22 12:22 PM, Sagi Grimberg wrote:
>>
>>>>> Use the leftover space to carve out a 'next' field that enables linking of
>>>>> io_uring_cmd structs. Also introduce a list head and a few helpers.
>>>>>
>>>>> This is in preparation for supporting nvme-multipath, allowing multiple
>>>>> uring passthrough commands to be queued.
>>>>
>>>> It's not clear to me why we need linking at that level?
>>>
>>> I think the attempt is to allow something like blk_steal_bios that
>>> nvme leverages for io_uring_cmd(s).
>>
>> I'll rephrase because now that I read it, I think my phrasing is
>> confusing.
>>
>> I think the attempt is to allow something like blk_steal_bios that
>> nvme leverages, but for io_uring_cmd(s). Essentially allow io_uring_cmd
>> to be linked in a requeue_list.
>
>I see. I wonder if there's some other way we can accomplish that, so we
>don't have to shrink the current space. io_kiocb already supports
>linking, so it seems like that should be workable.
>
>>> nvme failover steals all the bios from requests that fail (and should
>>> failover) and puts them on a requeue list, and then schedules
>>> a work that takes these bios one-by-one and submits them on a different
>>> bottom namespace (see nvme_failover_req/nvme_requeue_work).
>>
>> Maybe if io_kiocb could be exposed to nvme, and it had some generic space
>> that nvme could use, that would work as well...
>
>It will be more exposed in 5.20, but passthrough is already using the
>per-op allotted space in the io_kiocb. But as mentioned above, there's
>already linking support between io_kiocbs, and that is likely what
>should be used here too.
>
io_kiocb linking is used when userspace wants to link SQEs to enforce
ordering. If we go this route, we give up that feature for
uring-command SQEs.
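
For reference, this is the userspace-visible feature in question; a minimal
liburing sketch (the fd and buffer are placeholders, not from this series)
that chains two SQEs with IOSQE_IO_LINK:

/*
 * Minimal liburing sketch of SQE linking, the userspace feature that
 * io_kiocb linking implements: the second SQE only runs after the
 * first one completes successfully.
 */
#include <errno.h>
#include <liburing.h>

static int submit_linked_pair(struct io_uring *ring, int fd,
			      void *buf, unsigned int len)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	/* The link flag orders the next SQE after this read. */
	io_uring_prep_read(sqe, fd, buf, len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_fsync(sqe, fd, 0);

	return io_uring_submit(ring);
}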
Ming Lei July 14, 2022, 3:40 a.m. UTC | #9
On Mon, Jul 11, 2022 at 09:22:54PM +0300, Sagi Grimberg wrote:
> 
> > > > Use the leftover space to carve out a 'next' field that enables linking of
> > > > io_uring_cmd structs. Also introduce a list head and a few helpers.
> > > > 
> > > > This is in preparation for supporting nvme-multipath, allowing multiple
> > > > uring passthrough commands to be queued.
> > > 
> > > It's not clear to me why we need linking at that level?
> > 
> > I think the attempt is to allow something like blk_steal_bios that
> > nvme leverages for io_uring_cmd(s).
> 
> I'll rephrase because now that I read it, I think my phrasing is
> confusing.
> 
> I think the attempt is to allow something like blk_steal_bios that
> nvme leverages, but for io_uring_cmd(s). Essentially allow io_uring_cmd
> to be linked in a requeue_list.

io_uring_cmd is 1:1 with the pt request, so I am wondering why not retry on
the io_uring_cmd instance directly via io_uring_cmd_execute_in_task().

I feel it isn't necessary to link io_uring_cmd into a list.


Thanks,
Ming
Kanchan Joshi July 14, 2022, 8:19 a.m. UTC | #10
On Thu, Jul 14, 2022 at 11:40:46AM +0800, Ming Lei wrote:
>On Mon, Jul 11, 2022 at 09:22:54PM +0300, Sagi Grimberg wrote:
>>
>> > > > Use the leftover space to carve out a 'next' field that enables linking of
>> > > > io_uring_cmd structs. Also introduce a list head and a few helpers.
>> > > >
>> > > > This is in preparation for supporting nvme-multipath, allowing multiple
>> > > > uring passthrough commands to be queued.
>> > >
>> > > It's not clear to me why we need linking at that level?
>> >
>> > I think the attempt is to allow something like blk_steal_bios that
>> > nvme leverages for io_uring_cmd(s).
>>
>> I'll rephrase because now that I read it, I think my phrasing is
>> confusing.
>>
>> I think the attempt is to allow something like blk_steal_bios that
>> nvme leverages, but for io_uring_cmd(s). Essentially allow io_uring_cmd
>> to be linked in a requeue_list.
>
>io_uring_cmd is 1:1 with the pt request, so I am wondering why not retry on
>the io_uring_cmd instance directly via io_uring_cmd_execute_in_task().
>
>I feel it isn't necessary to link io_uring_cmd into a list.

If the path is not available, the retry is not done immediately; rather, we wait
for the path to become available (as the underlying controller may still be
resetting/connecting). The list helped because the command gets added to it
(and the submitter/io_uring gets control back), and the retry is done at the
exact point in time when the path becomes available.
But yes, it won't harm to do a couple of retries even if the path is known
not to be available (somewhat like iopoll), as this situation is not common.
And with that scheme, we don't have to link io_uring_cmd.

Sagi: does this sound fine to you?
Daniel Wagner July 14, 2022, 3:30 p.m. UTC | #11
On Thu, Jul 14, 2022 at 01:49:23PM +0530, Kanchan Joshi wrote:
> If the path is not available, the retry is not done immediately; rather, we wait
> for the path to become available (as the underlying controller may still be
> resetting/connecting). The list helped because the command gets added to it
> (and the submitter/io_uring gets control back), and the retry is done at the
> exact point in time when the path becomes available.
> But yes, it won't harm to do a couple of retries even if the path is known
> not to be available (somewhat like iopoll), as this situation is not common.
> And with that scheme, we don't have to link io_uring_cmd.

Stupid question: does it only fail over immediately when the path is not
available, or on any failure? If it fails over for everything, it's possible
the target gets the same request twice. FWIW, we are just debugging this
scenario right now.
Kanchan Joshi July 15, 2022, 11:07 a.m. UTC | #12
On Thu, Jul 14, 2022 at 05:30:51PM +0200, Daniel Wagner wrote:
>On Thu, Jul 14, 2022 at 01:49:23PM +0530, Kanchan Joshi wrote:
>> If the path is not available, the retry is not done immediately; rather, we wait
>> for the path to become available (as the underlying controller may still be
>> resetting/connecting). The list helped because the command gets added to it
>> (and the submitter/io_uring gets control back), and the retry is done at the
>> exact point in time when the path becomes available.
>> But yes, it won't harm to do a couple of retries even if the path is known
>> not to be available (somewhat like iopoll), as this situation is not common.
>> And with that scheme, we don't have to link io_uring_cmd.
>
>Stupid question: does it only fail over immediately when the path is not
>available, or on any failure? If it fails over for everything, it's possible
>the target gets the same request twice. FWIW, we are just debugging this
>scenario right now.

Failover is only for path-related failures, and not otherwise.
You might want to take a look at the nvme_decide_disposition routine, where
that decision is made.
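
For readers following along, that disposition logic boils down to roughly the
following (condensed from nvme_decide_disposition() in
drivers/nvme/host/core.c around v5.19):

/*
 * Condensed from nvme_decide_disposition(): FAILOVER is chosen only for
 * multipath requests that hit a path-related error; everything else is
 * either completed (possibly with the error) or retried on the same path.
 */
enum nvme_disposition {
	COMPLETE,
	RETRY,
	FAILOVER,
};

static enum nvme_disposition nvme_decide_disposition(struct request *req)
{
	if (likely(nvme_req(req)->status == 0))
		return COMPLETE;

	if (blk_noretry_request(req) ||
	    (nvme_req(req)->status & NVME_SC_DNR) ||
	    nvme_req(req)->retries >= nvme_max_retries)
		return COMPLETE;

	if (req->cmd_flags & REQ_NVME_MPATH) {
		if (nvme_is_path_error(nvme_req(req)->status) ||
		    blk_queue_dying(req->q))
			return FAILOVER;
	} else {
		if (blk_queue_dying(req->q))
			return COMPLETE;
	}

	return RETRY;
}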
Daniel Wagner July 18, 2022, 9:03 a.m. UTC | #13
On Fri, Jul 15, 2022 at 04:37:00PM +0530, Kanchan Joshi wrote:
> On Thu, Jul 14, 2022 at 05:30:51PM +0200, Daniel Wagner wrote:
> > On Thu, Jul 14, 2022 at 01:49:23PM +0530, Kanchan Joshi wrote:
> > > If the path is not available, the retry is not done immediately; rather, we wait
> > > for the path to become available (as the underlying controller may still be
> > > resetting/connecting). The list helped because the command gets added to it
> > > (and the submitter/io_uring gets control back), and the retry is done at the
> > > exact point in time when the path becomes available.
> > > But yes, it won't harm to do a couple of retries even if the path is known
> > > not to be available (somewhat like iopoll), as this situation is not common.
> > > And with that scheme, we don't have to link io_uring_cmd.
> > 
> > Stupid question: does it only fail over immediately when the path is not
> > available, or on any failure? If it fails over for everything, it's possible
> > the target gets the same request twice. FWIW, we are just debugging this
> > scenario right now.
> 
> Failover is only for path-related failures, and not otherwise.
> You might want to take a look at the nvme_decide_disposition routine, where
> that decision is made.

Ah okay, never mind. I somehow got the impression that there was special
handling code added for this case. Sorry for the noise.

Patch

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 54063d67506b..d734599cbcd7 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -22,9 +22,14 @@  struct io_uring_cmd {
 	const void	*cmd;
 	/* callback to defer completions to task context */
 	void (*task_work_cb)(struct io_uring_cmd *cmd);
+	struct io_uring_cmd	*next;
 	u32		cmd_op;
-	u32		pad;
-	u8		pdu[32]; /* available inline for free use */
+	u8		pdu[28]; /* available inline for free use */
+};
+
+struct ioucmd_list {
+	struct io_uring_cmd *head;
+	struct io_uring_cmd *tail;
 };
 
 #if defined(CONFIG_IO_URING)
@@ -54,6 +59,27 @@  static inline void io_uring_free(struct task_struct *tsk)
 	if (tsk->io_uring)
 		__io_uring_free(tsk);
 }
+
+static inline struct io_uring_cmd *ioucmd_list_get(struct ioucmd_list *il)
+{
+	struct io_uring_cmd *ioucmd = il->head;
+
+	il->head = il->tail = NULL;
+
+	return ioucmd;
+}
+
+static inline void ioucmd_list_add(struct ioucmd_list *il,
+		struct io_uring_cmd *ioucmd)
+{
+	ioucmd->next = NULL;
+
+	if (il->tail)
+		il->tail->next = ioucmd;
+	else
+		il->head = ioucmd;
+	il->tail = ioucmd;
+}
 #else
 static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
 		ssize_t ret2)
@@ -80,6 +106,14 @@  static inline const char *io_uring_get_opcode(u8 opcode)
 {
 	return "";
 }
+static inline struct io_uring_cmd *ioucmd_list_get(struct ioucmd_list *il)
+{
+	return NULL;
+}
+static inline void ioucmd_list_add(struct ioucmd_list *il,
+		struct io_uring_cmd *ioucmd)
+{
+}
 #endif
 
 #endif
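
For illustration, the helpers above could be consumed by a requeue path along
these lines; the context struct and function names here (demo_requeue_ctx,
demo_defer_ioucmd, demo_resubmit) are hypothetical, not from this series, and
only ioucmd_list_add()/ioucmd_list_get() come from the patch:

/* Hypothetical context owning a parked list of passthrough commands. */
struct demo_requeue_ctx {
	struct ioucmd_list requeue_ioucmds;
	spinlock_t lock;
};

/* Producer side: park a failed passthrough command for a later retry. */
static void demo_defer_ioucmd(struct demo_requeue_ctx *ctx,
			      struct io_uring_cmd *ioucmd)
{
	spin_lock(&ctx->lock);
	ioucmd_list_add(&ctx->requeue_ioucmds, ioucmd);
	spin_unlock(&ctx->lock);
}

/* Consumer side: detach the whole chain once a path is available again. */
static void demo_resubmit(struct demo_requeue_ctx *ctx,
			  void (*resubmit)(struct io_uring_cmd *))
{
	struct io_uring_cmd *ioucmd, *next;

	spin_lock(&ctx->lock);
	ioucmd = ioucmd_list_get(&ctx->requeue_ioucmds);
	spin_unlock(&ctx->lock);

	while (ioucmd) {
		next = ioucmd->next;
		resubmit(ioucmd);
		ioucmd = next;
	}
}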