
[for-next,4/4] nvme-multipath: add multipathing for uring-passthrough commands

Message ID 20220711110155.649153-5-joshi.k@samsung.com (mailing list archive)
State New, archived
Series [for-next,1/4] io_uring, nvme: rename a function

Commit Message

Kanchan Joshi July 11, 2022, 11:01 a.m. UTC
From: Anuj Gupta <anuj20.g@samsung.com>

This heavily reuses nvme-mpath infrastructure for block-path.
Block-path operates on bio, and uses that as long-lived object across
requeue attempts. For passthrough interface io_uring_cmd happens to
be such object, so add a requeue list for that.

If path is not available, uring-cmds are added into that list and
resubmitted on discovery of new path. Existing requeue handler
(nvme_requeue_work) is extended to do this resubmission.

For failed commands, failover is done directly (i.e. without first
queuing into list) by resubmitting individual command from task-work
based callback.

Suggested-by: Sagi Grimberg <sagi@grimberg.me>
Co-developed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 drivers/nvme/host/ioctl.c     | 141 ++++++++++++++++++++++++++++++++--
 drivers/nvme/host/multipath.c |  36 ++++++++-
 drivers/nvme/host/nvme.h      |  12 +++
 include/linux/io_uring.h      |   2 +
 4 files changed, 182 insertions(+), 9 deletions(-)

Comments

Sagi Grimberg July 11, 2022, 1:51 p.m. UTC | #1
> This heavily reuses nvme-mpath infrastructure for block-path.
> Block-path operates on bio, and uses that as long-lived object across
> requeue attempts. For passthrough interface io_uring_cmd happens to
> be such object, so add a requeue list for that.
> 
> If path is not available, uring-cmds are added into that list and
> resubmitted on discovery of new path. Existing requeue handler
> (nvme_requeue_work) is extended to do this resubmission.
> 
> For failed commands, failover is done directly (i.e. without first
> queuing into list) by resubmitting individual command from task-work
> based callback.

Nice!

> 
> Suggested-by: Sagi Grimberg <sagi@grimberg.me>
> Co-developed-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> ---
>   drivers/nvme/host/ioctl.c     | 141 ++++++++++++++++++++++++++++++++--
>   drivers/nvme/host/multipath.c |  36 ++++++++-
>   drivers/nvme/host/nvme.h      |  12 +++
>   include/linux/io_uring.h      |   2 +
>   4 files changed, 182 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index fc02eddd4977..281c21d3c67d 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -340,12 +340,6 @@ struct nvme_uring_data {
>   	__u32	timeout_ms;
>   };
>   
> -static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
> -		struct io_uring_cmd *ioucmd)
> -{
> -	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
> -}
> -
>   static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
>   {
>   	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> @@ -448,6 +442,14 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>   	pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
>   	pdu->meta_len = d.metadata_len;
>   
> +	if (issue_flags & IO_URING_F_MPATH) {
> +		req->cmd_flags |= REQ_NVME_MPATH;
> +		/*
> +		 * we would need the buffer address (d.addr field) if we have
> +		 * to retry the command. Store it by repurposing ioucmd->cmd
> +		 */
> +		ioucmd->cmd = (void *)d.addr;

What does repurposing mean here?

> +	}
>   	blk_execute_rq_nowait(req, false);
>   	return -EIOCBQUEUED;
>   }
> @@ -665,12 +667,135 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
>   	int srcu_idx = srcu_read_lock(&head->srcu);
>   	struct nvme_ns *ns = nvme_find_path(head);
>   	int ret = -EINVAL;
> +	struct device *dev = &head->cdev_device;
> +
> +	if (likely(ns)) {
> +		ret = nvme_ns_uring_cmd(ns, ioucmd,
> +				issue_flags | IO_URING_F_MPATH);
> +	} else if (nvme_available_path(head)) {
> +		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +		struct nvme_uring_cmd *payload = NULL;
> +
> +		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
> +		/*
> +		 * We may requeue two kinds of uring commands:
> +		 * 1. command failed post submission. pdu->req will be non-null
> +		 * for this
> +		 * 2. command that could not be submitted because path was not
> +		 * available. For this pdu->req is set to NULL.
> +		 */
> +		pdu->req = NULL;

Relying on a pointer does not sound like a good idea to me.
But why do you care at all where this command came from?
This code should not concern itself what happened prior to this
execution.

> +		/*
> +		 * passthrough command will not be available during retry as it
> +		 * is embedded in io_uring's SQE. Hence we allocate/copy here
> +		 */

OK, that is a nice solution.

> +		payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
> +		if (!payload) {
> +			ret = -ENOMEM;
> +			goto out_unlock;
> +		}
> +		memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
> +		ioucmd->cmd = payload;
>   
> -	if (ns)
> -		ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
> +		spin_lock_irq(&head->requeue_ioucmd_lock);
> +		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
> +		spin_unlock_irq(&head->requeue_ioucmd_lock);
> +		ret = -EIOCBQUEUED;
> +	} else {
> +		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

ret=-EIO ?

> +	}
> +out_unlock:
>   	srcu_read_unlock(&head->srcu, srcu_idx);
>   	return ret;
>   }
> +
> +int nvme_uring_cmd_io_retry(struct nvme_ns *ns, struct request *oreq,
> +		struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu)
> +{
> +	struct nvme_ctrl *ctrl = ns->ctrl;
> +	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
> +	struct nvme_uring_data d;
> +	struct nvme_command c;
> +	struct request *req;
> +	struct bio *obio = oreq->bio;
> +	void *meta = NULL;
> +
> +	memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
> +	d.metadata = (__u64)pdu->meta_buffer;
> +	d.metadata_len = pdu->meta_len;
> +	d.timeout_ms = oreq->timeout;
> +	d.addr = (__u64)ioucmd->cmd;
> +	if (obio) {
> +		d.data_len = obio->bi_iter.bi_size;
> +		blk_rq_unmap_user(obio);
> +	} else {
> +		d.data_len = 0;
> +	}
> +	blk_mq_free_request(oreq);

The way I would do this is that in nvme_ioucmd_failover_req (or in the
retry driven from a retriable command failure) I would do the above,
requeue it and kick the requeue work, to go over the requeue_list and
just execute them again. Not sure why you even need explicit retry
code.

> +	req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
> +			d.data_len, nvme_to_user_ptr(d.metadata),
> +			d.metadata_len, 0, &meta, d.timeout_ms ?
> +			msecs_to_jiffies(d.timeout_ms) : 0,
> +			ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
> +	if (IS_ERR(req))
> +		return PTR_ERR(req);
> +
> +	req->end_io = nvme_uring_cmd_end_io;
> +	req->end_io_data = ioucmd;
> +	pdu->bio = req->bio;
> +	pdu->meta = meta;
> +	req->cmd_flags |= REQ_NVME_MPATH;
> +	blk_execute_rq_nowait(req, false);
> +	return -EIOCBQUEUED;
> +}
> +
> +void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
> +{
> +	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
> +	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head,
> +			cdev);
> +	int srcu_idx = srcu_read_lock(&head->srcu);
> +	struct nvme_ns *ns = nvme_find_path(head);
> +	unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
> +		IO_URING_F_MPATH;
> +	struct device *dev = &head->cdev_device;
> +
> +	if (likely(ns)) {
> +		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +		struct request *oreq = pdu->req;
> +		int ret;
> +
> +		if (oreq == NULL) {
> +			/*
> +			 * this was not submitted (to device) earlier. For this
> +			 * ioucmd->cmd points to persistent memory. Free that
> +			 * up post submission
> +			 */
> +			const void *cmd = ioucmd->cmd;
> +
> +			ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
> +			kfree(cmd);
> +		} else {
> +			/*
> +			 * this was submitted (to device) earlier. Use old
> +			 * request, bio (if it exists) and nvme-pdu to recreate
> +			 * the command for the discovered path
> +			 */
> +			ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);

Why is this needed? Why is reuse important here? Why not always call
nvme_ns_uring_cmd?

> +		}
> +		if (ret != -EIOCBQUEUED)
> +			io_uring_cmd_done(ioucmd, ret, 0);
> +	} else if (nvme_available_path(head)) {
> +		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
> +		spin_lock_irq(&head->requeue_ioucmd_lock);
> +		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
> +		spin_unlock_irq(&head->requeue_ioucmd_lock);
> +	} else {
> +		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
> +		io_uring_cmd_done(ioucmd, -EINVAL, 0);

-EIO?

> +	}
> +	srcu_read_unlock(&head->srcu, srcu_idx);
> +}
>   #endif /* CONFIG_NVME_MULTIPATH */
>   
>   int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index f26640ccb955..fe5655d98c36 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -6,6 +6,7 @@
>   #include <linux/backing-dev.h>
>   #include <linux/moduleparam.h>
>   #include <linux/vmalloc.h>
> +#include <linux/io_uring.h>
>   #include <trace/events/block.h>
>   #include "nvme.h"
>   
> @@ -80,6 +81,17 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>   			blk_freeze_queue_start(h->disk->queue);
>   }
>   
> +static void nvme_ioucmd_failover_req(struct request *req, struct nvme_ns *ns)
> +{
> +	struct io_uring_cmd *ioucmd = req->end_io_data;
> +	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +
> +	/* store the request, to reconstruct the command during retry */
> +	pdu->req = req;
> +	/* retry in original submitter context */
> +	io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);

Why not deinit the command, put it on the request list and just schedule
requeue_work? Why not just have requeue_work go over the request list
and call nvme_ns_head_chr_uring_cmd similar to what it does for block
requests?

> +}
> +
>   void nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> @@ -99,6 +111,11 @@ void nvme_failover_req(struct request *req)
>   		queue_work(nvme_wq, &ns->ctrl->ana_work);
>   	}
>   
> +	if (blk_rq_is_passthrough(req)) {
> +		nvme_ioucmd_failover_req(req, ns);
> +		return;
> +	}
> +
>   	spin_lock_irqsave(&ns->head->requeue_lock, flags);
>   	for (bio = req->bio; bio; bio = bio->bi_next) {
>   		bio_set_dev(bio, ns->head->disk->part0);
> @@ -314,7 +331,7 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>   	return ns;
>   }
>   
> -static bool nvme_available_path(struct nvme_ns_head *head)
> +bool nvme_available_path(struct nvme_ns_head *head)
>   {
>   	struct nvme_ns *ns;
>   
> @@ -459,7 +476,9 @@ static void nvme_requeue_work(struct work_struct *work)
>   	struct nvme_ns_head *head =
>   		container_of(work, struct nvme_ns_head, requeue_work);
>   	struct bio *bio, *next;
> +	struct io_uring_cmd *ioucmd, *ioucmd_next;
>   
> +	/* process requeued bios*/
>   	spin_lock_irq(&head->requeue_lock);
>   	next = bio_list_get(&head->requeue_list);
>   	spin_unlock_irq(&head->requeue_lock);
> @@ -470,6 +489,21 @@ static void nvme_requeue_work(struct work_struct *work)
>   
>   		submit_bio_noacct(bio);
>   	}
> +
> +	/* process requeued passthrough-commands */
> +	spin_lock_irq(&head->requeue_ioucmd_lock);
> +	ioucmd_next = ioucmd_list_get(&head->requeue_ioucmd_list);
> +	spin_unlock_irq(&head->requeue_ioucmd_lock);
> +
> +	while ((ioucmd = ioucmd_next) != NULL) {
> +		ioucmd_next = ioucmd->next;
> +		ioucmd->next = NULL;
> +		/*
> +		 * Retry in original submitter context as we would be
> +		 * processing the passthrough command again
> +		 */
> +		io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);

Why retry? Why not nvme_ns_head_chr_uring_cmd?

> +	}
>   }
>   
>   int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 9d3ff6feda06..125b48e74e72 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -16,6 +16,7 @@
>   #include <linux/rcupdate.h>
>   #include <linux/wait.h>
>   #include <linux/t10-pi.h>
> +#include <linux/io_uring.h>
>   
>   #include <trace/events/block.h>
>   
> @@ -189,6 +190,12 @@ enum {
>   	NVME_REQ_USERCMD		= (1 << 1),
>   };
>   
> +static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
> +		struct io_uring_cmd *ioucmd)
> +{
> +	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
> +}
> +
>   static inline struct nvme_request *nvme_req(struct request *req)
>   {
>   	return blk_mq_rq_to_pdu(req);
> @@ -442,6 +449,9 @@ struct nvme_ns_head {
>   	struct work_struct	requeue_work;
>   	struct mutex		lock;
>   	unsigned long		flags;
> +	/* for uring-passthru multipath handling */
> +	struct ioucmd_list	requeue_ioucmd_list;
> +	spinlock_t		requeue_ioucmd_lock;
>   #define NVME_NSHEAD_DISK_LIVE	0
>   	struct nvme_ns __rcu	*current_path[];
>   #endif
> @@ -830,6 +840,7 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
>   		unsigned int issue_flags);
>   int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
>   		unsigned int issue_flags);
> +void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd);
>   int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
>   int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
>   
> @@ -844,6 +855,7 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>   	return ctrl->ana_log_buf != NULL;
>   }
>   
> +bool nvme_available_path(struct nvme_ns_head *head);
>   void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
>   void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
>   void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index d734599cbcd7..57f4dfc83316 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -15,6 +15,8 @@ enum io_uring_cmd_flags {
>   	IO_URING_F_SQE128		= 4,
>   	IO_URING_F_CQE32		= 8,
>   	IO_URING_F_IOPOLL		= 16,
> +	/* to indicate that it is a MPATH req*/
> +	IO_URING_F_MPATH		= 32,

Unrelated, but this should probably move to bitwise representation...
Stefan Metzmacher July 11, 2022, 3:12 p.m. UTC | #2
Hi Sagi,

>> @@ -189,6 +190,12 @@ enum {
>>       NVME_REQ_USERCMD        = (1 << 1),
>>   };
>> +static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
>> +        struct io_uring_cmd *ioucmd)
>> +{

Shouldn't we have a BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
here?

>> +    return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
>> +}
>> +
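Something like this, for illustration (a sketch; the check could equally stay at the
existing call site):

    static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
    		struct io_uring_cmd *ioucmd)
    {
    	/* fail the build if the nvme pdu outgrows the io_uring-provided space */
    	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
    	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
    }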

>> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
>> index d734599cbcd7..57f4dfc83316 100644
>> --- a/include/linux/io_uring.h
>> +++ b/include/linux/io_uring.h
>> @@ -15,6 +15,8 @@ enum io_uring_cmd_flags {
>>       IO_URING_F_SQE128        = 4,
>>       IO_URING_F_CQE32        = 8,
>>       IO_URING_F_IOPOLL        = 16,
>> +    /* to indicate that it is a MPATH req*/
>> +    IO_URING_F_MPATH        = 32,

Isn't that nvme specific? If so I don't think it belongs in io_uring.h at all...

metze
Sagi Grimberg July 11, 2022, 4:58 p.m. UTC | #3
>>> +static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
>>> +        struct io_uring_cmd *ioucmd)
>>> +{
> 
> Shouldn't we have a BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > 
> sizeof(ioucmd->pdu));
> here?

Probably...

>>> +    return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
>>> +}
>>> +
> 
>>> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
>>> index d734599cbcd7..57f4dfc83316 100644
>>> --- a/include/linux/io_uring.h
>>> +++ b/include/linux/io_uring.h
>>> @@ -15,6 +15,8 @@ enum io_uring_cmd_flags {
>>>       IO_URING_F_SQE128        = 4,
>>>       IO_URING_F_CQE32        = 8,
>>>       IO_URING_F_IOPOLL        = 16,
>>> +    /* to indicate that it is a MPATH req*/
>>> +    IO_URING_F_MPATH        = 32,
> 
> Isn't that nvme specific? If so I don't think it belongs in io_uring.h 
> at all...

Yes, it doesn't, it should be completely internal to nvme.
Kanchan Joshi July 11, 2022, 6:37 p.m. UTC | #4
On Mon, Jul 11, 2022 at 04:51:44PM +0300, Sagi Grimberg wrote:
>
>>This heavily reuses nvme-mpath infrastructure for block-path.
>>Block-path operates on bio, and uses that as long-lived object across
>>requeue attempts. For passthrough interface io_uring_cmd happens to
>>be such object, so add a requeue list for that.
>>
>>If path is not available, uring-cmds are added into that list and
>>resubmitted on discovery of new path. Existing requeue handler
>>(nvme_requeue_work) is extended to do this resubmission.
>>
>>For failed commands, failover is done directly (i.e. without first
>>queuing into list) by resubmitting individual command from task-work
>>based callback.
>
>Nice!
>
>>
>>Suggested-by: Sagi Grimberg <sagi@grimberg.me>
>>Co-developed-by: Kanchan Joshi <joshi.k@samsung.com>
>>Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
>>---
>>  drivers/nvme/host/ioctl.c     | 141 ++++++++++++++++++++++++++++++++--
>>  drivers/nvme/host/multipath.c |  36 ++++++++-
>>  drivers/nvme/host/nvme.h      |  12 +++
>>  include/linux/io_uring.h      |   2 +
>>  4 files changed, 182 insertions(+), 9 deletions(-)
>>
>>diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
>>index fc02eddd4977..281c21d3c67d 100644
>>--- a/drivers/nvme/host/ioctl.c
>>+++ b/drivers/nvme/host/ioctl.c
>>@@ -340,12 +340,6 @@ struct nvme_uring_data {
>>  	__u32	timeout_ms;
>>  };
>>-static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
>>-		struct io_uring_cmd *ioucmd)
>>-{
>>-	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
>>-}
>>-
>>  static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
>>  {
>>  	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>@@ -448,6 +442,14 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>>  	pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
>>  	pdu->meta_len = d.metadata_len;
>>+	if (issue_flags & IO_URING_F_MPATH) {
>>+		req->cmd_flags |= REQ_NVME_MPATH;
>>+		/*
>>+		 * we would need the buffer address (d.addr field) if we have
>>+		 * to retry the command. Store it by repurposing ioucmd->cmd
>>+		 */
>>+		ioucmd->cmd = (void *)d.addr;
>
>What does repurposing mean here?

This field (ioucmd->cmd) was pointing to the passthrough command (which
is embedded in io_uring's SQE). At this point we have consumed the
passthrough command, so this field can be reused if we have to. And we
have to, because failover needs to recreate the passthrough command.
Please see nvme_uring_cmd_io_retry to see how this helps in recreating
the fields of the passthrough command. More on this below.
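
Condensed from the patch, the two ends of that reuse look like this:

    /* nvme_uring_cmd_io(): the SQE-embedded command has been parsed into 'd'
     * and the request, so reuse the pointer to remember the data buffer
     * address for a possible retry on another path */
    ioucmd->cmd = (void *)d.addr;

    /* nvme_uring_cmd_io_retry(): recover the buffer address */
    d.addr = (__u64)ioucmd->cmd;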

>>+	}
>>  	blk_execute_rq_nowait(req, false);
>>  	return -EIOCBQUEUED;
>>  }
>>@@ -665,12 +667,135 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
>>  	int srcu_idx = srcu_read_lock(&head->srcu);
>>  	struct nvme_ns *ns = nvme_find_path(head);
>>  	int ret = -EINVAL;
>>+	struct device *dev = &head->cdev_device;
>>+
>>+	if (likely(ns)) {
>>+		ret = nvme_ns_uring_cmd(ns, ioucmd,
>>+				issue_flags | IO_URING_F_MPATH);
>>+	} else if (nvme_available_path(head)) {
>>+		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>+		struct nvme_uring_cmd *payload = NULL;
>>+
>>+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
>>+		/*
>>+		 * We may requeue two kinds of uring commands:
>>+		 * 1. command failed post submission. pdu->req will be non-null
>>+		 * for this
>>+		 * 2. command that could not be submitted because path was not
>>+		 * available. For this pdu->req is set to NULL.
>>+		 */
>>+		pdu->req = NULL;
>
>Relying on a pointer does not sound like a good idea to me.
>But why do you care at all where did this command came from?
>This code should not concern itself what happened prior to this
>execution.
Required, please see nvme_uring_cmd_io_retry. And this should become
clearer as part of the responses to your other questions.

>>+		/*
>>+		 * passthrough command will not be available during retry as it
>>+		 * is embedded in io_uring's SQE. Hence we allocate/copy here
>>+		 */
>
>OK, that is a nice solution.
Please note that the preferred way is to recreate the passthrough command,
and not to allocate it. We allocate it here because this happens very early
(i.e. before processing the passthrough command and setting that up inside
struct request). Recreating requires a populated 'struct request'.
>
>>+		payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
>>+		if (!payload) {
>>+			ret = -ENOMEM;
>>+			goto out_unlock;
>>+		}
>>+		memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
>>+		ioucmd->cmd = payload;
>>-	if (ns)
>>-		ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>+		spin_lock_irq(&head->requeue_ioucmd_lock);
>>+		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
>>+		spin_unlock_irq(&head->requeue_ioucmd_lock);
>>+		ret = -EIOCBQUEUED;
>>+	} else {
>>+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
>
>ret=-EIO ?
Did not change it, as ret was initialized to -EINVAL. Do you prefer -EIO instead?
>>+	}
>>+out_unlock:
>>  	srcu_read_unlock(&head->srcu, srcu_idx);
>>  	return ret;
>>  }
>>+
>>+int nvme_uring_cmd_io_retry(struct nvme_ns *ns, struct request *oreq,
>>+		struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu)
>>+{
>>+	struct nvme_ctrl *ctrl = ns->ctrl;
>>+	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
>>+	struct nvme_uring_data d;
>>+	struct nvme_command c;
>>+	struct request *req;
>>+	struct bio *obio = oreq->bio;
>>+	void *meta = NULL;
>>+
>>+	memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
>>+	d.metadata = (__u64)pdu->meta_buffer;
>>+	d.metadata_len = pdu->meta_len;
>>+	d.timeout_ms = oreq->timeout;
>>+	d.addr = (__u64)ioucmd->cmd;
>>+	if (obio) {
>>+		d.data_len = obio->bi_iter.bi_size;
>>+		blk_rq_unmap_user(obio);
>>+	} else {
>>+		d.data_len = 0;
>>+	}
>>+	blk_mq_free_request(oreq);
>
>The way I would do this that in nvme_ioucmd_failover_req (or in the
>retry driven from command retriable failure) I would do the above,
>requeue it and kick the requeue work, to go over the requeue_list and
>just execute them again. Not sure why you even need an explicit retry
>code.
During retry we need the passthrough command. But the passthrough command
is not stable (i.e. valid only during the first submission). We can make
it stable either by:
(a) allocating it in nvme
(b) returning -EAGAIN to io_uring, which will then do the allocation + deferral
Both add a cost. And since any command can potentially fail, that
means taking that cost for every IO that we issue on the mpath node, even if
no failure (initial or subsequent to IO) occurred.

So to avoid the common-path cost, we do nothing extra (no allocation,
no deferral) at the outset and choose to recreate the passthrough
command only if a failure occurred. Hope this explains the purpose of
nvme_uring_cmd_io_retry?
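
For reference, the per-I/O cost being avoided is essentially what the no-path
requeue branch in this patch already does, i.e. something like:

    /* make the SQE-embedded command stable for a failover that may never
     * happen -- the allocation/copy we do not want on the fast path */
    struct nvme_uring_cmd *payload;

    payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
    if (!payload)
    	return -ENOMEM;
    memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
    ioucmd->cmd = payload;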


>>+	req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
>>+			d.data_len, nvme_to_user_ptr(d.metadata),
>>+			d.metadata_len, 0, &meta, d.timeout_ms ?
>>+			msecs_to_jiffies(d.timeout_ms) : 0,
>>+			ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
>>+	if (IS_ERR(req))
>>+		return PTR_ERR(req);
>>+
>>+	req->end_io = nvme_uring_cmd_end_io;
>>+	req->end_io_data = ioucmd;
>>+	pdu->bio = req->bio;
>>+	pdu->meta = meta;
>>+	req->cmd_flags |= REQ_NVME_MPATH;
>>+	blk_execute_rq_nowait(req, false);
>>+	return -EIOCBQUEUED;
>>+}
>>+
>>+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
>>+{
>>+	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
>>+	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head,
>>+			cdev);
>>+	int srcu_idx = srcu_read_lock(&head->srcu);
>>+	struct nvme_ns *ns = nvme_find_path(head);
>>+	unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
>>+		IO_URING_F_MPATH;
>>+	struct device *dev = &head->cdev_device;
>>+
>>+	if (likely(ns)) {
>>+		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>+		struct request *oreq = pdu->req;
>>+		int ret;
>>+
>>+		if (oreq == NULL) {
>>+			/*
>>+			 * this was not submitted (to device) earlier. For this
>>+			 * ioucmd->cmd points to persistent memory. Free that
>>+			 * up post submission
>>+			 */
>>+			const void *cmd = ioucmd->cmd;
>>+
>>+			ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>+			kfree(cmd);
>>+		} else {
>>+			/*
>>+			 * this was submitted (to device) earlier. Use old
>>+			 * request, bio (if it exists) and nvme-pdu to recreate
>>+			 * the command for the discovered path
>>+			 */
>>+			ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);
>
>Why is this needed? Why is reuse important here? Why not always call
>nvme_ns_uring_cmd?

Please see the previous explanation.
The if-branch is for the case when we made the passthrough command stable
by allocating it beforehand.
The else-branch is for the case when we avoided taking that cost.

>>+		}
>>+		if (ret != -EIOCBQUEUED)
>>+			io_uring_cmd_done(ioucmd, ret, 0);
>>+	} else if (nvme_available_path(head)) {
>>+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
>>+		spin_lock_irq(&head->requeue_ioucmd_lock);
>>+		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
>>+		spin_unlock_irq(&head->requeue_ioucmd_lock);
>>+	} else {
>>+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
>>+		io_uring_cmd_done(ioucmd, -EINVAL, 0);
>
>-EIO?
I can change -EINVAL to -EIO if that is what you prefer.

>>+	}
>>+	srcu_read_unlock(&head->srcu, srcu_idx);
>>+}
>>  #endif /* CONFIG_NVME_MULTIPATH */
>>  int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
>>diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
>>index f26640ccb955..fe5655d98c36 100644
>>--- a/drivers/nvme/host/multipath.c
>>+++ b/drivers/nvme/host/multipath.c
>>@@ -6,6 +6,7 @@
>>  #include <linux/backing-dev.h>
>>  #include <linux/moduleparam.h>
>>  #include <linux/vmalloc.h>
>>+#include <linux/io_uring.h>
>>  #include <trace/events/block.h>
>>  #include "nvme.h"
>>@@ -80,6 +81,17 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>>  			blk_freeze_queue_start(h->disk->queue);
>>  }
>>+static void nvme_ioucmd_failover_req(struct request *req, struct nvme_ns *ns)
>>+{
>>+	struct io_uring_cmd *ioucmd = req->end_io_data;
>>+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>+
>>+	/* store the request, to reconstruct the command during retry */
>>+	pdu->req = req;
>>+	/* retry in original submitter context */
>>+	io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);
>
>Why not deinit the command, put it on the request list and just schedule
>requeue_work? Why not just have requeue_work go over the request list
>and call nvme_ns_head_chr_uring_cmd similar to what it does for block
>requests?
We cannot process the passthrough command, map the user/meta buffers (into a
bio) etc. in worker context. We need to do that in task context. So the worker
is an extra hoop that we try to avoid here. Even if we used a worker, we would
still need to switch to the task, as we do for the other case (i.e. when the
command could not be submitted in the first place).
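
That is why both failover paths in the patch bounce back to the submitter via
task work rather than resubmitting from the workqueue:

    /* hop back to the submitting task; the user addresses in the passthrough
     * command can only be mapped from that task's context */
    io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);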

>>+}
>>+
>>  void nvme_failover_req(struct request *req)
>>  {
>>  	struct nvme_ns *ns = req->q->queuedata;
>>@@ -99,6 +111,11 @@ void nvme_failover_req(struct request *req)
>>  		queue_work(nvme_wq, &ns->ctrl->ana_work);
>>  	}
>>+	if (blk_rq_is_passthrough(req)) {
>>+		nvme_ioucmd_failover_req(req, ns);
>>+		return;
>>+	}
>>+
>>  	spin_lock_irqsave(&ns->head->requeue_lock, flags);
>>  	for (bio = req->bio; bio; bio = bio->bi_next) {
>>  		bio_set_dev(bio, ns->head->disk->part0);
>>@@ -314,7 +331,7 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>>  	return ns;
>>  }
>>-static bool nvme_available_path(struct nvme_ns_head *head)
>>+bool nvme_available_path(struct nvme_ns_head *head)
>>  {
>>  	struct nvme_ns *ns;
>>@@ -459,7 +476,9 @@ static void nvme_requeue_work(struct work_struct *work)
>>  	struct nvme_ns_head *head =
>>  		container_of(work, struct nvme_ns_head, requeue_work);
>>  	struct bio *bio, *next;
>>+	struct io_uring_cmd *ioucmd, *ioucmd_next;
>>+	/* process requeued bios*/
>>  	spin_lock_irq(&head->requeue_lock);
>>  	next = bio_list_get(&head->requeue_list);
>>  	spin_unlock_irq(&head->requeue_lock);
>>@@ -470,6 +489,21 @@ static void nvme_requeue_work(struct work_struct *work)
>>  		submit_bio_noacct(bio);
>>  	}
>>+
>>+	/* process requeued passthrough-commands */
>>+	spin_lock_irq(&head->requeue_ioucmd_lock);
>>+	ioucmd_next = ioucmd_list_get(&head->requeue_ioucmd_list);
>>+	spin_unlock_irq(&head->requeue_ioucmd_lock);
>>+
>>+	while ((ioucmd = ioucmd_next) != NULL) {
>>+		ioucmd_next = ioucmd->next;
>>+		ioucmd->next = NULL;
>>+		/*
>>+		 * Retry in original submitter context as we would be
>>+		 * processing the passthrough command again
>>+		 */
>>+		io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);
>
>Why retry? why not nvme_ns_head_chr_uring_cmd ?

What nvme_ioucmd_mpath_retry does differently is explained above. Putting
that processing inside nvme_ns_head_chr_uring_cmd would make the function too
long and hard to read/understand.
Kanchan Joshi July 11, 2022, 6:54 p.m. UTC | #5
On Mon, Jul 11, 2022 at 05:12:23PM +0200, Stefan Metzmacher wrote:
>Hi Sagi,
>
>>>@@ -189,6 +190,12 @@ enum {
>>>      NVME_REQ_USERCMD        = (1 << 1),
>>>  };
>>>+static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
>>>+        struct io_uring_cmd *ioucmd)
>>>+{
>
>Shouldn't we have a BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
>here?
Not sure if I got the concern. We already have this check inside the
nvme_ns_uring_cmd function.

>>>+    return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
>>>+}
>>>+
>
>>>diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
>>>index d734599cbcd7..57f4dfc83316 100644
>>>--- a/include/linux/io_uring.h
>>>+++ b/include/linux/io_uring.h
>>>@@ -15,6 +15,8 @@ enum io_uring_cmd_flags {
>>>      IO_URING_F_SQE128        = 4,
>>>      IO_URING_F_CQE32        = 8,
>>>      IO_URING_F_IOPOLL        = 16,
>>>+    /* to indicate that it is a MPATH req*/
>>>+    IO_URING_F_MPATH        = 32,
>
>Isn't that nvme specific? If so I don't think it belongs in io_uring.h at all...

Right. Removing this was a bit ugly in nvme, but yes, we need to kill this in
v2.
Sagi Grimberg July 11, 2022, 7:56 p.m. UTC | #6
>>> @@ -448,6 +442,14 @@ static int nvme_uring_cmd_io(struct nvme_ctrl 
>>> *ctrl, struct nvme_ns *ns,
>>>      pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
>>>      pdu->meta_len = d.metadata_len;
>>> +    if (issue_flags & IO_URING_F_MPATH) {
>>> +        req->cmd_flags |= REQ_NVME_MPATH;
>>> +        /*
>>> +         * we would need the buffer address (d.addr field) if we have
>>> +         * to retry the command. Store it by repurposing ioucmd->cmd
>>> +         */
>>> +        ioucmd->cmd = (void *)d.addr;
>>
>> What does repurposing mean here?
> 
> This field (ioucmd->cmd) was pointing to passthrough command (which
> is embedded in SQE of io_uring). At this point we have consumed
> passthrough command, so this field can be reused if we have to. And we
> have to beceause failover needs recreating passthrough command.
> Please see nvme_uring_cmd_io_retry to see how this helps in recreating
> the fields of passthrough command. And more on this below.

so ioucmd->cmd starts as an nvme_uring_cmd pointer and continues as
the address of a buffer, for later processing that may or may not
happen in its lifetime?

Sounds a bit of a backwards design to me.

>>> +    }
>>>      blk_execute_rq_nowait(req, false);
>>>      return -EIOCBQUEUED;
>>>  }
>>> @@ -665,12 +667,135 @@ int nvme_ns_head_chr_uring_cmd(struct 
>>> io_uring_cmd *ioucmd,
>>>      int srcu_idx = srcu_read_lock(&head->srcu);
>>>      struct nvme_ns *ns = nvme_find_path(head);
>>>      int ret = -EINVAL;
>>> +    struct device *dev = &head->cdev_device;
>>> +
>>> +    if (likely(ns)) {
>>> +        ret = nvme_ns_uring_cmd(ns, ioucmd,
>>> +                issue_flags | IO_URING_F_MPATH);
>>> +    } else if (nvme_available_path(head)) {
>>> +        struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>> +        struct nvme_uring_cmd *payload = NULL;
>>> +
>>> +        dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
>>> +        /*
>>> +         * We may requeue two kinds of uring commands:
>>> +         * 1. command failed post submission. pdu->req will be non-null
>>> +         * for this
>>> +         * 2. command that could not be submitted because path was not
>>> +         * available. For this pdu->req is set to NULL.
>>> +         */
>>> +        pdu->req = NULL;
>>
>> Relying on a pointer does not sound like a good idea to me.
>> But why do you care at all where did this command came from?
>> This code should not concern itself what happened prior to this
>> execution.
> Required, please see nvme_uring_cmd_io_retry. And this should be more
> clear as part of responses to your other questions.

I think I understand. But it looks fragile to me.

> 
>>> +        /*
>>> +         * passthrough command will not be available during retry as it
>>> +         * is embedded in io_uring's SQE. Hence we allocate/copy here
>>> +         */
>>
>> OK, that is a nice solution.
> Please note that prefered way is to recreate the passthrough command,
> and not to allocate it. We allocate it here because this happens very early
> (i.e. before processing passthrough command and setting that up inside
> struct request). Recreating requires a populated 'struct request' .

I think that once a driver consumes a command as queued, it needs a
stable copy of the command (could be in a percpu pool).

The current design of nvme multipathing is that the request is stripped
of its bios, which are placed on a requeue_list, and if a request was not
formed yet (i.e. an available path exists but no usable one) the bio is
placed on the requeue_list directly.

So ideally we have something bio-like, that can be linked
on a requeue list, and is sufficient to build a request, at any point in
time...
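
For illustration only, the kind of object I mean (name and layout hypothetical):

    /* a bio-like, self-contained requeue unit for passthrough commands */
    struct nvme_uring_requeue {
    	struct nvme_uring_cmd	cmd;	/* stable copy of the SQE payload */
    	struct io_uring_cmd	*ioucmd;	/* completion context */
    	struct list_head	entry;	/* ns_head requeue list linkage */
    };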

>>
>>> +        payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
>>> +        if (!payload) {
>>> +            ret = -ENOMEM;
>>> +            goto out_unlock;
>>> +        }
>>> +        memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
>>> +        ioucmd->cmd = payload;
>>> -    if (ns)
>>> -        ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>> +        spin_lock_irq(&head->requeue_ioucmd_lock);
>>> +        ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
>>> +        spin_unlock_irq(&head->requeue_ioucmd_lock);
>>> +        ret = -EIOCBQUEUED;
>>> +    } else {
>>> +        dev_warn_ratelimited(dev, "no available path - failing I/O\n");
>>
>> ret=-EIO ?
> Did not do as it was initialized to -EINVAL. Do you prefer -EIO instead.

It is not an invalid argument error here, it is essentially an IO error.
So yes, that would be my preference.

>>> +    }
>>> +out_unlock:
>>>      srcu_read_unlock(&head->srcu, srcu_idx);
>>>      return ret;
>>>  }
>>> +
>>> +int nvme_uring_cmd_io_retry(struct nvme_ns *ns, struct request *oreq,
>>> +        struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu)
>>> +{
>>> +    struct nvme_ctrl *ctrl = ns->ctrl;
>>> +    struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
>>> +    struct nvme_uring_data d;
>>> +    struct nvme_command c;
>>> +    struct request *req;
>>> +    struct bio *obio = oreq->bio;
>>> +    void *meta = NULL;
>>> +
>>> +    memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
>>> +    d.metadata = (__u64)pdu->meta_buffer;
>>> +    d.metadata_len = pdu->meta_len;
>>> +    d.timeout_ms = oreq->timeout;
>>> +    d.addr = (__u64)ioucmd->cmd;
>>> +    if (obio) {
>>> +        d.data_len = obio->bi_iter.bi_size;
>>> +        blk_rq_unmap_user(obio);
>>> +    } else {
>>> +        d.data_len = 0;
>>> +    }
>>> +    blk_mq_free_request(oreq);
>>
>> The way I would do this that in nvme_ioucmd_failover_req (or in the
>> retry driven from command retriable failure) I would do the above,
>> requeue it and kick the requeue work, to go over the requeue_list and
>> just execute them again. Not sure why you even need an explicit retry
>> code.
> During retry we need passthrough command. But passthrough command is not
> stable (i.e. valid only during first submission). We can make it stable
> either by:
> (a) allocating in nvme (b) return -EAGAIN to io_uring, and it will do 
> allocate + deferral
> Both add a cost. And since any command can potentially fail, that
> means taking that cost for every IO that we issue on mpath node. Even if
> no failure (initial or subsquent after IO) occcured.

As mentioned, I think that if a driver consumes a command as queued,
it needs a stable copy for a later reformation of the request for
failover purposes.

> So to avoid commmon-path cost, we go about doing nothing (no allocation,
> no deferral) in the outset and choose to recreate the passthrough
> command if failure occured. Hope this explains the purpose of
> nvme_uring_cmd_io_retry?

I think it does, but I'm still having a hard time with it...

> 
> 
>>> +    req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
>>> +            d.data_len, nvme_to_user_ptr(d.metadata),
>>> +            d.metadata_len, 0, &meta, d.timeout_ms ?
>>> +            msecs_to_jiffies(d.timeout_ms) : 0,
>>> +            ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
>>> +    if (IS_ERR(req))
>>> +        return PTR_ERR(req);
>>> +
>>> +    req->end_io = nvme_uring_cmd_end_io;
>>> +    req->end_io_data = ioucmd;
>>> +    pdu->bio = req->bio;
>>> +    pdu->meta = meta;
>>> +    req->cmd_flags |= REQ_NVME_MPATH;
>>> +    blk_execute_rq_nowait(req, false);
>>> +    return -EIOCBQUEUED;
>>> +}
>>> +
>>> +void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
>>> +{
>>> +    struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
>>> +    struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head,
>>> +            cdev);
>>> +    int srcu_idx = srcu_read_lock(&head->srcu);
>>> +    struct nvme_ns *ns = nvme_find_path(head);
>>> +    unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
>>> +        IO_URING_F_MPATH;
>>> +    struct device *dev = &head->cdev_device;
>>> +
>>> +    if (likely(ns)) {
>>> +        struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>> +        struct request *oreq = pdu->req;
>>> +        int ret;
>>> +
>>> +        if (oreq == NULL) {
>>> +            /*
>>> +             * this was not submitted (to device) earlier. For this
>>> +             * ioucmd->cmd points to persistent memory. Free that
>>> +             * up post submission
>>> +             */
>>> +            const void *cmd = ioucmd->cmd;
>>> +
>>> +            ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>> +            kfree(cmd);
>>> +        } else {
>>> +            /*
>>> +             * this was submitted (to device) earlier. Use old
>>> +             * request, bio (if it exists) and nvme-pdu to recreate
>>> +             * the command for the discovered path
>>> +             */
>>> +            ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);
>>
>> Why is this needed? Why is reuse important here? Why not always call
>> nvme_ns_uring_cmd?
> 
> Please see the previous explanation.
> If condition is for the case when we made the passthrough command stable
> by allocating beforehand.
> Else is for the case when we avoided taking that cost.

The current design of the multipath failover code is clean:
1. extract bio(s) from request
2. link in requeue_list
3. schedule requeue_work that,
  3.1 takes bios 1-by-1, and submits them again (exactly the same way)
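
Condensed from nvme_failover_req()/nvme_requeue_work() (locking and ANA
handling omitted), that flow is roughly:

    /* failover: strip the bios off the failed request and park them */
    blk_steal_bios(&ns->head->requeue_list, req);
    blk_mq_free_request(req);
    kblockd_schedule_work(&ns->head->requeue_work);

    /* requeue_work: resubmit each parked bio exactly the same way */
    while ((bio = bio_list_pop(&head->requeue_list)) != NULL)
    	submit_bio_noacct(bio);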

I'd like us to try to follow the same design where retry is
literally "do the exact same thing, again". That would eliminate
two submission paths that do the same thing, but slightly differently.

> 
>>> +        }
>>> +        if (ret != -EIOCBQUEUED)
>>> +            io_uring_cmd_done(ioucmd, ret, 0);
>>> +    } else if (nvme_available_path(head)) {
>>> +        dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
>>> +        spin_lock_irq(&head->requeue_ioucmd_lock);
>>> +        ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
>>> +        spin_unlock_irq(&head->requeue_ioucmd_lock);
>>> +    } else {
>>> +        dev_warn_ratelimited(dev, "no available path - failing I/O\n");
>>> +        io_uring_cmd_done(ioucmd, -EINVAL, 0);
>>
>> -EIO?
> Can change -EINVAL to -EIO if that is what you prefer.

Yes.
Kanchan Joshi July 12, 2022, 4:23 a.m. UTC | #7
On Mon, Jul 11, 2022 at 10:56:56PM +0300, Sagi Grimberg wrote:
>
>>>>@@ -448,6 +442,14 @@ static int nvme_uring_cmd_io(struct 
>>>>nvme_ctrl *ctrl, struct nvme_ns *ns,
>>>>     pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
>>>>     pdu->meta_len = d.metadata_len;
>>>>+    if (issue_flags & IO_URING_F_MPATH) {
>>>>+        req->cmd_flags |= REQ_NVME_MPATH;
>>>>+        /*
>>>>+         * we would need the buffer address (d.addr field) if we have
>>>>+         * to retry the command. Store it by repurposing ioucmd->cmd
>>>>+         */
>>>>+        ioucmd->cmd = (void *)d.addr;
>>>
>>>What does repurposing mean here?
>>
>>This field (ioucmd->cmd) was pointing to passthrough command (which
>>is embedded in SQE of io_uring). At this point we have consumed
>>passthrough command, so this field can be reused if we have to. And we
>>have to beceause failover needs recreating passthrough command.
>>Please see nvme_uring_cmd_io_retry to see how this helps in recreating
>>the fields of passthrough command. And more on this below.
>
>so ioucmd->cmd starts as an nvme_uring_cmd pointer and continues as
>an address of buffer for a later processing that may or may not
>happen in its lifetime?

Yes. See this as a no-cost way to handle the fast/common path. We manage with
just this assignment.
Otherwise this would have involved high-cost work (allocate/copy/defer)
for later processing that may or may not happen.

>Sounds a bit of a backwards design to me.
>
>>>>+    }
>>>>     blk_execute_rq_nowait(req, false);
>>>>     return -EIOCBQUEUED;
>>>> }
>>>>@@ -665,12 +667,135 @@ int nvme_ns_head_chr_uring_cmd(struct 
>>>>io_uring_cmd *ioucmd,
>>>>     int srcu_idx = srcu_read_lock(&head->srcu);
>>>>     struct nvme_ns *ns = nvme_find_path(head);
>>>>     int ret = -EINVAL;
>>>>+    struct device *dev = &head->cdev_device;
>>>>+
>>>>+    if (likely(ns)) {
>>>>+        ret = nvme_ns_uring_cmd(ns, ioucmd,
>>>>+                issue_flags | IO_URING_F_MPATH);
>>>>+    } else if (nvme_available_path(head)) {
>>>>+        struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>>>+        struct nvme_uring_cmd *payload = NULL;
>>>>+
>>>>+        dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
>>>>+        /*
>>>>+         * We may requeue two kinds of uring commands:
>>>>+         * 1. command failed post submission. pdu->req will be non-null
>>>>+         * for this
>>>>+         * 2. command that could not be submitted because path was not
>>>>+         * available. For this pdu->req is set to NULL.
>>>>+         */
>>>>+        pdu->req = NULL;
>>>
>>>Relying on a pointer does not sound like a good idea to me.
>>>But why do you care at all where did this command came from?
>>>This code should not concern itself what happened prior to this
>>>execution.
>>Required, please see nvme_uring_cmd_io_retry. And this should be more
>>clear as part of responses to your other questions.
>
>I think I understand. But it looks fragile to me.
>
>>
>>>>+        /*
>>>>+         * passthrough command will not be available during retry as it
>>>>+         * is embedded in io_uring's SQE. Hence we allocate/copy here
>>>>+         */
>>>
>>>OK, that is a nice solution.
>>Please note that prefered way is to recreate the passthrough command,
>>and not to allocate it. We allocate it here because this happens very early
>>(i.e. before processing passthrough command and setting that up inside
>>struct request). Recreating requires a populated 'struct request' .
>
>I think that once a driver consumes a command as queued, it needs a
>stable copy of the command (could be in a percpu pool).
>
>The current design of nvme multipathing is that the request is stripped
>from its bios and placed on a requeue_list, and if a request was not
>formed yet (i.e. nvme available path exist but no usable) the bio is
>placed on the requeue_list.
>
>So ideally we have something sufficient like a bio, that can be linked
>on a requeue list, and is sufficient to build a request, at any point in
>time...

We could be dealing with any passthrough command here. A bio is not
guaranteed to exist in the first place; it can very well be NULL.
As I mentioned in the cover letter, this was tested for such passthrough
commands too.

And a bio is not a good fit for this interface. For the block path, the entry
function is nvme_ns_head_submit_bio(), which speaks bio; the request is
allocated afterwards. But since request formation can happen only after
discovering a valid path, it may have to queue the bio if no path
exists.

For passthrough, the interface speaks/understands struct io_uring_cmd.
The request is allocated after that, and a bio may (or may not) be attached to
this request. We may have to queue the command even before a request/bio
gets formed. So the cardinal structure (the one that is
really available all the time) in this case is struct io_uring_cmd.
Hence we use that as the object around which requeuing/failover is
built.
Conceptually, io_uring_cmd is the bio of this interface.
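
The requeue list used here is just a singly linked list of io_uring_cmds
mirroring struct bio_list; its definition is not part of this patch, but
presumably something like:

    struct ioucmd_list {
    	struct io_uring_cmd *head;
    	struct io_uring_cmd *tail;
    };

    static inline void ioucmd_list_add(struct ioucmd_list *il,
    				   struct io_uring_cmd *cmd)
    {
    	cmd->next = NULL;
    	if (il->tail)
    		il->tail->next = cmd;
    	else
    		il->head = cmd;
    	il->tail = cmd;
    }

    /* detach and return the whole chain, like bio_list_get() */
    static inline struct io_uring_cmd *ioucmd_list_get(struct ioucmd_list *il)
    {
    	struct io_uring_cmd *cmd = il->head;

    	il->head = il->tail = NULL;
    	return cmd;
    }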

>>>>+        payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
>>>>+        if (!payload) {
>>>>+            ret = -ENOMEM;
>>>>+            goto out_unlock;
>>>>+        }
>>>>+        memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
>>>>+        ioucmd->cmd = payload;
>>>>-    if (ns)
>>>>-        ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>>>+        spin_lock_irq(&head->requeue_ioucmd_lock);
>>>>+        ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
>>>>+        spin_unlock_irq(&head->requeue_ioucmd_lock);
>>>>+        ret = -EIOCBQUEUED;
>>>>+    } else {
>>>>+        dev_warn_ratelimited(dev, "no available path - failing I/O\n");
>>>
>>>ret=-EIO ?
>>Did not do as it was initialized to -EINVAL. Do you prefer -EIO instead.
>
>It is not an invalid argument error here, it is essentially an IO error.
>So yes, that would be my preference.

sure, will change.

>>>>+    }
>>>>+out_unlock:
>>>>     srcu_read_unlock(&head->srcu, srcu_idx);
>>>>     return ret;
>>>> }
>>>>+
>>>>+int nvme_uring_cmd_io_retry(struct nvme_ns *ns, struct request *oreq,
>>>>+        struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu)
>>>>+{
>>>>+    struct nvme_ctrl *ctrl = ns->ctrl;
>>>>+    struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
>>>>+    struct nvme_uring_data d;
>>>>+    struct nvme_command c;
>>>>+    struct request *req;
>>>>+    struct bio *obio = oreq->bio;
>>>>+    void *meta = NULL;
>>>>+
>>>>+    memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
>>>>+    d.metadata = (__u64)pdu->meta_buffer;
>>>>+    d.metadata_len = pdu->meta_len;
>>>>+    d.timeout_ms = oreq->timeout;
>>>>+    d.addr = (__u64)ioucmd->cmd;
>>>>+    if (obio) {
>>>>+        d.data_len = obio->bi_iter.bi_size;
>>>>+        blk_rq_unmap_user(obio);
>>>>+    } else {
>>>>+        d.data_len = 0;
>>>>+    }
>>>>+    blk_mq_free_request(oreq);
>>>
>>>The way I would do this that in nvme_ioucmd_failover_req (or in the
>>>retry driven from command retriable failure) I would do the above,
>>>requeue it and kick the requeue work, to go over the requeue_list and
>>>just execute them again. Not sure why you even need an explicit retry
>>>code.
>>During retry we need passthrough command. But passthrough command is not
>>stable (i.e. valid only during first submission). We can make it stable
>>either by:
>>(a) allocating in nvme (b) return -EAGAIN to io_uring, and it will 
>>do allocate + deferral
>>Both add a cost. And since any command can potentially fail, that
>>means taking that cost for every IO that we issue on mpath node. Even if
>>no failure (initial or subsquent after IO) occcured.
>
>As mentioned, I think that if a driver consumes a command as queued,
>it needs a stable copy for a later reformation of the request for
>failover purposes.

So what do you propose to make that stable?
As I mentioned earlier, a stable copy requires allocating/copying in the fast
path, and for a condition (failover) that may not even occur.
I really think the current solution is much better, as it does not try to make
it stable; rather it assembles the pieces of the passthrough command if a retry
(which is rare) happens.
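
To make the "assembles the pieces" part concrete, the retry path in this patch
recovers each passthrough field as follows:

    /*
     * nvme_command    <- nvme_req(oreq)->cmd         (copied to the stack)
     * meta buffer/len <- pdu->meta_buffer / pdu->meta_len
     * timeout         <- oreq->timeout
     * data buffer     <- ioucmd->cmd                 (the repurposed d.addr)
     * data length     <- oreq->bio->bi_iter.bi_size  (0 when there is no bio)
     */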

>>So to avoid commmon-path cost, we go about doing nothing (no allocation,
>>no deferral) in the outset and choose to recreate the passthrough
>>command if failure occured. Hope this explains the purpose of
>>nvme_uring_cmd_io_retry?
>
>I think it does, but I'm still having a hard time with it...
>
Maybe I am reiterating, but a few main differences that should help -

- io_uring_cmd is at the forefront, with bio/request as secondary
objects. The former is the persistent object across requeue attempts and the
only thing available when we discover the path, while the other two are
created every time we retry.

- Receiving a bio from the upper layer is a luxury that we do not have for
  passthrough. When we receive a bio, the pages are already mapped and we
  do not have to deal with user-specific fields, so there is more
  freedom in using an arbitrary context (workers etc.). But the passthrough
  command continues to point to user-space fields/buffers, so we need
  that task context.

>>
>>>>+    req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
>>>>+            d.data_len, nvme_to_user_ptr(d.metadata),
>>>>+            d.metadata_len, 0, &meta, d.timeout_ms ?
>>>>+            msecs_to_jiffies(d.timeout_ms) : 0,
>>>>+            ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
>>>>+    if (IS_ERR(req))
>>>>+        return PTR_ERR(req);
>>>>+
>>>>+    req->end_io = nvme_uring_cmd_end_io;
>>>>+    req->end_io_data = ioucmd;
>>>>+    pdu->bio = req->bio;
>>>>+    pdu->meta = meta;
>>>>+    req->cmd_flags |= REQ_NVME_MPATH;
>>>>+    blk_execute_rq_nowait(req, false);
>>>>+    return -EIOCBQUEUED;
>>>>+}
>>>>+
>>>>+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
>>>>+{
>>>>+    struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
>>>>+    struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head,
>>>>+            cdev);
>>>>+    int srcu_idx = srcu_read_lock(&head->srcu);
>>>>+    struct nvme_ns *ns = nvme_find_path(head);
>>>>+    unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
>>>>+        IO_URING_F_MPATH;
>>>>+    struct device *dev = &head->cdev_device;
>>>>+
>>>>+    if (likely(ns)) {
>>>>+        struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>>>+        struct request *oreq = pdu->req;
>>>>+        int ret;
>>>>+
>>>>+        if (oreq == NULL) {
>>>>+            /*
>>>>+             * this was not submitted (to device) earlier. For this
>>>>+             * ioucmd->cmd points to persistent memory. Free that
>>>>+             * up post submission
>>>>+             */
>>>>+            const void *cmd = ioucmd->cmd;
>>>>+
>>>>+            ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>>>+            kfree(cmd);
>>>>+        } else {
>>>>+            /*
>>>>+             * this was submitted (to device) earlier. Use old
>>>>+             * request, bio (if it exists) and nvme-pdu to recreate
>>>>+             * the command for the discovered path
>>>>+             */
>>>>+            ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);
>>>
>>>Why is this needed? Why is reuse important here? Why not always call
>>>nvme_ns_uring_cmd?
>>
>>Please see the previous explanation.
>>If condition is for the case when we made the passthrough command stable
>>by allocating beforehand.
>>Else is for the case when we avoided taking that cost.
>
>The current design of the multipath failover code is clean:
>1. extract bio(s) from request
>2. link in requeue_list
>3. schedule requeue_work that,
> 3.1 takes bios 1-by-1, and submits them again (exactly the same way)

It is really clean, and fits really well with the bio-based entry interface.
But as I said earlier, it does not go well with the uring-cmd based entry
interface, and a bunch of other things need to be done
differently for a generic passthrough command.

>I'd like us to try to follow the same design where retry is
>literally "do the exact same thing, again". That would eliminate
>two submission paths that do the same thing, but slightly different.

The exact same thing is possible if we make the common path slow, i.e.
allocate/copy the passthrough command and keep it alive until completion.
But that is really not the way to go, I suppose.
Christoph Hellwig July 12, 2022, 6:52 a.m. UTC | #8
Hmm, I'm a little confused on what this is trying to achieve.

The io_uring passthrough already does support multipathing: it picks
an available path in nvme_ns_head_chr_uring_cmd and uses that.

What this adds is support for requeueing on failure or on the
lack of an available path.  Which is very strongly against our
passthrough philosophy in both SCSI and NVMe, where error handling
is left entirely to the userspace program issuing the I/O.

So this does radically change behavior in a very unexpected way.
Why?
Kanchan Joshi July 12, 2022, 11:33 a.m. UTC | #9
On Tue, Jul 12, 2022 at 08:52:50AM +0200, Christoph Hellwig wrote:
>Hmm, I'm a little confused on what this is trying to archive.
>
>The io_uring passthrough already does support multipathing, it picks
>an available path in nvme_ns_head_chr_uring_cmd and uses that.
>
>What this does is adding support for requeing on failure or the
>lack of an available path.  Which very strongly is against our
>passthrough philosophy both in SCSI and NVMe where error handling
>is left entirely to the userspace program issuing the I/O.
>
>So this does radically change behavior in a very unexpected way.
>Why?

I think the ask has been to add requeue/failover support for
async-passthrough. This came up here - 
https://lore.kernel.org/linux-nvme/88827a86-1304-e699-ec11-2718e280f9ad@grimberg.me/
Sagi Grimberg July 12, 2022, 8:13 p.m. UTC | #10
> Hmm, I'm a little confused on what this is trying to archive.
> 
> The io_uring passthrough already does support multipathing, it picks
> an available path in nvme_ns_head_chr_uring_cmd and uses that.
> 
> What this does is adding support for requeing on failure or the
> lack of an available path.  Which very strongly is against our
> passthrough philosophy both in SCSI and NVMe where error handling
> is left entirely to the userspace program issuing the I/O.
> 
> So this does radically change behavior in a very unexpected way.
> Why?

I think the difference from scsi-generic and controller nvme passthru is
that this is a mpath device node (or mpath chardev). This is why I think
that users would expect that it would have equivalent multipath
capabilities (i.e. failover).

In general, I think that uring passthru is an alternative I/O interface
and as such needs to be able to fail over. If this is not expected from
the interface, then why are we exposing a chardev for the mpath device
node? Why not only the bottom namespaces?

I can't really imagine a user that would use uring passthru and,
when it gets an error completion, would then try to reconcile whether there
is an available path (from sysfs?), and submit it again in the hope that
an available path is selected by the driver (without really being able
to control any of this)...

Maybe I'm wrong, but it looks like an awkward interface: operate on a
multipath device node, but implement failover yourself, based on some
information that is not necessarily in sync with the driver.
Sagi Grimberg July 12, 2022, 9:26 p.m. UTC | #11
>>>>> @@ -448,6 +442,14 @@ static int nvme_uring_cmd_io(struct nvme_ctrl 
>>>>> *ctrl, struct nvme_ns *ns,
>>>>>      pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
>>>>>      pdu->meta_len = d.metadata_len;
>>>>> +    if (issue_flags & IO_URING_F_MPATH) {
>>>>> +        req->cmd_flags |= REQ_NVME_MPATH;
>>>>> +        /*
>>>>> +         * we would need the buffer address (d.addr field) if we have
>>>>> +         * to retry the command. Store it by repurposing ioucmd->cmd
>>>>> +         */
>>>>> +        ioucmd->cmd = (void *)d.addr;
>>>>
>>>> What does repurposing mean here?
>>>
>>> This field (ioucmd->cmd) was pointing to the passthrough command (which
>>> is embedded in io_uring's SQE). At this point we have consumed the
>>> passthrough command, so this field can be reused if we have to. And we
>>> have to, because failover needs to recreate the passthrough command.
>>> Please see nvme_uring_cmd_io_retry to see how this helps in recreating
>>> the fields of the passthrough command. And more on this below.
>>
>> So ioucmd->cmd starts as an nvme_uring_cmd pointer and continues as
>> the address of a buffer, for later processing that may or may not
>> happen in its lifetime?
> 
> Yes. See this as a no-cost way to handle the fast/common path. We manage by
> doing just this assignment.
> Otherwise this would have involved high-cost (allocate/copy/deferral)
> work for later processing that may or may not happen.

I see it as a hacky no-cost way...

> 
>> Sounds a bit of a backwards design to me.
>>
>>>>> +    }
>>>>>      blk_execute_rq_nowait(req, false);
>>>>>      return -EIOCBQUEUED;
>>>>>  }
>>>>> @@ -665,12 +667,135 @@ int nvme_ns_head_chr_uring_cmd(struct 
>>>>> io_uring_cmd *ioucmd,
>>>>>      int srcu_idx = srcu_read_lock(&head->srcu);
>>>>>      struct nvme_ns *ns = nvme_find_path(head);
>>>>>      int ret = -EINVAL;
>>>>> +    struct device *dev = &head->cdev_device;
>>>>> +
>>>>> +    if (likely(ns)) {
>>>>> +        ret = nvme_ns_uring_cmd(ns, ioucmd,
>>>>> +                issue_flags | IO_URING_F_MPATH);
>>>>> +    } else if (nvme_available_path(head)) {
>>>>> +        struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>>>> +        struct nvme_uring_cmd *payload = NULL;
>>>>> +
>>>>> +        dev_warn_ratelimited(dev, "no usable path - requeuing 
>>>>> I/O\n");
>>>>> +        /*
>>>>> +         * We may requeue two kinds of uring commands:
>>>>> +         * 1. command failed post submission. pdu->req will be 
>>>>> non-null
>>>>> +         * for this
>>>>> +         * 2. command that could not be submitted because path was 
>>>>> not
>>>>> +         * available. For this pdu->req is set to NULL.
>>>>> +         */
>>>>> +        pdu->req = NULL;
>>>>
>>>> Relying on a pointer does not sound like a good idea to me.
>>>> But why do you care at all where this command came from?
>>>> This code should not concern itself what happened prior to this
>>>> execution.
>>> Required, please see nvme_uring_cmd_io_retry. This should become
>>> clearer as part of the responses to your other questions.
>>
>> I think I understand. But it looks fragile to me.
>>
>>>
>>>>> +        /*
>>>>> +         * passthrough command will not be available during retry 
>>>>> as it
>>>>> +         * is embedded in io_uring's SQE. Hence we allocate/copy here
>>>>> +         */
>>>>
>>>> OK, that is a nice solution.
>>> Please note that the preferred way is to recreate the passthrough command,
>>> and not to allocate it. We allocate it here because this happens very early
>>> (i.e. before processing the passthrough command and setting it up inside
>>> struct request). Recreating requires a populated 'struct request'.
>>
>> I think that once a driver consumes a command as queued, it needs a
>> stable copy of the command (could be in a percpu pool).
>>
>> The current design of nvme multipathing is that the request is stripped
>> from its bios and placed on a requeue_list, and if a request was not
>> formed yet (i.e. an nvme path is available but not usable) the bio is
>> placed on the requeue_list.
>>
>> So ideally we have something sufficient like a bio, that can be linked
>> on a requeue list, and is sufficient to build a request, at any point in
>> time...
> 
> We could be dealing with any passthrough command here. A bio is not
> guaranteed to exist in the first place. It can very well be NULL.
> As I mentioned in the cover letter, this was tested for such passthrough
> commands too.
> And a bio is not a good fit for this interface. For the block path, the entry
> function is nvme_ns_head_submit_bio(), which speaks bio. The request is
> allocated afterwards. But since request formation can happen only after
> discovering a valid path, it may have to queue the bio if a path does not
> exist.
> For passthrough, the interface speaks/understands struct io_uring_cmd.
> The request is allocated afterwards. And a bio may (or may not) be attached
> to this request. We may have to queue the command even before the request/bio
> gets formed. So the cardinal structure (which is really available all the
> time) in this case is struct io_uring_cmd. Hence we use that as the object
> around which requeuing/failover is built.
> Conceptually, io_uring_cmd is the bio of this interface.

I didn't say bio, I said something like a bio that holds stable
information that could be used for requeue purposes...

> 
>>>>> +        payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
>>>>> +        if (!payload) {
>>>>> +            ret = -ENOMEM;
>>>>> +            goto out_unlock;
>>>>> +        }
>>>>> +        memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
>>>>> +        ioucmd->cmd = payload;
>>>>> -    if (ns)
>>>>> -        ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>>>> +        spin_lock_irq(&head->requeue_ioucmd_lock);
>>>>> +        ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
>>>>> +        spin_unlock_irq(&head->requeue_ioucmd_lock);
>>>>> +        ret = -EIOCBQUEUED;
>>>>> +    } else {
>>>>> +        dev_warn_ratelimited(dev, "no available path - failing 
>>>>> I/O\n");
>>>>
>>>> ret=-EIO ?
>>> I did not, as it was initialized to -EINVAL. Do you prefer -EIO instead?
>>
>> It is not an invalid argument error here, it is essentially an IO error.
>> So yes, that would be my preference.
> 
> sure, will change.
> 
>>>>> +    }
>>>>> +out_unlock:
>>>>>      srcu_read_unlock(&head->srcu, srcu_idx);
>>>>>      return ret;
>>>>>  }
>>>>> +
>>>>> +int nvme_uring_cmd_io_retry(struct nvme_ns *ns, struct request *oreq,
>>>>> +        struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu)
>>>>> +{
>>>>> +    struct nvme_ctrl *ctrl = ns->ctrl;
>>>>> +    struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
>>>>> +    struct nvme_uring_data d;
>>>>> +    struct nvme_command c;
>>>>> +    struct request *req;
>>>>> +    struct bio *obio = oreq->bio;
>>>>> +    void *meta = NULL;
>>>>> +
>>>>> +    memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
>>>>> +    d.metadata = (__u64)pdu->meta_buffer;
>>>>> +    d.metadata_len = pdu->meta_len;
>>>>> +    d.timeout_ms = oreq->timeout;
>>>>> +    d.addr = (__u64)ioucmd->cmd;
>>>>> +    if (obio) {
>>>>> +        d.data_len = obio->bi_iter.bi_size;
>>>>> +        blk_rq_unmap_user(obio);
>>>>> +    } else {
>>>>> +        d.data_len = 0;
>>>>> +    }
>>>>> +    blk_mq_free_request(oreq);
>>>>
>>>> The way I would do this is that in nvme_ioucmd_failover_req (or in the
>>>> retry driven from command retriable failure) I would do the above,
>>>> requeue it and kick the requeue work, to go over the requeue_list and
>>>> just execute them again. Not sure why you even need an explicit retry
>>>> code.
>>> During retry we need the passthrough command. But the passthrough command
>>> is not stable (i.e. valid only during first submission). We can make it
>>> stable either by:
>>> (a) allocating in nvme, or (b) returning -EAGAIN to io_uring, which will do
>>> allocate + deferral.
>>> Both add a cost. And since any command can potentially fail, that
>>> means taking that cost for every IO that we issue on the mpath node, even
>>> if no failure (initial or subsequent after IO) occurred.
>>
>> As mentioned, I think that if a driver consumes a command as queued,
>> it needs a stable copy for a later reformation of the request for
>> failover purposes.
> 
> So what do you propose to make that stable?
> As I mentioned earlier, a stable copy requires allocating/copying in the fast
> path, and for a condition (failover) that may not even occur.
> I really think the current solution is much better as it does not try to make
> it stable. Rather, it assembles the pieces of the passthrough command if a
> retry (which is rare) happens.

Well, I can understand that io_uring_cmd is space constrained, otherwise
we wouldn't be having this discussion. However io_kiocb is less
constrained, and could be used as a context to hold such a space.

Even if it is undesired to have io_kiocb passed to uring_cmd(), it
can still hold a driver-specific space paired with a helper to obtain it
(i.e. something like io_uring_cmd_to_driver_ctx(ioucmd)). Then, if the
space is pre-allocated, a stable copy costs only a small memcpy, which
would allow a saner failover design.
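
For illustration, a rough sketch of what that pairing could look like (the
layout, the 80-byte size and all the names here are made up; nothing like
this exists today):

#define URING_CMD_DRIVER_CTX_SIZE	80	/* room for a stable nvme_uring_cmd copy */

struct io_kiocb_cmd_area {
	struct io_uring_cmd	ioucmd;
	u8			driver_ctx[URING_CMD_DRIVER_CTX_SIZE];
};

static inline void *io_uring_cmd_to_driver_ctx(struct io_uring_cmd *ioucmd)
{
	/* assumes the io_uring_cmd is embedded in the area above */
	return container_of(ioucmd, struct io_kiocb_cmd_area, ioucmd)->driver_ctx;
}

/* nvme side: take a stable copy of the SQE payload while queueing */
static void nvme_uring_cmd_stash(struct io_uring_cmd *ioucmd)
{
	struct nvme_uring_cmd *stable = io_uring_cmd_to_driver_ctx(ioucmd);

	memcpy(stable, ioucmd->cmd, sizeof(*stable));
	ioucmd->cmd = stable;	/* any later requeue reads the stable copy */
}

With something like that, the failover path could simply resubmit from the
stable copy instead of reassembling the command from the old request.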

>>> So to avoid commmon-path cost, we go about doing nothing (no allocation,
>>> no deferral) in the outset and choose to recreate the passthrough
>>> command if failure occured. Hope this explains the purpose of
>>> nvme_uring_cmd_io_retry?
>>
>> I think it does, but I'm still having a hard time with it...
>>
> Maybe I am reiterating, but here are a few main differences that should help -
> 
> - io_uring_cmd is at the forefront, and bio/request are secondary
> objects. The former is a persistent object across requeue attempts and the
> only thing available when we discover the path, while the other two are
> created every time we retry.
> 
> - Receiving a bio from the upper layer is a luxury that we do not have for
>   passthrough. When we receive a bio, pages are already mapped and we
>   do not have to deal with user-specific fields, so there is more
>   freedom in using arbitrary context (workers etc.). But a passthrough
>   command continues to point to user-space fields/buffers, so we need
>   that task context.
> 
>>>
>>>>> +    req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
>>>>> +            d.data_len, nvme_to_user_ptr(d.metadata),
>>>>> +            d.metadata_len, 0, &meta, d.timeout_ms ?
>>>>> +            msecs_to_jiffies(d.timeout_ms) : 0,
>>>>> +            ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
>>>>> +    if (IS_ERR(req))
>>>>> +        return PTR_ERR(req);
>>>>> +
>>>>> +    req->end_io = nvme_uring_cmd_end_io;
>>>>> +    req->end_io_data = ioucmd;
>>>>> +    pdu->bio = req->bio;
>>>>> +    pdu->meta = meta;
>>>>> +    req->cmd_flags |= REQ_NVME_MPATH;
>>>>> +    blk_execute_rq_nowait(req, false);
>>>>> +    return -EIOCBQUEUED;
>>>>> +}
>>>>> +
>>>>> +void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
>>>>> +{
>>>>> +    struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
>>>>> +    struct nvme_ns_head *head = container_of(cdev, struct 
>>>>> nvme_ns_head,
>>>>> +            cdev);
>>>>> +    int srcu_idx = srcu_read_lock(&head->srcu);
>>>>> +    struct nvme_ns *ns = nvme_find_path(head);
>>>>> +    unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
>>>>> +        IO_URING_F_MPATH;
>>>>> +    struct device *dev = &head->cdev_device;
>>>>> +
>>>>> +    if (likely(ns)) {
>>>>> +        struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>>>> +        struct request *oreq = pdu->req;
>>>>> +        int ret;
>>>>> +
>>>>> +        if (oreq == NULL) {
>>>>> +            /*
>>>>> +             * this was not submitted (to device) earlier. For this
>>>>> +             * ioucmd->cmd points to persistent memory. Free that
>>>>> +             * up post submission
>>>>> +             */
>>>>> +            const void *cmd = ioucmd->cmd;
>>>>> +
>>>>> +            ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>>>> +            kfree(cmd);
>>>>> +        } else {
>>>>> +            /*
>>>>> +             * this was submitted (to device) earlier. Use old
>>>>> +             * request, bio (if it exists) and nvme-pdu to recreate
>>>>> +             * the command for the discovered path
>>>>> +             */
>>>>> +            ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);
>>>>
>>>> Why is this needed? Why is reuse important here? Why not always call
>>>> nvme_ns_uring_cmd?
>>>
>>> Please see the previous explanation.
>>> The 'if' branch is for the case when we made the passthrough command stable
>>> by allocating it beforehand.
>>> The 'else' branch is for the case when we avoided taking that cost.
>>
>> The current design of the multipath failover code is clean:
>> 1. extract bio(s) from request
>> 2. link in requeue_list
>> 3. schedule requeue_work that,
>> 3.1 takes bios 1-by-1, and submits them again (exactly the same way)
> 
> It is really clean, and fits really well with the bio-based entry interface.
> But as I said earlier, it does not go well with the uring-cmd based entry
> interface, and there are a bunch of other things which need to be done
> differently for a generic passthrough command.
> 
>> I'd like us to try to follow the same design where retry is
>> literally "do the exact same thing, again". That would eliminate
>> two submission paths that do the same thing, but slightly different.
> 
> The exact same thing is possible if we make the common path slow, i.e.
> allocate/copy the passthrough command and keep it alive until completion.
> But that is really not the way to go, I suppose.

I'm not sure. With Christoph's response, I'm not sure it is
universally desired to support failover (in my opinion it should be). But
if we do in fact choose to support it, I think we need a better
solution. If fast-path allocation is your prime concern, then let's try
to address that with space pre-allocation.
Christoph Hellwig July 13, 2022, 5:36 a.m. UTC | #12
On Tue, Jul 12, 2022 at 11:13:57PM +0300, Sagi Grimberg wrote:
> I think the difference from scsi-generic and controller nvme passthru is
> that this is a mpath device node (or mpath chardev). This is why I think
> that users would expect that it would have equivalent multipath
> capabilities (i.e. failover).

How is that different from /dev/sg?

> In general, I think that uring passthru as an alternative I/O interface
> and as such needs to be able to failover. If this is not expected from
> the interface, then why are we exposing a chardev for the mpath device
> node? why not only the bottom namespaces?

The failover will happen when you retry, but we leave that retry to
userspace.  There is even a uevent to tell userspace when a new
path is up.
Kanchan Joshi July 13, 2022, 5:37 a.m. UTC | #13
>>>>>The way I would do this that in nvme_ioucmd_failover_req (or in the
>>>>>retry driven from command retriable failure) I would do the above,
>>>>>requeue it and kick the requeue work, to go over the requeue_list and
>>>>>just execute them again. Not sure why you even need an explicit retry
>>>>>code.
>>>>During retry we need passthrough command. But passthrough command is not
>>>>stable (i.e. valid only during first submission). We can make it stable
>>>>either by:
>>>>(a) allocating in nvme (b) return -EAGAIN to io_uring, and it 
>>>>will do allocate + deferral
>>>>Both add a cost. And since any command can potentially fail, that
>>>>means taking that cost for every IO that we issue on mpath node. Even if
>>>>no failure (initial or subsquent after IO) occcured.
>>>
>>>As mentioned, I think that if a driver consumes a command as queued,
>>>it needs a stable copy for a later reformation of the request for
>>>failover purposes.
>>
>>So what do you propose to make that stable?
>>As I mentioned earlier, stable copy requires allocating/copying in fast
>>path. And for a condition (failover) that may not even occur.
>>I really think currrent solution is much better as it does not try to make
>>it stable. Rather it assembles pieces of passthrough command if retry
>>(which is rare) happens.
>
>Well, I can understand that io_uring_cmd is space constrained, otherwise
>we wouldn't be having this discussion. 

Indeed. If we had space for keeping the passthrough command stable for
retry, that would really have simplified the plumbing. The retry logic would
be the same as the first submission.

>However io_kiocb is less
>constrained, and could be used as a context to hold such a space.
>
>Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
>can still hold a driver specific space paired with a helper to obtain it
>(i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
>space is pre-allocated it is only a small memory copy for a stable copy
>that would allow a saner failover design.

I am thinking along the same lines, but it's not about a few bytes of
space; rather we need 80 (72 to be precise). Will think more, but
these 72 bytes really stand tall in front of my optimism.

Do you see anything that is possible on the nvme side?
Right now the passthrough command (although in a modified form) already gets
copied into a preallocated space, i.e. nvme_req(req)->cmd. This part -

void nvme_init_request(struct request *req, struct nvme_command *cmd) 
{
	...
	memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); 
}

>>>>So to avoid commmon-path cost, we go about doing nothing (no allocation,
>>>>no deferral) in the outset and choose to recreate the passthrough
>>>>command if failure occured. Hope this explains the purpose of
>>>>nvme_uring_cmd_io_retry?
>>>
>>>I think it does, but I'm still having a hard time with it...
>>>
>>Maybe I am reiterating but few main differences that should help -
>>
>>- io_uring_cmd is at the forefront, and bio/request as secondary
>>objects. Former is persistent object across requeue attempts and the
>>only thing available when we discover the path, while other two are
>>created every time we retry.
>>
>>- Receiving bio from upper layer is a luxury that we do not have for
>>  passthrough. When we receive bio, pages are already mapped and we
>>  do not have to deal with user-specific fields, so there is more
>>  freedom in using arbitrary context (workers etc.). But passthrough
>>  command continues to point to user-space fields/buffers, so we need
>>  that task context.
>>
>>>>
>>>>>>+    req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
>>>>>>+            d.data_len, nvme_to_user_ptr(d.metadata),
>>>>>>+            d.metadata_len, 0, &meta, d.timeout_ms ?
>>>>>>+            msecs_to_jiffies(d.timeout_ms) : 0,
>>>>>>+            ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
>>>>>>+    if (IS_ERR(req))
>>>>>>+        return PTR_ERR(req);
>>>>>>+
>>>>>>+    req->end_io = nvme_uring_cmd_end_io;
>>>>>>+    req->end_io_data = ioucmd;
>>>>>>+    pdu->bio = req->bio;
>>>>>>+    pdu->meta = meta;
>>>>>>+    req->cmd_flags |= REQ_NVME_MPATH;
>>>>>>+    blk_execute_rq_nowait(req, false);
>>>>>>+    return -EIOCBQUEUED;
>>>>>>+}
>>>>>>+
>>>>>>+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
>>>>>>+{
>>>>>>+    struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
>>>>>>+    struct nvme_ns_head *head = container_of(cdev, struct 
>>>>>>nvme_ns_head,
>>>>>>+            cdev);
>>>>>>+    int srcu_idx = srcu_read_lock(&head->srcu);
>>>>>>+    struct nvme_ns *ns = nvme_find_path(head);
>>>>>>+    unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
>>>>>>+        IO_URING_F_MPATH;
>>>>>>+    struct device *dev = &head->cdev_device;
>>>>>>+
>>>>>>+    if (likely(ns)) {
>>>>>>+        struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>>>>>>+        struct request *oreq = pdu->req;
>>>>>>+        int ret;
>>>>>>+
>>>>>>+        if (oreq == NULL) {
>>>>>>+            /*
>>>>>>+             * this was not submitted (to device) earlier. For this
>>>>>>+             * ioucmd->cmd points to persistent memory. Free that
>>>>>>+             * up post submission
>>>>>>+             */
>>>>>>+            const void *cmd = ioucmd->cmd;
>>>>>>+
>>>>>>+            ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
>>>>>>+            kfree(cmd);
>>>>>>+        } else {
>>>>>>+            /*
>>>>>>+             * this was submitted (to device) earlier. Use old
>>>>>>+             * request, bio (if it exists) and nvme-pdu to recreate
>>>>>>+             * the command for the discovered path
>>>>>>+             */
>>>>>>+            ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);
>>>>>
>>>>>Why is this needed? Why is reuse important here? Why not always call
>>>>>nvme_ns_uring_cmd?
>>>>
>>>>Please see the previous explanation.
>>>>If condition is for the case when we made the passthrough command stable
>>>>by allocating beforehand.
>>>>Else is for the case when we avoided taking that cost.
>>>
>>>The current design of the multipath failover code is clean:
>>>1. extract bio(s) from request
>>>2. link in requeue_list
>>>3. schedule requeue_work that,
>>>3.1 takes bios 1-by-1, and submits them again (exactly the same way)
>>
>>It is really clean, and fits really well with bio based entry interface.
>>But as I said earlier, it does not go well with uring-cmd based entry
>>interface, and bunch of of other things which needs to be done
>>differently for generic passthrough command.
>>
>>>I'd like us to try to follow the same design where retry is
>>>literally "do the exact same thing, again". That would eliminate
>>>two submission paths that do the same thing, but slightly different.
>>
>>Exact same thing is possible if we make the common path slow i.e.
>>allocate/copy passthrough command and keep it alive until completion.
>>But that is really not the way to go I suppose.
>
>I'm not sure. With Christoph's response, I'm not sure it is
>universally desired to support failover (in my opinion it should). But
>if we do in fact choose to support it, I think we need a better
>solution. If fast-path allocation is your prime concern, then let's try
>to address that with space pre-allocation.

Sure. I understand the benefit that space pre-allocation will give.

And overall, these are the top three things to iron out: 
- whether to do failover at all
- a better way to keep the passthrough-cmd stable
- a better way to link io_uring_cmd (a rough sketch of one option below)
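
For the last point, an intrusive singly-linked list is probably all that is
needed. A sketch, purely illustrative and not the ioucmd_list_add from this
patch; the 'next' pointer is an assumed field in (or next to) io_uring_cmd:

struct ioucmd_list {
	struct io_uring_cmd	*head;
	struct io_uring_cmd	*tail;
};

static void ioucmd_requeue_add(struct ioucmd_list *il, struct io_uring_cmd *cmd)
{
	cmd->next = NULL;		/* assumed field, see above */
	if (il->tail)
		il->tail->next = cmd;
	else
		il->head = cmd;
	il->tail = cmd;
}

static struct io_uring_cmd *ioucmd_requeue_pop(struct ioucmd_list *il)
{
	struct io_uring_cmd *cmd = il->head;

	if (cmd) {
		il->head = cmd->next;
		if (!il->head)
			il->tail = NULL;
	}
	return cmd;
}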
Sagi Grimberg July 13, 2022, 8:04 a.m. UTC | #14
>> I think the difference from scsi-generic and controller nvme passthru is
>> that this is a mpath device node (or mpath chardev). This is why I think
>> that users would expect that it would have equivalent multipath
>> capabilities (i.e. failover).
> 
> How is that different from /dev/sg?

/dev/sg is not a multipath device.

Maybe the solution is to just not expose a /dev/ng for the mpath device
node, but only for bottom namespaces. Then it would be completely
equivalent to scsi-generic devices.

It just creates an unexpected mix of semantics of best-effort
multipathing with just path selection, but no requeue/failover...

>> In general, I think that uring passthru as an alternative I/O interface
>> and as such needs to be able to failover. If this is not expected from
>> the interface, then why are we exposing a chardev for the mpath device
>> node? why not only the bottom namespaces?
> 
> The failover will happen when you retry, but we leave that retry to
> userspace.  There even is the uevent to tell userspace when a new
> path is up.

If the user needs to do the retry, and to discover and understand namespace
paths, ANA states, controller states, etc., why does the user need an
mpath chardev at all?

The user basically needs to understand everything, including (indirectly)
path selection for the I/O. IMO it is cleaner for the user to just have
the bottom devices and do the path selection directly.
Sagi Grimberg July 13, 2022, 9:03 a.m. UTC | #15
>> However io_kiocb is less
>> constrained, and could be used as a context to hold such a space.
>>
>> Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
>> can still hold a driver specific space paired with a helper to obtain it
>> (i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
>> space is pre-allocated it is only a small memory copy for a stable copy
>> that would allow a saner failover design.
> 
> I am thinking along the same lines, but it's not about few bytes of
> space rather we need 80 (72 to be precise). Will think more, but
> these 72 bytes really stand tall in front of my optimism.

You don't have to populate this space on every I/O; you can just
populate it when there is no usable path or when you fail over a
request...
Christoph Hellwig July 13, 2022, 10:12 a.m. UTC | #16
On Wed, Jul 13, 2022 at 11:04:31AM +0300, Sagi Grimberg wrote:
> Maybe the solution is to just not expose a /dev/ng for the mpath device
> node, but only for bottom namespaces. Then it would be completely
> equivalent to scsi-generic devices.
>
> It just creates an unexpected mix of semantics of best-effort
> multipathing with just path selection, but no requeue/failover...

Which is exactly the same semantics as SG_IO on the dm-mpath nodes.

> If the user needs to do the retry, discover and understand namespace
> paths, ANA states, controller states, etc. Why does the user need a
> mpath chardev at all?

The user needs to do that for all kinds of other reasons anyway,
as we don't do any retries for passthrough at all.
Sagi Grimberg July 13, 2022, 11 a.m. UTC | #17
>> Maybe the solution is to just not expose a /dev/ng for the mpath device
>> node, but only for bottom namespaces. Then it would be completely
>> equivalent to scsi-generic devices.
>>
>> It just creates an unexpected mix of semantics of best-effort
>> multipathing with just path selection, but no requeue/failover...
> 
> Which is exactly the same semantics as SG_IO on the dm-mpath nodes.

I view uring passthru somewhat as a different thing than sending SG_IO
ioctls to dm-mpath. But it can be argued otherwise.

BTW, the only consumer of it that I'm aware of commented that he
expects dm-mpath to retry SG_IO, back when a submission adding SG_IO retry
to dm-mpath was attempted (https://www.spinics.net/lists/dm-devel/msg46924.html).

 From Paolo:
"The problem is that userspace does not have a way to direct the command 
to a different path in the resubmission. It may not even have permission 
to issue DM_TABLE_STATUS, or to access the /dev nodes for the underlying 
paths, so without Martin's patches SG_IO on dm-mpath is basically 
unreliable by design."

I didn't manage to track down any followup after that email though...

>> If the user needs to do the retry, discover and understand namespace
>> paths, ANA states, controller states, etc. Why does the user need a
>> mpath chardev at all?
> 
> The user needs to do that for all kinds of other resons anyway,
> as we don't do any retries for passthrough at all.

I still think that there is a problem with the existing semantics for
passthru requests over mpath device nodes.

Again, I think it will actually be cleaner not to expose passthru
devices for mpath at all if we are not going to support retry/failover.
Christoph Hellwig July 13, 2022, 11:28 a.m. UTC | #18
On Wed, Jul 13, 2022 at 02:00:56PM +0300, Sagi Grimberg wrote:
> I view uring passthru somewhat as a different thing than sending SG_IO
> ioctls to dm-mpath. But it can be argued otherwise.
>
> BTW, the only consumer of it that I'm aware of commented that he
> expects dm-mpath to retry SG_IO when dm-mpath retry for SG_IO submission
> was attempted (https://www.spinics.net/lists/dm-devel/msg46924.html).

Yeah.  But the point is that if we have a path failure, the kernel
will pick a new path next time anyway, both in dm-mpath and nvme-mpath.

> I still think that there is a problem with the existing semantics for
> passthru requests over mpath device nodes.
>
> Again, I think it will actually be cleaner not to expose passthru
> devices for mpath at all if we are not going to support retry/failover.

I think they are very useful here.  Users of the passthrough interface
need to be able to retry anyway, even on non-multipath setups.  And
a dumb retry will do the right thing.
Kanchan Joshi July 13, 2022, 11:28 a.m. UTC | #19
On Wed, Jul 13, 2022 at 12:03:48PM +0300, Sagi Grimberg wrote:
>
>>>However io_kiocb is less
>>>constrained, and could be used as a context to hold such a space.
>>>
>>>Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
>>>can still hold a driver specific space paired with a helper to obtain it
>>>(i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
>>>space is pre-allocated it is only a small memory copy for a stable copy
>>>that would allow a saner failover design.
>>
>>I am thinking along the same lines, but it's not about few bytes of
>>space rather we need 80 (72 to be precise). Will think more, but
>>these 72 bytes really stand tall in front of my optimism.
>
>You don't have to populate this space on every I/O, you can just
>populate it when there is no usable path and when you failover a
>request...

Getting the space, and when/how to populate it - related but different
topics in this context.

It is about the lifetime of the SQE, which is valid only for the first
submission. If we don't make the command stable at that point, we don't
have another chance. And that is exactly what happens for failover:
we only know an IO has failed when it fails, but by that time the
original passthrough command is already gone. I think if we somehow
get the space (preallocated), it is OK to copy the command for every IO
in the mpath case.

The other part (no usable path) is fine, because we hit that condition
during initial submission and therefore have the chance to allocate/copy
the passthrough command. This patch already does that.
Hannes Reinecke July 13, 2022, 11:49 a.m. UTC | #20
On 7/13/22 13:00, Sagi Grimberg wrote:
> 
>>> Maybe the solution is to just not expose a /dev/ng for the mpath device
>>> node, but only for bottom namespaces. Then it would be completely
>>> equivalent to scsi-generic devices.
>>>
>>> It just creates an unexpected mix of semantics of best-effort
>>> multipathing with just path selection, but no requeue/failover...
>>
>> Which is exactly the same semanics as SG_IO on the dm-mpath nodes.
> 
> I view uring passthru somewhat as a different thing than sending SG_IO
> ioctls to dm-mpath. But it can be argued otherwise.
> 
> BTW, the only consumer of it that I'm aware of commented that he
> expects dm-mpath to retry SG_IO when dm-mpath retry for SG_IO submission
> was attempted (https://www.spinics.net/lists/dm-devel/msg46924.html).
> 
>  From Paolo:
> "The problem is that userspace does not have a way to direct the command 
> to a different path in the resubmission. It may not even have permission 
> to issue DM_TABLE_STATUS, or to access the /dev nodes for the underlying 
> paths, so without Martin's patches SG_IO on dm-mpath is basically 
> unreliable by design."
> 
> I didn't manage to track down any followup after that email though...
> 
I did; 'twas me who was involved in the initial customer issue leading 
up to that.

Amongst all the other issues we've found, the prime problem with SG_IO is
that it needs to be directed to the 'active' path.
For that the device-mapper has a distinct callout (dm_prepare_ioctl), which
essentially returns the current active path device. And then the
device-mapper core issues the command on that active path.

All nice and good, _unless_ that command triggers an error.
Normally it'd be intercepted by the dm-multipath end_io handler, and 
would set the path to offline.
But as ioctls do not use the normal I/O path the end_io handler is never 
called, and further SG_IO calls are happily routed down the failed path.

And the customer had to use SG_IO (or, in qemu-speak, LUN passthrough) 
as his application/filesystem makes heavy use of persistent reservations.

So yeah, nvme-multipathing should be okay here, as io_uring and normal 
I/O are using the same code paths, hence the above scenario really won't 
occur.

Cheers,

Hannes
Sagi Grimberg July 13, 2022, 12:16 p.m. UTC | #21
>> I view uring passthru somewhat as a different thing than sending SG_IO
>> ioctls to dm-mpath. But it can be argued otherwise.
>>
>> BTW, the only consumer of it that I'm aware of commented that he
>> expects dm-mpath to retry SG_IO when dm-mpath retry for SG_IO submission
>> was attempted (https://www.spinics.net/lists/dm-devel/msg46924.html).
> 
> Yeah.  But the point is that if we have a path failure, the kernel
> will pick a new path next time anyway, both in dm-mpath and nvme-mpath.

If such a path is available at all.

>> I still think that there is a problem with the existing semantics for
>> passthru requests over mpath device nodes.
>>
>> Again, I think it will actually be cleaner not to expose passthru
>> devices for mpath at all if we are not going to support retry/failover.
> 
> I think they are very useful here.  Users of passthrough interface
> need to be able to retry anyway, even on non-multipath setups.  And
> a dumb retry will do the right thing.

I think you are painting a simple picture while this is not necessarily
the case. It is not a dumb retry, because the user needs to determine
whether an available path for this particular namespace exists, or wait for
one if it doesn't want to spin in a constant submit/fail loop.

A passthru interface does not mean that the user by definition needs to
understand multipathing, ana/ctrl/ns states/mappings, etc. The user may
just want to be able to issue vendor-specific commands to the device.

If the user needs to understand multipathing by definition, he/she has
zero use for an mpath passthru device if it doesn't retry, IMO.
Sagi Grimberg July 13, 2022, 12:17 p.m. UTC | #22
>>>> However io_kiocb is less
>>>> constrained, and could be used as a context to hold such a space.
>>>>
>>>> Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
>>>> can still hold a driver specific space paired with a helper to 
>>>> obtain it
>>>> (i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
>>>> space is pre-allocated it is only a small memory copy for a stable copy
>>>> that would allow a saner failover design.
>>>
>>> I am thinking along the same lines, but it's not about few bytes of
>>> space rather we need 80 (72 to be precise). Will think more, but
>>> these 72 bytes really stand tall in front of my optimism.
>>
>> You don't have to populate this space on every I/O, you can just
>> populate it when there is no usable path and when you failover a
>> request...
> 
> Getting the space and when/how to populate it - related but diferent
> topics in this context.
> 
> It is about the lifetime of SQE which is valid only for the first
> submission. If we don't make the command stable at that point, we don't
> have another chance. And that is exactly what happens for failover.
> Since we know IO is failed only when it fails, but by that time
> original passthrough-command is gone out of hand. I think if we somehow
> get the space (preallocated), it is ok to copy to command for every IO
> in mpath case.

Yeah, you're right. You need to populate it as soon as you queue the
uring command.
Sagi Grimberg July 13, 2022, 12:43 p.m. UTC | #23
On 7/13/22 14:49, Hannes Reinecke wrote:
> On 7/13/22 13:00, Sagi Grimberg wrote:
>>
>>>> Maybe the solution is to just not expose a /dev/ng for the mpath device
>>>> node, but only for bottom namespaces. Then it would be completely
>>>> equivalent to scsi-generic devices.
>>>>
>>>> It just creates an unexpected mix of semantics of best-effort
>>>> multipathing with just path selection, but no requeue/failover...
>>>
>>> Which is exactly the same semanics as SG_IO on the dm-mpath nodes.
>>
>> I view uring passthru somewhat as a different thing than sending SG_IO
>> ioctls to dm-mpath. But it can be argued otherwise.
>>
>> BTW, the only consumer of it that I'm aware of commented that he
>> expects dm-mpath to retry SG_IO when dm-mpath retry for SG_IO submission
>> was attempted (https://www.spinics.net/lists/dm-devel/msg46924.html).
>>
>>  From Paolo:
>> "The problem is that userspace does not have a way to direct the 
>> command to a different path in the resubmission. It may not even have 
>> permission to issue DM_TABLE_STATUS, or to access the /dev nodes for 
>> the underlying paths, so without Martin's patches SG_IO on dm-mpath is 
>> basically unreliable by design."
>>
>> I didn't manage to track down any followup after that email though...
>>
> I did; 'twas me who was involved in the initial customer issue leading 
> up to that.
> 
> Amongst all the other issue we've found the prime problem with SG_IO is 
> that it needs to be directed to the 'active' path.
> For the device-mapper has a distinct callout (dm_prepare_ioctl), which 
> essentially returns the current active path device. And then the 
> device-mapper core issues the command on that active path.
> 
> All nice and good, _unless_ that command triggers an error.
> Normally it'd be intercepted by the dm-multipath end_io handler, and 
> would set the path to offline.
> But as ioctls do not use the normal I/O path the end_io handler is never 
> called, and further SG_IO calls are happily routed down the failed path.
> 
> And the customer had to use SG_IO (or, in qemu-speak, LUN passthrough) 
> as his application/filesystem makes heavy use of persistent reservations.

How did this conclude Hannes?
Hannes Reinecke July 13, 2022, 1:30 p.m. UTC | #24
On 7/13/22 14:43, Sagi Grimberg wrote:
> 
> 
> On 7/13/22 14:49, Hannes Reinecke wrote:
>> On 7/13/22 13:00, Sagi Grimberg wrote:
>>>
>>>>> Maybe the solution is to just not expose a /dev/ng for the mpath 
>>>>> device
>>>>> node, but only for bottom namespaces. Then it would be completely
>>>>> equivalent to scsi-generic devices.
>>>>>
>>>>> It just creates an unexpected mix of semantics of best-effort
>>>>> multipathing with just path selection, but no requeue/failover...
>>>>
>>>> Which is exactly the same semanics as SG_IO on the dm-mpath nodes.
>>>
>>> I view uring passthru somewhat as a different thing than sending SG_IO
>>> ioctls to dm-mpath. But it can be argued otherwise.
>>>
>>> BTW, the only consumer of it that I'm aware of commented that he
>>> expects dm-mpath to retry SG_IO when dm-mpath retry for SG_IO submission
>>> was attempted (https://www.spinics.net/lists/dm-devel/msg46924.html).
>>>
>>>  From Paolo:
>>> "The problem is that userspace does not have a way to direct the 
>>> command to a different path in the resubmission. It may not even have 
>>> permission to issue DM_TABLE_STATUS, or to access the /dev nodes for 
>>> the underlying paths, so without Martin's patches SG_IO on dm-mpath 
>>> is basically unreliable by design."
>>>
>>> I didn't manage to track down any followup after that email though...
>>>
>> I did; 'twas me who was involved in the initial customer issue leading 
>> up to that.
>>
>> Amongst all the other issue we've found the prime problem with SG_IO 
>> is that it needs to be directed to the 'active' path.
>> For the device-mapper has a distinct callout (dm_prepare_ioctl), which 
>> essentially returns the current active path device. And then the 
>> device-mapper core issues the command on that active path.
>>
>> All nice and good, _unless_ that command triggers an error.
>> Normally it'd be intercepted by the dm-multipath end_io handler, and 
>> would set the path to offline.
>> But as ioctls do not use the normal I/O path the end_io handler is 
>> never called, and further SG_IO calls are happily routed down the 
>> failed path.
>>
>> And the customer had to use SG_IO (or, in qemu-speak, LUN passthrough) 
>> as his application/filesystem makes heavy use of persistent reservations.
> 
> How did this conclude Hannes?

It didn't. The proposed interface got rejected, and now we need to come 
up with an alternative solution.
Which we haven't found yet.

Cheers,

Hannes
Sagi Grimberg July 13, 2022, 1:41 p.m. UTC | #25
>>>>>> Maybe the solution is to just not expose a /dev/ng for the mpath 
>>>>>> device
>>>>>> node, but only for bottom namespaces. Then it would be completely
>>>>>> equivalent to scsi-generic devices.
>>>>>>
>>>>>> It just creates an unexpected mix of semantics of best-effort
>>>>>> multipathing with just path selection, but no requeue/failover...
>>>>>
>>>>> Which is exactly the same semanics as SG_IO on the dm-mpath nodes.
>>>>
>>>> I view uring passthru somewhat as a different thing than sending SG_IO
>>>> ioctls to dm-mpath. But it can be argued otherwise.
>>>>
>>>> BTW, the only consumer of it that I'm aware of commented that he
>>>> expects dm-mpath to retry SG_IO when dm-mpath retry for SG_IO 
>>>> submission
>>>> was attempted (https://www.spinics.net/lists/dm-devel/msg46924.html).
>>>>
>>>>  From Paolo:
>>>> "The problem is that userspace does not have a way to direct the 
>>>> command to a different path in the resubmission. It may not even 
>>>> have permission to issue DM_TABLE_STATUS, or to access the /dev 
>>>> nodes for the underlying paths, so without Martin's patches SG_IO on 
>>>> dm-mpath is basically unreliable by design."
>>>>
>>>> I didn't manage to track down any followup after that email though...
>>>>
>>> I did; 'twas me who was involved in the initial customer issue 
>>> leading up to that.
>>>
>>> Amongst all the other issue we've found the prime problem with SG_IO 
>>> is that it needs to be directed to the 'active' path.
>>> For the device-mapper has a distinct callout (dm_prepare_ioctl), 
>>> which essentially returns the current active path device. And then 
>>> the device-mapper core issues the command on that active path.
>>>
>>> All nice and good, _unless_ that command triggers an error.
>>> Normally it'd be intercepted by the dm-multipath end_io handler, and 
>>> would set the path to offline.
>>> But as ioctls do not use the normal I/O path the end_io handler is 
>>> never called, and further SG_IO calls are happily routed down the 
>>> failed path.
>>>
>>> And the customer had to use SG_IO (or, in qemu-speak, LUN 
>>> passthrough) as his application/filesystem makes heavy use of 
>>> persistent reservations.
>>
>> How did this conclude Hannes?
> 
> It didn't. The proposed interface got rejected, and now we need to come 
> up with an alternative solution.
> Which we haven't found yet.

Let's assume for the sake of discussion that dm-mpath had set a path
offline on ioctl errors; what would qemu do upon this error? Blindly
retry? Until when? Or would qemu need to learn about the path tables in
order to know when there is at least one online path in order to retry?

What is the model that a passthru consumer needs to follow when
operating against an mpath device?
Hannes Reinecke July 13, 2022, 2:07 p.m. UTC | #26
On 7/13/22 15:41, Sagi Grimberg wrote:
> 
>>>>>>> Maybe the solution is to just not expose a /dev/ng for the mpath 
>>>>>>> device
>>>>>>> node, but only for bottom namespaces. Then it would be completely
>>>>>>> equivalent to scsi-generic devices.
>>>>>>>
>>>>>>> It just creates an unexpected mix of semantics of best-effort
>>>>>>> multipathing with just path selection, but no requeue/failover...
>>>>>>
>>>>>> Which is exactly the same semanics as SG_IO on the dm-mpath nodes.
>>>>>
>>>>> I view uring passthru somewhat as a different thing than sending SG_IO
>>>>> ioctls to dm-mpath. But it can be argued otherwise.
>>>>>
>>>>> BTW, the only consumer of it that I'm aware of commented that he
>>>>> expects dm-mpath to retry SG_IO when dm-mpath retry for SG_IO 
>>>>> submission
>>>>> was attempted (https://www.spinics.net/lists/dm-devel/msg46924.html).
>>>>>
>>>>>  From Paolo:
>>>>> "The problem is that userspace does not have a way to direct the 
>>>>> command to a different path in the resubmission. It may not even 
>>>>> have permission to issue DM_TABLE_STATUS, or to access the /dev 
>>>>> nodes for the underlying paths, so without Martin's patches SG_IO 
>>>>> on dm-mpath is basically unreliable by design."
>>>>>
>>>>> I didn't manage to track down any followup after that email though...
>>>>>
>>>> I did; 'twas me who was involved in the initial customer issue 
>>>> leading up to that.
>>>>
>>>> Amongst all the other issue we've found the prime problem with SG_IO 
>>>> is that it needs to be directed to the 'active' path.
>>>> For the device-mapper has a distinct callout (dm_prepare_ioctl), 
>>>> which essentially returns the current active path device. And then 
>>>> the device-mapper core issues the command on that active path.
>>>>
>>>> All nice and good, _unless_ that command triggers an error.
>>>> Normally it'd be intercepted by the dm-multipath end_io handler, and 
>>>> would set the path to offline.
>>>> But as ioctls do not use the normal I/O path the end_io handler is 
>>>> never called, and further SG_IO calls are happily routed down the 
>>>> failed path.
>>>>
>>>> And the customer had to use SG_IO (or, in qemu-speak, LUN 
>>>> passthrough) as his application/filesystem makes heavy use of 
>>>> persistent reservations.
>>>
>>> How did this conclude Hannes?
>>
>> It didn't. The proposed interface got rejected, and now we need to 
>> come up with an alternative solution.
>> Which we haven't found yet.
> 
> Lets assume for the sake of discussion, had dm-mpath set a path to be
> offline on ioctl errors, what would qemu do upon this error? blindly
> retry? Until When? Or would qemu need to learn about the path tables in
> order to know when there is at least one online path in order to retry?
> 
IIRC that was one of the points why it got rejected.
Ideally we would return an errno indicating that the path had failed,
but that further paths are available, so a retry is in order.
Once no paths are available, qemu would get a different error
indicating that all paths have failed.

But we would be overloading existing error numbers with a new meaning, or
even inventing our own error numbers, which makes it rather awkward to use.

Ideally we would be able to return this as the SG_IO status, as that is
well capable of expressing these situations. But then we would need to
parse and/or return the error ourselves, essentially moving sg_io
functionality into dm-mpath. Also not what one wants.

> What is the model that a passthru consumer needs to follow when
> operating against a mpath device?

The model really is that the passthru consumer needs to deal with these errors:
- No error (obviously)
- I/O error (error status will not change with a retry)
- Temporary/path-related error (error status might change with a retry)

Then the consumer can decide whether to invoke a retry (for the last
class), or whether it should pass up that error, as there may be
applications which need a quick response time and can handle temporary
failures (or, in fact, want to be informed about temporary failures).

I.e. the 'DNR' bit should serve nicely here, keeping in mind that we might
need to 'fake' an NVMe error status if the connection is severed.
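
For illustration, the userspace side of that model could look roughly like
the sketch below. It assumes (as in the current uring passthru completion
path) that cqe->res carries the NVMe status code on completion, or a
negative errno if the command never made it to a device; the DNR value
mirrors the kernel's NVME_SC_DNR definition.

/* Rough sketch only; policy for negative errnos is up to the application. */
#define NVME_SC_DNR	0x4000	/* Do Not Retry bit in the NVMe status field */

enum io_outcome {
	IO_OK,		/* no error */
	IO_FATAL,	/* error status will not change with a retry */
	IO_RETRYABLE,	/* temporary/path related, a retry may succeed */
};

static enum io_outcome classify_passthru_cqe(int cqe_res)
{
	if (cqe_res == 0)
		return IO_OK;
	if (cqe_res < 0)
		/* kernel-level error, e.g. -EIO when no usable path was found;
		 * treated as possibly transient here */
		return IO_RETRYABLE;
	if (cqe_res & NVME_SC_DNR)
		return IO_FATAL;
	return IO_RETRYABLE;
}

A consumer would then loop on IO_RETRYABLE with some bound, ideally gated on
the path-up uevent mentioned earlier in the thread.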

Cheers,

Hannes
Sagi Grimberg July 13, 2022, 3:59 p.m. UTC | #27
>>>>> Amongst all the other issue we've found the prime problem with 
>>>>> SG_IO is that it needs to be directed to the 'active' path.
>>>>> For the device-mapper has a distinct callout (dm_prepare_ioctl), 
>>>>> which essentially returns the current active path device. And then 
>>>>> the device-mapper core issues the command on that active path.
>>>>>
>>>>> All nice and good, _unless_ that command triggers an error.
>>>>> Normally it'd be intercepted by the dm-multipath end_io handler, 
>>>>> and would set the path to offline.
>>>>> But as ioctls do not use the normal I/O path the end_io handler is 
>>>>> never called, and further SG_IO calls are happily routed down the 
>>>>> failed path.
>>>>>
>>>>> And the customer had to use SG_IO (or, in qemu-speak, LUN 
>>>>> passthrough) as his application/filesystem makes heavy use of 
>>>>> persistent reservations.
>>>>
>>>> How did this conclude Hannes?
>>>
>>> It didn't. The proposed interface got rejected, and now we need to 
>>> come up with an alternative solution.
>>> Which we haven't found yet.
>>
>> Lets assume for the sake of discussion, had dm-mpath set a path to be
>> offline on ioctl errors, what would qemu do upon this error? blindly
>> retry? Until When? Or would qemu need to learn about the path tables in
>> order to know when there is at least one online path in order to retry?
>>
> IIRC that was one of the points why it got rejected.
> Ideally we would return an errno indicating that the path had failed, 
> but further paths are available, so a retry is in order.
> Once no paths are available qemu would be getting a different error 
> indicating that all paths are failed.

There is no such no-paths-available error.

> 
> But we would be overloading a new meaning to existing error numbers, or 
> even inventing our own error numbers. Which makes it rather awkward to use.

I agree that this sounds awkward.

> Ideally we would be able to return this as the SG_IO status, as that is 
> well capable of expressing these situations. But then we would need to 
> parse and/or return the error ourselves, essentially moving sg_io 
> funtionality into dm-mpath. Also not what one wants.

uring actually should send back the cqe for passthru, but there is no
concept like "Path error, but no paths are available".

> 
>> What is the model that a passthru consumer needs to follow when
>> operating against a mpath device?
> 
> The model really is that passthru consumer needs to deal with these errors:
> - No error (obviously)
> - I/O error (error status will not change with a retry)
> - Temporary/path related error (error status might change with a retry)
> 
> Then the consumer can decide whether to invoke a retry (for the last 
> class), or whether it should pass up that error, as maybe there are 
> applications with need a quick response time and can handle temporary 
> failures (or, in fact, want to be informed about temporary failures).
> 
> IE the 'DNR' bit should serve nicely here, keeping in mind that we might 
> need to 'fake' an NVMe error status if the connection is severed.

uring passthru sends the cqe status to userspace IIRC. But nothing in
there indicates path availability. That would be something that
userspace would need to reconcile on its own by traversing sysfs or
the like...
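
To make that concrete, the reconnaissance would look something like the
sketch below, assuming the usual path-device naming (head nvme0n1, paths
nvme0c<ctrl>n1) and the per-path ana_state sysfs attribute; controller
state, error handling and races with the driver are all glossed over,
which is exactly the problem:

#include <glob.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* crude check: does head nvme<subsys>n<nsid> currently have a usable path? */
static bool head_has_usable_path(int subsys, int nsid)
{
	char pattern[128], state[32];
	bool usable = false;
	glob_t g;

	snprintf(pattern, sizeof(pattern),
		 "/sys/block/nvme%dc*n%d/ana_state", subsys, nsid);
	if (glob(pattern, 0, NULL, &g))
		return false;

	for (size_t i = 0; i < g.gl_pathc && !usable; i++) {
		FILE *f = fopen(g.gl_pathv[i], "r");

		if (f && fgets(state, sizeof(state), f) &&
		    (!strncmp(state, "optimized", 9) ||
		     !strncmp(state, "non-optimized", 13)))
			usable = true;
		if (f)
			fclose(f);
	}
	globfree(&g);
	return usable;
}

And even then the answer can be stale by the time the command is actually
resubmitted.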
Ming Lei July 14, 2022, 3:14 p.m. UTC | #28
On Wed, Jul 13, 2022 at 11:07:57AM +0530, Kanchan Joshi wrote:
> > > > > > The way I would do this that in nvme_ioucmd_failover_req (or in the
> > > > > > retry driven from command retriable failure) I would do the above,
> > > > > > requeue it and kick the requeue work, to go over the requeue_list and
> > > > > > just execute them again. Not sure why you even need an explicit retry
> > > > > > code.
> > > > > During retry we need passthrough command. But passthrough command is not
> > > > > stable (i.e. valid only during first submission). We can make it stable
> > > > > either by:
> > > > > (a) allocating in nvme (b) return -EAGAIN to io_uring, and
> > > > > it will do allocate + deferral
> > > > > Both add a cost. And since any command can potentially fail, that
> > > > > means taking that cost for every IO that we issue on mpath node. Even if
> > > > > no failure (initial or subsquent after IO) occcured.
> > > > 
> > > > As mentioned, I think that if a driver consumes a command as queued,
> > > > it needs a stable copy for a later reformation of the request for
> > > > failover purposes.
> > > 
> > > So what do you propose to make that stable?
> > > As I mentioned earlier, stable copy requires allocating/copying in fast
> > > path. And for a condition (failover) that may not even occur.
> > > I really think currrent solution is much better as it does not try to make
> > > it stable. Rather it assembles pieces of passthrough command if retry
> > > (which is rare) happens.
> > 
> > Well, I can understand that io_uring_cmd is space constrained, otherwise
> > we wouldn't be having this discussion.
> 
> Indeed. If we had space for keeping passthrough command stable for
> retry, that would really have simplified the plumbing. Retry logic would
> be same as first submission.
> 
> > However io_kiocb is less
> > constrained, and could be used as a context to hold such a space.
> > 
> > Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
> > can still hold a driver specific space paired with a helper to obtain it
> > (i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
> > space is pre-allocated it is only a small memory copy for a stable copy
> > that would allow a saner failover design.
> 
> I am thinking along the same lines, but it's not about few bytes of
> space rather we need 80 (72 to be precise). Will think more, but
> these 72 bytes really stand tall in front of my optimism.
> 
> Do you see anything is possible in nvme-side?
> Now also passthrough command (although in a modified form) gets copied
> into this preallocated space i.e. nvme_req(req)->cmd. This part -

I understand it can't be allocated in the nvme request, which is freed
during retry, and it looks like the extra space has to be bound to the
io_uring_cmd.


thanks, 
Ming
Kanchan Joshi July 14, 2022, 11:05 p.m. UTC | #29
On Thu, Jul 14, 2022 at 11:14:32PM +0800, Ming Lei wrote:
>On Wed, Jul 13, 2022 at 11:07:57AM +0530, Kanchan Joshi wrote:
>> > > > > > The way I would do this that in nvme_ioucmd_failover_req (or in the
>> > > > > > retry driven from command retriable failure) I would do the above,
>> > > > > > requeue it and kick the requeue work, to go over the requeue_list and
>> > > > > > just execute them again. Not sure why you even need an explicit retry
>> > > > > > code.
>> > > > > During retry we need passthrough command. But passthrough command is not
>> > > > > stable (i.e. valid only during first submission). We can make it stable
>> > > > > either by:
>> > > > > (a) allocating in nvme (b) return -EAGAIN to io_uring, and
>> > > > > it will do allocate + deferral
>> > > > > Both add a cost. And since any command can potentially fail, that
>> > > > > means taking that cost for every IO that we issue on mpath node. Even if
>> > > > > no failure (initial or subsquent after IO) occcured.
>> > > >
>> > > > As mentioned, I think that if a driver consumes a command as queued,
>> > > > it needs a stable copy for a later reformation of the request for
>> > > > failover purposes.
>> > >
>> > > So what do you propose to make that stable?
>> > > As I mentioned earlier, stable copy requires allocating/copying in fast
>> > > path. And for a condition (failover) that may not even occur.
>> > > I really think currrent solution is much better as it does not try to make
>> > > it stable. Rather it assembles pieces of passthrough command if retry
>> > > (which is rare) happens.
>> >
>> > Well, I can understand that io_uring_cmd is space constrained, otherwise
>> > we wouldn't be having this discussion.
>>
>> Indeed. If we had space for keeping passthrough command stable for
>> retry, that would really have simplified the plumbing. Retry logic would
>> be same as first submission.
>>
>> > However io_kiocb is less
>> > constrained, and could be used as a context to hold such a space.
>> >
>> > Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
>> > can still hold a driver specific space paired with a helper to obtain it
>> > (i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
>> > space is pre-allocated it is only a small memory copy for a stable copy
>> > that would allow a saner failover design.
>>
>> I am thinking along the same lines, but it's not about few bytes of
>> space rather we need 80 (72 to be precise). Will think more, but
>> these 72 bytes really stand tall in front of my optimism.
>>
>> Do you see anything is possible in nvme-side?
>> Now also passthrough command (although in a modified form) gets copied
>> into this preallocated space i.e. nvme_req(req)->cmd. This part -
>
>I understand it can't be allocated in nvme request which is freed
>during retry,

Why not? Yes, it gets freed, but we have control over when it gets freed,
and we can do whatever needs to be done before freeing it. Please see
below as well.

>and looks the extra space has to be bound with
>io_uring_cmd.

If the extra space is bound to io_uring_cmd, it helps to reduce the code
(and just that; I don't see that efficiency will improve - rather it
will be a tad less because of one more 72-byte copy operation in the fast path).

The alternative is to use the space that is bound to the request, i.e.
nvme_req(req)->cmd. This is also preallocated, and the passthru-cmd
already gets copied there. But it holds only ~80% of the original command.
The remaining 20% is a few fields (data/meta buffer address, respective
lengths) which are not maintained there (as the bio/request can give those).
During retry, we take the needed pieces out of nvme_req(req)->cmd and then
free the request. Please see nvme_uring_cmd_io_retry in the patch. And here
is the fragment for a quick glance -

+       memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
+       d.metadata = (__u64)pdu->meta_buffer;
+       d.metadata_len = pdu->meta_len;
+       d.timeout_ms = oreq->timeout;
+       d.addr = (__u64)ioucmd->cmd;
+       if (obio) {
+               d.data_len = obio->bi_iter.bi_size;
+               blk_rq_unmap_user(obio);
+       } else {
+               d.data_len = 0;
+       }
+       blk_mq_free_request(oreq);

Do you see any chinks in the above?
Ming Lei July 15, 2022, 1:35 a.m. UTC | #30
On Fri, Jul 15, 2022 at 04:35:23AM +0530, Kanchan Joshi wrote:
> On Thu, Jul 14, 2022 at 11:14:32PM +0800, Ming Lei wrote:
> > On Wed, Jul 13, 2022 at 11:07:57AM +0530, Kanchan Joshi wrote:
> > > > > > > > The way I would do this that in nvme_ioucmd_failover_req (or in the
> > > > > > > > retry driven from command retriable failure) I would do the above,
> > > > > > > > requeue it and kick the requeue work, to go over the requeue_list and
> > > > > > > > just execute them again. Not sure why you even need an explicit retry
> > > > > > > > code.
> > > > > > > During retry we need passthrough command. But passthrough command is not
> > > > > > > stable (i.e. valid only during first submission). We can make it stable
> > > > > > > either by:
> > > > > > > (a) allocating in nvme (b) return -EAGAIN to io_uring, and
> > > > > > > it will do allocate + deferral
> > > > > > > Both add a cost. And since any command can potentially fail, that
> > > > > > > means taking that cost for every IO that we issue on mpath node. Even if
> > > > > > > no failure (initial or subsquent after IO) occcured.
> > > > > >
> > > > > > As mentioned, I think that if a driver consumes a command as queued,
> > > > > > it needs a stable copy for a later reformation of the request for
> > > > > > failover purposes.
> > > > >
> > > > > So what do you propose to make that stable?
> > > > > As I mentioned earlier, stable copy requires allocating/copying in fast
> > > > > path. And for a condition (failover) that may not even occur.
> > > > > I really think currrent solution is much better as it does not try to make
> > > > > it stable. Rather it assembles pieces of passthrough command if retry
> > > > > (which is rare) happens.
> > > >
> > > > Well, I can understand that io_uring_cmd is space constrained, otherwise
> > > > we wouldn't be having this discussion.
> > > 
> > > Indeed. If we had space for keeping passthrough command stable for
> > > retry, that would really have simplified the plumbing. Retry logic would
> > > be same as first submission.
> > > 
> > > > However io_kiocb is less
> > > > constrained, and could be used as a context to hold such a space.
> > > >
> > > > Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
> > > > can still hold a driver specific space paired with a helper to obtain it
> > > > (i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
> > > > space is pre-allocated it is only a small memory copy for a stable copy
> > > > that would allow a saner failover design.
> > > 
> > > I am thinking along the same lines, but it's not about few bytes of
> > > space rather we need 80 (72 to be precise). Will think more, but
> > > these 72 bytes really stand tall in front of my optimism.
> > > 
> > > Do you see anything is possible in nvme-side?
> > > Now also passthrough command (although in a modified form) gets copied
> > > into this preallocated space i.e. nvme_req(req)->cmd. This part -
> > 
> > I understand it can't be allocated in nvme request which is freed
> > during retry,
> 
> Why not. Yes it gets freed, but we have control over when it gets freed
> and we can do if anything needs to be done before freeing it. Please see
> below as well.

This way requires you to hold the old request until a new path is
found, and it is fragile.

What if there isn't any path available and the controller then tries to
reset the path? If the requeue list or the io_uring_cmd holds the old
request, it might cause error recovery to hang or make the error-handler
code more complicated.

> 
> > and looks the extra space has to be bound with
> > io_uring_cmd.
> 
> if extra space is bound with io_uring_cmd, it helps to reduce the code
> (and just that. I don't see that efficiency will improve - rather it

Does retry have to be efficient?

> will be tad bit less because of one more 72 byte copy opeation in fast-path).

Allocating one buffer and binding it with io_uring_cmd in case of retry is
actually similar to the current model: retry is triggered by the FS bio, and
the allocated buffer can play a similar role to the FS bio.



Thanks, 
Ming
Ming Lei July 15, 2022, 1:46 a.m. UTC | #31
On Fri, Jul 15, 2022 at 09:35:38AM +0800, Ming Lei wrote:
> On Fri, Jul 15, 2022 at 04:35:23AM +0530, Kanchan Joshi wrote:
> > On Thu, Jul 14, 2022 at 11:14:32PM +0800, Ming Lei wrote:
> > > On Wed, Jul 13, 2022 at 11:07:57AM +0530, Kanchan Joshi wrote:
> > > > > > > > > The way I would do this that in nvme_ioucmd_failover_req (or in the
> > > > > > > > > retry driven from command retriable failure) I would do the above,
> > > > > > > > > requeue it and kick the requeue work, to go over the requeue_list and
> > > > > > > > > just execute them again. Not sure why you even need an explicit retry
> > > > > > > > > code.
> > > > > > > > During retry we need passthrough command. But passthrough command is not
> > > > > > > > stable (i.e. valid only during first submission). We can make it stable
> > > > > > > > either by:
> > > > > > > > (a) allocating in nvme (b) return -EAGAIN to io_uring, and
> > > > > > > > it will do allocate + deferral
> > > > > > > > Both add a cost. And since any command can potentially fail, that
> > > > > > > > means taking that cost for every IO that we issue on mpath node. Even if
> > > > > > > > no failure (initial or subsquent after IO) occcured.
> > > > > > >
> > > > > > > As mentioned, I think that if a driver consumes a command as queued,
> > > > > > > it needs a stable copy for a later reformation of the request for
> > > > > > > failover purposes.
> > > > > >
> > > > > > So what do you propose to make that stable?
> > > > > > As I mentioned earlier, stable copy requires allocating/copying in fast
> > > > > > path. And for a condition (failover) that may not even occur.
> > > > > > I really think currrent solution is much better as it does not try to make
> > > > > > it stable. Rather it assembles pieces of passthrough command if retry
> > > > > > (which is rare) happens.
> > > > >
> > > > > Well, I can understand that io_uring_cmd is space constrained, otherwise
> > > > > we wouldn't be having this discussion.
> > > > 
> > > > Indeed. If we had space for keeping passthrough command stable for
> > > > retry, that would really have simplified the plumbing. Retry logic would
> > > > be same as first submission.
> > > > 
> > > > > However io_kiocb is less
> > > > > constrained, and could be used as a context to hold such a space.
> > > > >
> > > > > Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
> > > > > can still hold a driver specific space paired with a helper to obtain it
> > > > > (i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
> > > > > space is pre-allocated it is only a small memory copy for a stable copy
> > > > > that would allow a saner failover design.
> > > > 
> > > > I am thinking along the same lines, but it's not about few bytes of
> > > > space rather we need 80 (72 to be precise). Will think more, but
> > > > these 72 bytes really stand tall in front of my optimism.
> > > > 
> > > > Do you see anything is possible in nvme-side?
> > > > Now also passthrough command (although in a modified form) gets copied
> > > > into this preallocated space i.e. nvme_req(req)->cmd. This part -
> > > 
> > > I understand it can't be allocated in nvme request which is freed
> > > during retry,
> > 
> > Why not. Yes it gets freed, but we have control over when it gets freed
> > and we can do if anything needs to be done before freeing it. Please see
> > below as well.
> 
> This way requires you to hold the old request until one new path is
> found, and it is fragile.
> 
> What if there isn't any path available then controller tries to
> reset the path? If the requeue or io_uring_cmd holds the old request,
> it might cause error recovery hang or make error handler code more
> complicated.
> 
> > 
> > > and looks the extra space has to be bound with
> > > io_uring_cmd.
> > 
> > if extra space is bound with io_uring_cmd, it helps to reduce the code
> > (and just that. I don't see that efficiency will improve - rather it
> 
> Does retry have to be efficient?
> 
> > will be tad bit less because of one more 72 byte copy opeation in fast-path).
> 
> Allocating one buffer and bind it with io_uring_cmd in case of retry is actually
> similar with current model, retry is triggered by FS bio, and the allocated
> buffer can play similar role with FS bio.

Oops, I just realized that the SQE data is only valid during submission, so
the buffer has to be allocated during the 1st submission. But the allocation
for each io_uring_cmd shouldn't cost much if it comes from a dedicated slab
cache, especially since the inline data size of io_uring_cmd is already
56 (24 + 32) bytes.
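
Something along these lines could work - a rough sketch only; the cache and
helper names below are made up, and where/when the copy gets freed is left
out:

/* one-time setup, e.g. at module init (hypothetical) */
static struct kmem_cache *nvme_uring_cmd_cache;

static int nvme_uring_cmd_cache_init(void)
{
	nvme_uring_cmd_cache = kmem_cache_create("nvme_uring_cmd",
				sizeof(struct nvme_uring_cmd), 0,
				SLAB_HWCACHE_ALIGN, NULL);
	return nvme_uring_cmd_cache ? 0 : -ENOMEM;
}

/*
 * Called on first submission, while the SQE (and hence ioucmd->cmd) is
 * still valid; the copy stays stable across requeue/failover and would
 * be freed when the command finally completes.
 */
static int nvme_uring_cmd_make_stable(struct io_uring_cmd *ioucmd)
{
	struct nvme_uring_cmd *copy;

	copy = kmem_cache_alloc(nvme_uring_cmd_cache, GFP_KERNEL);
	if (!copy)
		return -ENOMEM;
	memcpy(copy, ioucmd->cmd, sizeof(*copy));
	ioucmd->cmd = copy;
	return 0;
}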


thanks,
Ming
Kanchan Joshi July 15, 2022, 4:24 a.m. UTC | #32
On Fri, Jul 15, 2022 at 09:46:16AM +0800, Ming Lei wrote:
>On Fri, Jul 15, 2022 at 09:35:38AM +0800, Ming Lei wrote:
>> On Fri, Jul 15, 2022 at 04:35:23AM +0530, Kanchan Joshi wrote:
>> > On Thu, Jul 14, 2022 at 11:14:32PM +0800, Ming Lei wrote:
>> > > On Wed, Jul 13, 2022 at 11:07:57AM +0530, Kanchan Joshi wrote:
>> > > > > > > > > The way I would do this that in nvme_ioucmd_failover_req (or in the
>> > > > > > > > > retry driven from command retriable failure) I would do the above,
>> > > > > > > > > requeue it and kick the requeue work, to go over the requeue_list and
>> > > > > > > > > just execute them again. Not sure why you even need an explicit retry
>> > > > > > > > > code.
>> > > > > > > > During retry we need passthrough command. But passthrough command is not
>> > > > > > > > stable (i.e. valid only during first submission). We can make it stable
>> > > > > > > > either by:
>> > > > > > > > (a) allocating in nvme (b) return -EAGAIN to io_uring, and
>> > > > > > > > it will do allocate + deferral
>> > > > > > > > Both add a cost. And since any command can potentially fail, that
>> > > > > > > > means taking that cost for every IO that we issue on mpath node. Even if
>> > > > > > > > no failure (initial or subsquent after IO) occcured.
>> > > > > > >
>> > > > > > > As mentioned, I think that if a driver consumes a command as queued,
>> > > > > > > it needs a stable copy for a later reformation of the request for
>> > > > > > > failover purposes.
>> > > > > >
>> > > > > > So what do you propose to make that stable?
>> > > > > > As I mentioned earlier, stable copy requires allocating/copying in fast
>> > > > > > path. And for a condition (failover) that may not even occur.
>> > > > > > I really think currrent solution is much better as it does not try to make
>> > > > > > it stable. Rather it assembles pieces of passthrough command if retry
>> > > > > > (which is rare) happens.
>> > > > >
>> > > > > Well, I can understand that io_uring_cmd is space constrained, otherwise
>> > > > > we wouldn't be having this discussion.
>> > > >
>> > > > Indeed. If we had space for keeping passthrough command stable for
>> > > > retry, that would really have simplified the plumbing. Retry logic would
>> > > > be same as first submission.
>> > > >
>> > > > > However io_kiocb is less
>> > > > > constrained, and could be used as a context to hold such a space.
>> > > > >
>> > > > > Even if it is undesired to have io_kiocb be passed to uring_cmd(), it
>> > > > > can still hold a driver specific space paired with a helper to obtain it
>> > > > > (i.e. something like io_uring_cmd_to_driver_ctx(ioucmd) ). Then if the
>> > > > > space is pre-allocated it is only a small memory copy for a stable copy
>> > > > > that would allow a saner failover design.
>> > > >
>> > > > I am thinking along the same lines, but it's not about few bytes of
>> > > > space rather we need 80 (72 to be precise). Will think more, but
>> > > > these 72 bytes really stand tall in front of my optimism.
>> > > >
>> > > > Do you see anything is possible in nvme-side?
>> > > > Now also passthrough command (although in a modified form) gets copied
>> > > > into this preallocated space i.e. nvme_req(req)->cmd. This part -
>> > >
>> > > I understand it can't be allocated in nvme request which is freed
>> > > during retry,
>> >
>> > Why not. Yes it gets freed, but we have control over when it gets freed
>> > and we can do if anything needs to be done before freeing it. Please see
>> > below as well.
>>
>> This way requires you to hold the old request until one new path is
>> found, and it is fragile.
>>
>> What if there isn't any path available then controller tries to
>> reset the path? If the requeue or io_uring_cmd holds the old request,
>> it might cause error recovery hang or make error handler code more
>> complicated.

No, no - this code does not hold the old request until a path is found.
It is not tied to path discovery at all.
The old request is freed much earlier, just after the pieces of the
passthrough command are extracted from it (i.e. from nvme_req(req)->cmd).
It seems you are yet to look at the snippet I shared.

>> >
>> > > and looks the extra space has to be bound with
>> > > io_uring_cmd.
>> >
>> > if extra space is bound with io_uring_cmd, it helps to reduce the code
>> > (and just that. I don't see that efficiency will improve - rather it
>>
>> Does retry have to be efficient?

I also do not care about that. But as mentioned earlier, this is a per-IO
cost, not a failure-only cost. The lifetime of the original passthrough
command is the first submission. So if we take this route, we need to do an
extra copy, or allocate+copy (which is trivial by just returning -EAGAIN to
io_uring), for each IO that is issued on the mpath node.

>> > will be tad bit less because of one more 72 byte copy opeation in fast-path).
>>
>> Allocating one buffer and bind it with io_uring_cmd in case of retry is actually
>> similar with current model, retry is triggered by FS bio, and the allocated
>> buffer can play similar role with FS bio.
>
>oops, just think of SQE data only valid during submission, so the buffer
>has to be allocated during 1st submission, 

Exactly.

> but the allocation for each io_uring_cmd
>shouldn't cost much by creating one slab cache, especially inline data
>size of io_uring_cmd has been 56(24 + 32) bytes.

io_uring_cmd does not take any extra memory currently; it is part of the
existing (per-op) space of io_kiocb. Is it a good idea to amplify and
decouple that into its own slab cache for every IO (which may not be going
to an mpath node, and may not be failing either)?

I really wonder why the current solution using nvme_req(req)->cmd (which is
also preallocated stable storage) is not much better.
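
For comparison, the io_uring_cmd_to_driver_ctx() idea discussed above would
look roughly like the sketch below - purely illustrative; none of these
symbols exist in the tree, and the placement of the scratch space inside
io_kiocb is an assumption:

/* hypothetical per-command driver scratch area owned by io_uring */
struct io_uring_cmd_driver_ctx {
	__u8	space[72];	/* sized for struct nvme_uring_cmd */
};

static inline void *io_uring_cmd_to_driver_ctx(struct io_uring_cmd *ioucmd)
{
	/* assumes io_kiocb reserves the scratch area right after ioucmd */
	return ioucmd + 1;
}

/* nvme would then snapshot the SQE payload once, in the fast path */
static void nvme_uring_cmd_snapshot(struct io_uring_cmd *ioucmd)
{
	memcpy(io_uring_cmd_to_driver_ctx(ioucmd), ioucmd->cmd,
	       sizeof(struct nvme_uring_cmd));
}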
diff mbox series

Patch

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index fc02eddd4977..281c21d3c67d 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -340,12 +340,6 @@  struct nvme_uring_data {
 	__u32	timeout_ms;
 };
 
-static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
-		struct io_uring_cmd *ioucmd)
-{
-	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
-}
-
 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
 {
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
@@ -448,6 +442,14 @@  static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
 	pdu->meta_len = d.metadata_len;
 
+	if (issue_flags & IO_URING_F_MPATH) {
+		req->cmd_flags |= REQ_NVME_MPATH;
+		/*
+		 * we would need the buffer address (d.addr field) if we have
+		 * to retry the command. Store it by repurposing ioucmd->cmd
+		 */
+		ioucmd->cmd = (void *)d.addr;
+	}
 	blk_execute_rq_nowait(req, false);
 	return -EIOCBQUEUED;
 }
@@ -665,12 +667,135 @@  int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 	int srcu_idx = srcu_read_lock(&head->srcu);
 	struct nvme_ns *ns = nvme_find_path(head);
 	int ret = -EINVAL;
+	struct device *dev = &head->cdev_device;
+
+	if (likely(ns)) {
+		ret = nvme_ns_uring_cmd(ns, ioucmd,
+				issue_flags | IO_URING_F_MPATH);
+	} else if (nvme_available_path(head)) {
+		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+		struct nvme_uring_cmd *payload = NULL;
+
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
+		/*
+		 * We may requeue two kinds of uring commands:
+		 * 1. command failed post submission. pdu->req will be non-null
+		 * for this
+		 * 2. command that could not be submitted because path was not
+		 * available. For this pdu->req is set to NULL.
+		 */
+		pdu->req = NULL;
+		/*
+		 * passthrough command will not be available during retry as it
+		 * is embedded in io_uring's SQE. Hence we allocate/copy here
+		 */
+		payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
+		if (!payload) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
+		ioucmd->cmd = payload;
 
-	if (ns)
-		ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+		spin_lock_irq(&head->requeue_ioucmd_lock);
+		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
+		spin_unlock_irq(&head->requeue_ioucmd_lock);
+		ret = -EIOCBQUEUED;
+	} else {
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
+	}
+out_unlock:
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
+
+int nvme_uring_cmd_io_retry(struct nvme_ns *ns, struct request *oreq,
+		struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
+	struct nvme_uring_data d;
+	struct nvme_command c;
+	struct request *req;
+	struct bio *obio = oreq->bio;
+	void *meta = NULL;
+
+	memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
+	d.metadata = (__u64)pdu->meta_buffer;
+	d.metadata_len = pdu->meta_len;
+	d.timeout_ms = oreq->timeout;
+	d.addr = (__u64)ioucmd->cmd;
+	if (obio) {
+		d.data_len = obio->bi_iter.bi_size;
+		blk_rq_unmap_user(obio);
+	} else {
+		d.data_len = 0;
+	}
+	blk_mq_free_request(oreq);
+	req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
+			d.data_len, nvme_to_user_ptr(d.metadata),
+			d.metadata_len, 0, &meta, d.timeout_ms ?
+			msecs_to_jiffies(d.timeout_ms) : 0,
+			ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->end_io = nvme_uring_cmd_end_io;
+	req->end_io_data = ioucmd;
+	pdu->bio = req->bio;
+	pdu->meta = meta;
+	req->cmd_flags |= REQ_NVME_MPATH;
+	blk_execute_rq_nowait(req, false);
+	return -EIOCBQUEUED;
+}
+
+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
+{
+	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
+	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head,
+			cdev);
+	int srcu_idx = srcu_read_lock(&head->srcu);
+	struct nvme_ns *ns = nvme_find_path(head);
+	unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
+		IO_URING_F_MPATH;
+	struct device *dev = &head->cdev_device;
+
+	if (likely(ns)) {
+		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+		struct request *oreq = pdu->req;
+		int ret;
+
+		if (oreq == NULL) {
+			/*
+			 * this was not submitted (to device) earlier. For this
+			 * ioucmd->cmd points to persistent memory. Free that
+			 * up post submission
+			 */
+			const void *cmd = ioucmd->cmd;
+
+			ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+			kfree(cmd);
+		} else {
+			/*
+			 * this was submitted (to device) earlier. Use old
+			 * request, bio (if it exists) and nvme-pdu to recreate
+			 * the command for the discovered path
+			 */
+			ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);
+		}
+		if (ret != -EIOCBQUEUED)
+			io_uring_cmd_done(ioucmd, ret, 0);
+	} else if (nvme_available_path(head)) {
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
+		spin_lock_irq(&head->requeue_ioucmd_lock);
+		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
+		spin_unlock_irq(&head->requeue_ioucmd_lock);
+	} else {
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
+		io_uring_cmd_done(ioucmd, -EINVAL, 0);
+	}
+	srcu_read_unlock(&head->srcu, srcu_idx);
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f26640ccb955..fe5655d98c36 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -6,6 +6,7 @@ 
 #include <linux/backing-dev.h>
 #include <linux/moduleparam.h>
 #include <linux/vmalloc.h>
+#include <linux/io_uring.h>
 #include <trace/events/block.h>
 #include "nvme.h"
 
@@ -80,6 +81,17 @@  void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 			blk_freeze_queue_start(h->disk->queue);
 }
 
+static void nvme_ioucmd_failover_req(struct request *req, struct nvme_ns *ns)
+{
+	struct io_uring_cmd *ioucmd = req->end_io_data;
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+
+	/* store the request, to reconstruct the command during retry */
+	pdu->req = req;
+	/* retry in original submitter context */
+	io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -99,6 +111,11 @@  void nvme_failover_req(struct request *req)
 		queue_work(nvme_wq, &ns->ctrl->ana_work);
 	}
 
+	if (blk_rq_is_passthrough(req)) {
+		nvme_ioucmd_failover_req(req, ns);
+		return;
+	}
+
 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
 	for (bio = req->bio; bio; bio = bio->bi_next) {
 		bio_set_dev(bio, ns->head->disk->part0);
@@ -314,7 +331,7 @@  inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	return ns;
 }
 
-static bool nvme_available_path(struct nvme_ns_head *head)
+bool nvme_available_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns;
 
@@ -459,7 +476,9 @@  static void nvme_requeue_work(struct work_struct *work)
 	struct nvme_ns_head *head =
 		container_of(work, struct nvme_ns_head, requeue_work);
 	struct bio *bio, *next;
+	struct io_uring_cmd *ioucmd, *ioucmd_next;
 
+	/* process requeued bios*/
 	spin_lock_irq(&head->requeue_lock);
 	next = bio_list_get(&head->requeue_list);
 	spin_unlock_irq(&head->requeue_lock);
@@ -470,6 +489,21 @@  static void nvme_requeue_work(struct work_struct *work)
 
 		submit_bio_noacct(bio);
 	}
+
+	/* process requeued passthrough-commands */
+	spin_lock_irq(&head->requeue_ioucmd_lock);
+	ioucmd_next = ioucmd_list_get(&head->requeue_ioucmd_list);
+	spin_unlock_irq(&head->requeue_ioucmd_lock);
+
+	while ((ioucmd = ioucmd_next) != NULL) {
+		ioucmd_next = ioucmd->next;
+		ioucmd->next = NULL;
+		/*
+		 * Retry in original submitter context as we would be
+		 * processing the passthrough command again
+		 */
+		io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);
+	}
 }
 
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9d3ff6feda06..125b48e74e72 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@ 
 #include <linux/rcupdate.h>
 #include <linux/wait.h>
 #include <linux/t10-pi.h>
+#include <linux/io_uring.h>
 
 #include <trace/events/block.h>
 
@@ -189,6 +190,12 @@  enum {
 	NVME_REQ_USERCMD		= (1 << 1),
 };
 
+static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
+		struct io_uring_cmd *ioucmd)
+{
+	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
 static inline struct nvme_request *nvme_req(struct request *req)
 {
 	return blk_mq_rq_to_pdu(req);
@@ -442,6 +449,9 @@  struct nvme_ns_head {
 	struct work_struct	requeue_work;
 	struct mutex		lock;
 	unsigned long		flags;
+	/* for uring-passthru multipath handling */
+	struct ioucmd_list	requeue_ioucmd_list;
+	spinlock_t		requeue_ioucmd_lock;
 #define NVME_NSHEAD_DISK_LIVE	0
 	struct nvme_ns __rcu	*current_path[];
 #endif
@@ -830,6 +840,7 @@  int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd);
 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 
@@ -844,6 +855,7 @@  static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 	return ctrl->ana_log_buf != NULL;
 }
 
+bool nvme_available_path(struct nvme_ns_head *head);
 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index d734599cbcd7..57f4dfc83316 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -15,6 +15,8 @@  enum io_uring_cmd_flags {
 	IO_URING_F_SQE128		= 4,
 	IO_URING_F_CQE32		= 8,
 	IO_URING_F_IOPOLL		= 16,
+	/* to indicate that it is a MPATH req*/
+	IO_URING_F_MPATH		= 32,
 };
 
 struct io_uring_cmd {