
[05/17] nvme: wire-up support for async-passthru on char-device.

Message ID 20220308152105.309618-6-joshi.k@samsung.com (mailing list archive)
State New, archived
Series: io_uring passthru over nvme

Commit Message

Kanchan Joshi March 8, 2022, 3:20 p.m. UTC
Introduce a handler for fops->async_cmd(), implementing async passthru
on the char device (/dev/ngX). The handler supports NVME_IOCTL_IO64_CMD for
read and write commands, and returns failure for other commands.
This is a low-overhead path for processing the inline commands housed
inside io_uring's sqe: the command is not fetched via copy_from_user,
and the result (inside the passthru command) is not updated via
put_user.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 drivers/nvme/host/core.c      |   1 +
 drivers/nvme/host/ioctl.c     | 205 ++++++++++++++++++++++++++++------
 drivers/nvme/host/multipath.c |   1 +
 drivers/nvme/host/nvme.h      |   3 +
 4 files changed, 178 insertions(+), 32 deletions(-)

Comments

Clay Mayers March 10, 2022, 12:02 a.m. UTC | #1
> From: Linux-nvme <linux-nvme-bounces@lists.infradead.org> On Behalf Of
> Kanchan Joshi
> Sent: Tuesday, March 8, 2022 7:21 AM
> To: axboe@kernel.dk; hch@lst.de; kbusch@kernel.org;
> asml.silence@gmail.com
> Cc: io-uring@vger.kernel.org; linux-nvme@lists.infradead.org; linux-
> block@vger.kernel.org; sbates@raithlin.com; logang@deltatee.com;
> pankydev8@gmail.com; javier@javigon.com; mcgrof@kernel.org;
> a.manzanares@samsung.com; joshiiitr@gmail.com; anuj20.g@samsung.com
> Subject: [PATCH 05/17] nvme: wire-up support for async-passthru on char-
> device.
> 
> Introduce handler for fops->async_cmd(), implementing async passthru
> on char device (/dev/ngX). The handler supports NVME_IOCTL_IO64_CMD
> for
> read and write commands. Returns failure for other commands.
> This is low overhead path for processing the inline commands housed
> inside io_uring's sqe. Neither the commmand is fetched via
> copy_from_user, nor the result (inside passthru command) is updated via
> put_user.
> 
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> ---
>  drivers/nvme/host/core.c      |   1 +
>  drivers/nvme/host/ioctl.c     | 205 ++++++++++++++++++++++++++++------
>  drivers/nvme/host/multipath.c |   1 +
>  drivers/nvme/host/nvme.h      |   3 +
>  4 files changed, 178 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 159944499c4f..3fe8f5901cd9 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3667,6 +3667,7 @@ static const struct file_operations
> nvme_ns_chr_fops = {
>  	.release	= nvme_ns_chr_release,
>  	.unlocked_ioctl	= nvme_ns_chr_ioctl,
>  	.compat_ioctl	= compat_ptr_ioctl,
> +	.async_cmd	= nvme_ns_chr_async_cmd,
>  };
> 
>  static int nvme_add_ns_cdev(struct nvme_ns *ns)
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index 5c9cd9695519..1df270b47af5 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -18,6 +18,76 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
>  		ptrval = (compat_uptr_t)ptrval;
>  	return (void __user *)ptrval;
>  }
> +/*
> + * This overlays struct io_uring_cmd pdu.
> + * Expect build errors if this grows larger than that.
> + */
> +struct nvme_uring_cmd_pdu {
> +	u32 meta_len;
> +	union {
> +		struct bio *bio;
> +		struct request *req;
> +	};
> +	void *meta; /* kernel-resident buffer */
> +	void __user *meta_buffer;
> +} __packed;
> +
> +static struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(struct
> io_uring_cmd *ioucmd)
> +{
> +	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
> +}
> +
> +static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd)
> +{
> +	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +	struct request *req = pdu->req;
> +	int status;
> +	struct bio *bio = req->bio;
> +
> +	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> +		status = -EINTR;
> +	else
> +		status = nvme_req(req)->status;
> +
> +	/* we can free request */
> +	blk_mq_free_request(req);
> +	blk_rq_unmap_user(bio);
> +
> +	if (!status && pdu->meta_buffer) {
> +		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu-
> >meta_len))
> +			status = -EFAULT;
> +	}
> +	kfree(pdu->meta);
> +
> +	io_uring_cmd_done(ioucmd, status);
> +}
> +
> +static void nvme_end_async_pt(struct request *req, blk_status_t err)
> +{
> +	struct io_uring_cmd *ioucmd = req->end_io_data;
> +	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +	/* extract bio before reusing the same field for request */
> +	struct bio *bio = pdu->bio;
> +
> +	pdu->req = req;
> +	req->bio = bio;
> +	/* this takes care of setting up task-work */
> +	io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb);
> +}
> +
> +static void nvme_setup_uring_cmd_data(struct request *rq,
> +		struct io_uring_cmd *ioucmd, void *meta,
> +		void __user *meta_buffer, u32 meta_len)
> +{
> +	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +
> +	/* to free bio on completion, as req->bio will be null at that time */
> +	pdu->bio = rq->bio;
> +	pdu->meta = meta;
> +	pdu->meta_buffer = meta_buffer;
> +	pdu->meta_len = meta_len;
> +	rq->end_io_data = ioucmd;
> +}
> 
>  static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
>  		unsigned len, u32 seed, bool write)
> @@ -56,7 +126,8 @@ static void *nvme_add_user_metadata(struct bio
> *bio, void __user *ubuf,
>  static int nvme_submit_user_cmd(struct request_queue *q,
>  		struct nvme_command *cmd, void __user *ubuffer,
>  		unsigned bufflen, void __user *meta_buffer, unsigned
> meta_len,
> -		u32 meta_seed, u64 *result, unsigned timeout)
> +		u32 meta_seed, u64 *result, unsigned timeout,
> +		struct io_uring_cmd *ioucmd)
>  {
>  	bool write = nvme_is_write(cmd);
>  	struct nvme_ns *ns = q->queuedata;
> @@ -64,9 +135,15 @@ static int nvme_submit_user_cmd(struct
> request_queue *q,
>  	struct request *req;
>  	struct bio *bio = NULL;
>  	void *meta = NULL;
> +	unsigned int rq_flags = 0;
> +	blk_mq_req_flags_t blk_flags = 0;
>  	int ret;
> 
> -	req = nvme_alloc_request(q, cmd, 0, 0);
> +	if (ioucmd && (ioucmd->flags & IO_URING_F_NONBLOCK)) {
> +		rq_flags |= REQ_NOWAIT;
> +		blk_flags |= BLK_MQ_REQ_NOWAIT;
> +	}
> +	req = nvme_alloc_request(q, cmd, blk_flags, rq_flags);
>  	if (IS_ERR(req))
>  		return PTR_ERR(req);
> 
> @@ -92,6 +169,19 @@ static int nvme_submit_user_cmd(struct
> request_queue *q,
>  			req->cmd_flags |= REQ_INTEGRITY;
>  		}
>  	}
> +	if (ioucmd) { /* async dispatch */
> +		if (cmd->common.opcode == nvme_cmd_write ||
> +				cmd->common.opcode == nvme_cmd_read) {
> +			nvme_setup_uring_cmd_data(req, ioucmd, meta,
> meta_buffer,
> +					meta_len);
> +			blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
> +			return 0;
> +		} else {
> +			/* support only read and write for now. */
> +			ret = -EINVAL;
> +			goto out_meta;
> +		}
> +	}
> 
>  	ret = nvme_execute_passthru_rq(req);
>  	if (result)
> @@ -100,6 +190,7 @@ static int nvme_submit_user_cmd(struct
> request_queue *q,
>  		if (copy_to_user(meta_buffer, meta, meta_len))
>  			ret = -EFAULT;
>  	}
> + out_meta:
>  	kfree(meta);
>   out_unmap:
>  	if (bio)
> @@ -170,7 +261,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct
> nvme_user_io __user *uio)
> 
>  	return nvme_submit_user_cmd(ns->queue, &c,
>  			nvme_to_user_ptr(io.addr), length,
> -			metadata, meta_len, lower_32_bits(io.slba), NULL,
> 0);
> +			metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
> +			NULL);
>  }
> 
>  static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
> @@ -224,7 +316,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl,
> struct nvme_ns *ns,
>  	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
>  			nvme_to_user_ptr(cmd.addr), cmd.data_len,
>  			nvme_to_user_ptr(cmd.metadata),
> cmd.metadata_len,
> -			0, &result, timeout);
> +			0, &result, timeout, NULL);
> 
>  	if (status >= 0) {
>  		if (put_user(result, &ucmd->result))
> @@ -235,45 +327,53 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl,
> struct nvme_ns *ns,
>  }
> 
>  static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> -			struct nvme_passthru_cmd64 __user *ucmd)
> +			struct nvme_passthru_cmd64 __user *ucmd,
> +			struct io_uring_cmd *ioucmd)
>  {
> -	struct nvme_passthru_cmd64 cmd;
> +	struct nvme_passthru_cmd64 cmd, *cptr;
>  	struct nvme_command c;
>  	unsigned timeout = 0;
>  	int status;
> 
>  	if (!capable(CAP_SYS_ADMIN))
>  		return -EACCES;
> -	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
> -		return -EFAULT;
> -	if (cmd.flags)
> +	if (!ioucmd) {
> +		if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
> +			return -EFAULT;
> +		cptr = &cmd;
> +	} else {
> +		if (ioucmd->cmd_len != sizeof(struct nvme_passthru_cmd64))
> +			return -EINVAL;
> +		cptr = (struct nvme_passthru_cmd64 *)ioucmd->cmd;
> +	}
> +	if (cptr->flags)
>  		return -EINVAL;
> -	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
> +	if (!nvme_validate_passthru_nsid(ctrl, ns, cptr->nsid))
>  		return -EINVAL;
> 
>  	memset(&c, 0, sizeof(c));
> -	c.common.opcode = cmd.opcode;
> -	c.common.flags = cmd.flags;
> -	c.common.nsid = cpu_to_le32(cmd.nsid);
> -	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
> -	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
> -	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
> -	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
> -	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
> -	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
> -	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
> -	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
> -
> -	if (cmd.timeout_ms)
> -		timeout = msecs_to_jiffies(cmd.timeout_ms);
> +	c.common.opcode = cptr->opcode;
> +	c.common.flags = cptr->flags;
> +	c.common.nsid = cpu_to_le32(cptr->nsid);
> +	c.common.cdw2[0] = cpu_to_le32(cptr->cdw2);
> +	c.common.cdw2[1] = cpu_to_le32(cptr->cdw3);
> +	c.common.cdw10 = cpu_to_le32(cptr->cdw10);
> +	c.common.cdw11 = cpu_to_le32(cptr->cdw11);
> +	c.common.cdw12 = cpu_to_le32(cptr->cdw12);
> +	c.common.cdw13 = cpu_to_le32(cptr->cdw13);
> +	c.common.cdw14 = cpu_to_le32(cptr->cdw14);
> +	c.common.cdw15 = cpu_to_le32(cptr->cdw15);
> +
> +	if (cptr->timeout_ms)
> +		timeout = msecs_to_jiffies(cptr->timeout_ms);
> 
>  	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
> -			nvme_to_user_ptr(cmd.addr), cmd.data_len,
> -			nvme_to_user_ptr(cmd.metadata),
> cmd.metadata_len,
> -			0, &cmd.result, timeout);
> +			nvme_to_user_ptr(cptr->addr), cptr->data_len,
> +			nvme_to_user_ptr(cptr->metadata), cptr-
> >metadata_len,
> +			0, &cptr->result, timeout, ioucmd);
> 
> -	if (status >= 0) {
> -		if (put_user(cmd.result, &ucmd->result))
> +	if (!ioucmd && status >= 0) {
> +		if (put_user(cptr->result, &ucmd->result))
>  			return -EFAULT;
>  	}
> 
> @@ -296,7 +396,7 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl,
> unsigned int cmd,
>  	case NVME_IOCTL_ADMIN_CMD:
>  		return nvme_user_cmd(ctrl, NULL, argp);
>  	case NVME_IOCTL_ADMIN64_CMD:
> -		return nvme_user_cmd64(ctrl, NULL, argp);
> +		return nvme_user_cmd64(ctrl, NULL, argp, NULL);
>  	default:
>  		return sed_ioctl(ctrl->opal_dev, cmd, argp);
>  	}
> @@ -340,7 +440,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns,
> unsigned int cmd,
>  	case NVME_IOCTL_SUBMIT_IO:
>  		return nvme_submit_io(ns, argp);
>  	case NVME_IOCTL_IO64_CMD:
> -		return nvme_user_cmd64(ns->ctrl, ns, argp);
> +		return nvme_user_cmd64(ns->ctrl, ns, argp, NULL);
>  	default:
>  		return -ENOTTY;
>  	}
> @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int
> cmd, unsigned long arg)
>  	return __nvme_ioctl(ns, cmd, (void __user *)arg);
>  }
> 
> +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd
> *ioucmd)
> +{
> +	int ret;
> +
> +	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) >
> sizeof(ioucmd->pdu));
> +
> +	switch (ioucmd->cmd_op) {
> +	case NVME_IOCTL_IO64_CMD:
> +		ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
> +		break;
> +	default:
> +		ret = -ENOTTY;
> +	}
> +
> +	if (ret >= 0)
> +		ret = -EIOCBQUEUED;
> +	return ret;
> +}

ret can equal -EAGAIN, which will cause io_uring to reissue the cmd
from a worker thread.  This can happen when ioucmd->flags has
IO_URING_F_NONBLOCK set, causing nvme_alloc_request() to return
-EAGAIN when there are no tags available.

Either -EAGAIN needs to be remapped, or REQ_F_NOWAIT needs to be force-set
in the io_uring cmd request in patch 3 (the 2nd option is untested).
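
A minimal sketch of the first option, assuming the remap is done in
nvme_ns_async_ioctl() just before the return value is handed back to
io_uring; the choice of -EBUSY as the substitute errno is purely
illustrative:

	ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
	/* hypothetical remap: keep io_uring from punting -EAGAIN to a worker */
	if (ret == -EAGAIN || ret == -EWOULDBLOCK)
		ret = -EBUSY;
	if (ret >= 0)
		ret = -EIOCBQUEUED;
	return ret;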

> +
> +int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd)
> +{
> +	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
> +			struct nvme_ns, cdev);
> +
> +	return nvme_ns_async_ioctl(ns, ioucmd);
> +}
> +
>  #ifdef CONFIG_NVME_MULTIPATH
>  static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
>  		void __user *argp, struct nvme_ns_head *head, int srcu_idx)
> @@ -412,6 +539,20 @@ int nvme_ns_head_ioctl(struct block_device *bdev,
> fmode_t mode,
>  	return ret;
>  }
> 
> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
> +{
> +	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
> +	struct nvme_ns_head *head = container_of(cdev, struct
> nvme_ns_head, cdev);
> +	int srcu_idx = srcu_read_lock(&head->srcu);
> +	struct nvme_ns *ns = nvme_find_path(head);
> +	int ret = -EWOULDBLOCK;

-EWOULDBLOCK has the same value as -EAGAIN, so the same issue
is here as with nvme_ns_async_ioctl() returning it.

> +
> +	if (ns)
> +		ret = nvme_ns_async_ioctl(ns, ioucmd);
> +	srcu_read_unlock(&head->srcu, srcu_idx);
> +	return ret;
> +}
> +
>  long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
>  		unsigned long arg)
>  {
> @@ -480,7 +621,7 @@ long nvme_dev_ioctl(struct file *file, unsigned int
> cmd,
>  	case NVME_IOCTL_ADMIN_CMD:
>  		return nvme_user_cmd(ctrl, NULL, argp);
>  	case NVME_IOCTL_ADMIN64_CMD:
> -		return nvme_user_cmd64(ctrl, NULL, argp);
> +		return nvme_user_cmd64(ctrl, NULL, argp, NULL);
>  	case NVME_IOCTL_IO_CMD:
>  		return nvme_dev_user_cmd(ctrl, argp);
>  	case NVME_IOCTL_RESET:
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index f8bf6606eb2f..1d798d09456f 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -459,6 +459,7 @@ static const struct file_operations
> nvme_ns_head_chr_fops = {
>  	.release	= nvme_ns_head_chr_release,
>  	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
>  	.compat_ioctl	= compat_ptr_ioctl,
> +	.async_cmd	= nvme_ns_head_chr_async_cmd,
>  };
> 
>  static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index b32f4e2c68fd..e6a30543d7c8 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -16,6 +16,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/wait.h>
>  #include <linux/t10-pi.h>
> +#include <linux/io_uring.h>
> 
>  #include <trace/events/block.h>
> 
> @@ -752,6 +753,8 @@ long nvme_ns_head_chr_ioctl(struct file *file,
> unsigned int cmd,
>  		unsigned long arg);
>  long nvme_dev_ioctl(struct file *file, unsigned int cmd,
>  		unsigned long arg);
> +int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd);
> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd);
>  int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
> 
>  extern const struct attribute_group *nvme_ns_id_attr_groups[];
> --
> 2.25.1

On 5.10 with our version of this patch, I've seen that returning -EAGAIN to
io_uring results in poisoned bios and crashed kernel threads (NULL current->mm)
while constructing the async passthrough request.  I looked at
git://git.kernel.dk/linux-block and git://git.infradead.org/nvme.git
and as best as I can tell, the same thing will happen.
Kanchan Joshi March 10, 2022, 8:32 a.m. UTC | #2
On Thu, Mar 10, 2022 at 5:32 AM Clay Mayers <Clay.Mayers@kioxia.com> wrote:
>
> > From: Linux-nvme <linux-nvme-bounces@lists.infradead.org> On Behalf Of
> > Kanchan Joshi
> > Sent: Tuesday, March 8, 2022 7:21 AM
> > To: axboe@kernel.dk; hch@lst.de; kbusch@kernel.org;
> > asml.silence@gmail.com
> > Cc: io-uring@vger.kernel.org; linux-nvme@lists.infradead.org; linux-
> > block@vger.kernel.org; sbates@raithlin.com; logang@deltatee.com;
> > pankydev8@gmail.com; javier@javigon.com; mcgrof@kernel.org;
> > a.manzanares@samsung.com; joshiiitr@gmail.com; anuj20.g@samsung.com
> > Subject: [PATCH 05/17] nvme: wire-up support for async-passthru on char-
> > device.
> >
> > Introduce handler for fops->async_cmd(), implementing async passthru
> > on char device (/dev/ngX). The handler supports NVME_IOCTL_IO64_CMD
> > for
> > read and write commands. Returns failure for other commands.
> > This is low overhead path for processing the inline commands housed
> > inside io_uring's sqe. Neither the commmand is fetched via
> > copy_from_user, nor the result (inside passthru command) is updated via
> > put_user.
> >
> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> > Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> > ---
> >  drivers/nvme/host/core.c      |   1 +
> >  drivers/nvme/host/ioctl.c     | 205 ++++++++++++++++++++++++++++------
> >  drivers/nvme/host/multipath.c |   1 +
> >  drivers/nvme/host/nvme.h      |   3 +
> >  4 files changed, 178 insertions(+), 32 deletions(-)
> >
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 159944499c4f..3fe8f5901cd9 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -3667,6 +3667,7 @@ static const struct file_operations
> > nvme_ns_chr_fops = {
> >       .release        = nvme_ns_chr_release,
> >       .unlocked_ioctl = nvme_ns_chr_ioctl,
> >       .compat_ioctl   = compat_ptr_ioctl,
> > +     .async_cmd      = nvme_ns_chr_async_cmd,
> >  };
> >
> >  static int nvme_add_ns_cdev(struct nvme_ns *ns)
> > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> > index 5c9cd9695519..1df270b47af5 100644
> > --- a/drivers/nvme/host/ioctl.c
> > +++ b/drivers/nvme/host/ioctl.c
> > @@ -18,6 +18,76 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
> >               ptrval = (compat_uptr_t)ptrval;
> >       return (void __user *)ptrval;
> >  }
> > +/*
> > + * This overlays struct io_uring_cmd pdu.
> > + * Expect build errors if this grows larger than that.
> > + */
> > +struct nvme_uring_cmd_pdu {
> > +     u32 meta_len;
> > +     union {
> > +             struct bio *bio;
> > +             struct request *req;
> > +     };
> > +     void *meta; /* kernel-resident buffer */
> > +     void __user *meta_buffer;
> > +} __packed;
> > +
> > +static struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(struct
> > io_uring_cmd *ioucmd)
> > +{
> > +     return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
> > +}
> > +
> > +static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd)
> > +{
> > +     struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> > +     struct request *req = pdu->req;
> > +     int status;
> > +     struct bio *bio = req->bio;
> > +
> > +     if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> > +             status = -EINTR;
> > +     else
> > +             status = nvme_req(req)->status;
> > +
> > +     /* we can free request */
> > +     blk_mq_free_request(req);
> > +     blk_rq_unmap_user(bio);
> > +
> > +     if (!status && pdu->meta_buffer) {
> > +             if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu-
> > >meta_len))
> > +                     status = -EFAULT;
> > +     }
> > +     kfree(pdu->meta);
> > +
> > +     io_uring_cmd_done(ioucmd, status);
> > +}
> > +
> > +static void nvme_end_async_pt(struct request *req, blk_status_t err)
> > +{
> > +     struct io_uring_cmd *ioucmd = req->end_io_data;
> > +     struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> > +     /* extract bio before reusing the same field for request */
> > +     struct bio *bio = pdu->bio;
> > +
> > +     pdu->req = req;
> > +     req->bio = bio;
> > +     /* this takes care of setting up task-work */
> > +     io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb);
> > +}
> > +
> > +static void nvme_setup_uring_cmd_data(struct request *rq,
> > +             struct io_uring_cmd *ioucmd, void *meta,
> > +             void __user *meta_buffer, u32 meta_len)
> > +{
> > +     struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> > +
> > +     /* to free bio on completion, as req->bio will be null at that time */
> > +     pdu->bio = rq->bio;
> > +     pdu->meta = meta;
> > +     pdu->meta_buffer = meta_buffer;
> > +     pdu->meta_len = meta_len;
> > +     rq->end_io_data = ioucmd;
> > +}
> >
> >  static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
> >               unsigned len, u32 seed, bool write)
> > @@ -56,7 +126,8 @@ static void *nvme_add_user_metadata(struct bio
> > *bio, void __user *ubuf,
> >  static int nvme_submit_user_cmd(struct request_queue *q,
> >               struct nvme_command *cmd, void __user *ubuffer,
> >               unsigned bufflen, void __user *meta_buffer, unsigned
> > meta_len,
> > -             u32 meta_seed, u64 *result, unsigned timeout)
> > +             u32 meta_seed, u64 *result, unsigned timeout,
> > +             struct io_uring_cmd *ioucmd)
> >  {
> >       bool write = nvme_is_write(cmd);
> >       struct nvme_ns *ns = q->queuedata;
> > @@ -64,9 +135,15 @@ static int nvme_submit_user_cmd(struct
> > request_queue *q,
> >       struct request *req;
> >       struct bio *bio = NULL;
> >       void *meta = NULL;
> > +     unsigned int rq_flags = 0;
> > +     blk_mq_req_flags_t blk_flags = 0;
> >       int ret;
> >
> > -     req = nvme_alloc_request(q, cmd, 0, 0);
> > +     if (ioucmd && (ioucmd->flags & IO_URING_F_NONBLOCK)) {
> > +             rq_flags |= REQ_NOWAIT;
> > +             blk_flags |= BLK_MQ_REQ_NOWAIT;
> > +     }
> > +     req = nvme_alloc_request(q, cmd, blk_flags, rq_flags);
> >       if (IS_ERR(req))
> >               return PTR_ERR(req);
> >
> > @@ -92,6 +169,19 @@ static int nvme_submit_user_cmd(struct
> > request_queue *q,
> >                       req->cmd_flags |= REQ_INTEGRITY;
> >               }
> >       }
> > +     if (ioucmd) { /* async dispatch */
> > +             if (cmd->common.opcode == nvme_cmd_write ||
> > +                             cmd->common.opcode == nvme_cmd_read) {
> > +                     nvme_setup_uring_cmd_data(req, ioucmd, meta,
> > meta_buffer,
> > +                                     meta_len);
> > +                     blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
> > +                     return 0;
> > +             } else {
> > +                     /* support only read and write for now. */
> > +                     ret = -EINVAL;
> > +                     goto out_meta;
> > +             }
> > +     }
> >
> >       ret = nvme_execute_passthru_rq(req);
> >       if (result)
> > @@ -100,6 +190,7 @@ static int nvme_submit_user_cmd(struct
> > request_queue *q,
> >               if (copy_to_user(meta_buffer, meta, meta_len))
> >                       ret = -EFAULT;
> >       }
> > + out_meta:
> >       kfree(meta);
> >   out_unmap:
> >       if (bio)
> > @@ -170,7 +261,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct
> > nvme_user_io __user *uio)
> >
> >       return nvme_submit_user_cmd(ns->queue, &c,
> >                       nvme_to_user_ptr(io.addr), length,
> > -                     metadata, meta_len, lower_32_bits(io.slba), NULL,
> > 0);
> > +                     metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
> > +                     NULL);
> >  }
> >
> >  static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
> > @@ -224,7 +316,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl,
> > struct nvme_ns *ns,
> >       status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
> >                       nvme_to_user_ptr(cmd.addr), cmd.data_len,
> >                       nvme_to_user_ptr(cmd.metadata),
> > cmd.metadata_len,
> > -                     0, &result, timeout);
> > +                     0, &result, timeout, NULL);
> >
> >       if (status >= 0) {
> >               if (put_user(result, &ucmd->result))
> > @@ -235,45 +327,53 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl,
> > struct nvme_ns *ns,
> >  }
> >
> >  static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> > -                     struct nvme_passthru_cmd64 __user *ucmd)
> > +                     struct nvme_passthru_cmd64 __user *ucmd,
> > +                     struct io_uring_cmd *ioucmd)
> >  {
> > -     struct nvme_passthru_cmd64 cmd;
> > +     struct nvme_passthru_cmd64 cmd, *cptr;
> >       struct nvme_command c;
> >       unsigned timeout = 0;
> >       int status;
> >
> >       if (!capable(CAP_SYS_ADMIN))
> >               return -EACCES;
> > -     if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
> > -             return -EFAULT;
> > -     if (cmd.flags)
> > +     if (!ioucmd) {
> > +             if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
> > +                     return -EFAULT;
> > +             cptr = &cmd;
> > +     } else {
> > +             if (ioucmd->cmd_len != sizeof(struct nvme_passthru_cmd64))
> > +                     return -EINVAL;
> > +             cptr = (struct nvme_passthru_cmd64 *)ioucmd->cmd;
> > +     }
> > +     if (cptr->flags)
> >               return -EINVAL;
> > -     if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
> > +     if (!nvme_validate_passthru_nsid(ctrl, ns, cptr->nsid))
> >               return -EINVAL;
> >
> >       memset(&c, 0, sizeof(c));
> > -     c.common.opcode = cmd.opcode;
> > -     c.common.flags = cmd.flags;
> > -     c.common.nsid = cpu_to_le32(cmd.nsid);
> > -     c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
> > -     c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
> > -     c.common.cdw10 = cpu_to_le32(cmd.cdw10);
> > -     c.common.cdw11 = cpu_to_le32(cmd.cdw11);
> > -     c.common.cdw12 = cpu_to_le32(cmd.cdw12);
> > -     c.common.cdw13 = cpu_to_le32(cmd.cdw13);
> > -     c.common.cdw14 = cpu_to_le32(cmd.cdw14);
> > -     c.common.cdw15 = cpu_to_le32(cmd.cdw15);
> > -
> > -     if (cmd.timeout_ms)
> > -             timeout = msecs_to_jiffies(cmd.timeout_ms);
> > +     c.common.opcode = cptr->opcode;
> > +     c.common.flags = cptr->flags;
> > +     c.common.nsid = cpu_to_le32(cptr->nsid);
> > +     c.common.cdw2[0] = cpu_to_le32(cptr->cdw2);
> > +     c.common.cdw2[1] = cpu_to_le32(cptr->cdw3);
> > +     c.common.cdw10 = cpu_to_le32(cptr->cdw10);
> > +     c.common.cdw11 = cpu_to_le32(cptr->cdw11);
> > +     c.common.cdw12 = cpu_to_le32(cptr->cdw12);
> > +     c.common.cdw13 = cpu_to_le32(cptr->cdw13);
> > +     c.common.cdw14 = cpu_to_le32(cptr->cdw14);
> > +     c.common.cdw15 = cpu_to_le32(cptr->cdw15);
> > +
> > +     if (cptr->timeout_ms)
> > +             timeout = msecs_to_jiffies(cptr->timeout_ms);
> >
> >       status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
> > -                     nvme_to_user_ptr(cmd.addr), cmd.data_len,
> > -                     nvme_to_user_ptr(cmd.metadata),
> > cmd.metadata_len,
> > -                     0, &cmd.result, timeout);
> > +                     nvme_to_user_ptr(cptr->addr), cptr->data_len,
> > +                     nvme_to_user_ptr(cptr->metadata), cptr-
> > >metadata_len,
> > +                     0, &cptr->result, timeout, ioucmd);
> >
> > -     if (status >= 0) {
> > -             if (put_user(cmd.result, &ucmd->result))
> > +     if (!ioucmd && status >= 0) {
> > +             if (put_user(cptr->result, &ucmd->result))
> >                       return -EFAULT;
> >       }
> >
> > @@ -296,7 +396,7 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl,
> > unsigned int cmd,
> >       case NVME_IOCTL_ADMIN_CMD:
> >               return nvme_user_cmd(ctrl, NULL, argp);
> >       case NVME_IOCTL_ADMIN64_CMD:
> > -             return nvme_user_cmd64(ctrl, NULL, argp);
> > +             return nvme_user_cmd64(ctrl, NULL, argp, NULL);
> >       default:
> >               return sed_ioctl(ctrl->opal_dev, cmd, argp);
> >       }
> > @@ -340,7 +440,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns,
> > unsigned int cmd,
> >       case NVME_IOCTL_SUBMIT_IO:
> >               return nvme_submit_io(ns, argp);
> >       case NVME_IOCTL_IO64_CMD:
> > -             return nvme_user_cmd64(ns->ctrl, ns, argp);
> > +             return nvme_user_cmd64(ns->ctrl, ns, argp, NULL);
> >       default:
> >               return -ENOTTY;
> >       }
> > @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int
> > cmd, unsigned long arg)
> >       return __nvme_ioctl(ns, cmd, (void __user *)arg);
> >  }
> >
> > +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd
> > *ioucmd)
> > +{
> > +     int ret;
> > +
> > +     BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) >
> > sizeof(ioucmd->pdu));
> > +
> > +     switch (ioucmd->cmd_op) {
> > +     case NVME_IOCTL_IO64_CMD:
> > +             ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
> > +             break;
> > +     default:
> > +             ret = -ENOTTY;
> > +     }
> > +
> > +     if (ret >= 0)
> > +             ret = -EIOCBQUEUED;
> > +     return ret;
> > +}
>
> ret can equal -EAGAIN, which will cause io_uring to reissue the cmd
> from a worker thread.  This can happen when ioucmd->flags has
> IO_URING_F_NONBLOCK set causing nvme_alloc_request() to return
> -EAGAIN when there are no tags available.
>
> Either -EAGAIN needs to be remapped or force set REQ_F_NOWAIT in the
> io_uring cmd request in patch 3 (the 2nd option is untested).

This patch already sets REQ_NOWAIT (and BLK_MQ_REQ_NOWAIT) when the
IO_URING_F_NONBLOCK flag is set. Here:
+ if (ioucmd && (ioucmd->flags & IO_URING_F_NONBLOCK)) {
+ rq_flags |= REQ_NOWAIT;
+ blk_flags |= BLK_MQ_REQ_NOWAIT;
+ }
+ req = nvme_alloc_request(q, cmd, blk_flags, rq_flags);

And if -EAGAIN goes to io_uring, we don't try to set up the worker and
instead return the error to userspace for retry.
Here is the relevant fragment from Patch 3:

+ ioucmd->flags |= issue_flags;
+ ret = file->f_op->async_cmd(ioucmd);
+ /* queued async, consumer will call io_uring_cmd_done() when complete */
+ if (ret == -EIOCBQUEUED)
+ return 0;
+ io_uring_cmd_done(ioucmd, ret);

> > +
> > +int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd)
> > +{
> > +     struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > +                     struct nvme_ns, cdev);
> > +
> > +     return nvme_ns_async_ioctl(ns, ioucmd);
> > +}
> > +
> >  #ifdef CONFIG_NVME_MULTIPATH
> >  static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
> >               void __user *argp, struct nvme_ns_head *head, int srcu_idx)
> > @@ -412,6 +539,20 @@ int nvme_ns_head_ioctl(struct block_device *bdev,
> > fmode_t mode,
> >       return ret;
> >  }
> >
> > +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
> > +{
> > +     struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
> > +     struct nvme_ns_head *head = container_of(cdev, struct
> > nvme_ns_head, cdev);
> > +     int srcu_idx = srcu_read_lock(&head->srcu);
> > +     struct nvme_ns *ns = nvme_find_path(head);
> > +     int ret = -EWOULDBLOCK;
>
> -EWOULDBLOCK has the same value as -EAGAIN so the same issue
> Is here as with nvme_ns_async_ioctl() returning it.

Same as above.

> > +
> > +     if (ns)
> > +             ret = nvme_ns_async_ioctl(ns, ioucmd);
> > +     srcu_read_unlock(&head->srcu, srcu_idx);
> > +     return ret;
> > +}
> > +
> >  long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
> >               unsigned long arg)
> >  {
> > @@ -480,7 +621,7 @@ long nvme_dev_ioctl(struct file *file, unsigned int
> > cmd,
> >       case NVME_IOCTL_ADMIN_CMD:
> >               return nvme_user_cmd(ctrl, NULL, argp);
> >       case NVME_IOCTL_ADMIN64_CMD:
> > -             return nvme_user_cmd64(ctrl, NULL, argp);
> > +             return nvme_user_cmd64(ctrl, NULL, argp, NULL);
> >       case NVME_IOCTL_IO_CMD:
> >               return nvme_dev_user_cmd(ctrl, argp);
> >       case NVME_IOCTL_RESET:
> > diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> > index f8bf6606eb2f..1d798d09456f 100644
> > --- a/drivers/nvme/host/multipath.c
> > +++ b/drivers/nvme/host/multipath.c
> > @@ -459,6 +459,7 @@ static const struct file_operations
> > nvme_ns_head_chr_fops = {
> >       .release        = nvme_ns_head_chr_release,
> >       .unlocked_ioctl = nvme_ns_head_chr_ioctl,
> >       .compat_ioctl   = compat_ptr_ioctl,
> > +     .async_cmd      = nvme_ns_head_chr_async_cmd,
> >  };
> >
> >  static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
> > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> > index b32f4e2c68fd..e6a30543d7c8 100644
> > --- a/drivers/nvme/host/nvme.h
> > +++ b/drivers/nvme/host/nvme.h
> > @@ -16,6 +16,7 @@
> >  #include <linux/rcupdate.h>
> >  #include <linux/wait.h>
> >  #include <linux/t10-pi.h>
> > +#include <linux/io_uring.h>
> >
> >  #include <trace/events/block.h>
> >
> > @@ -752,6 +753,8 @@ long nvme_ns_head_chr_ioctl(struct file *file,
> > unsigned int cmd,
> >               unsigned long arg);
> >  long nvme_dev_ioctl(struct file *file, unsigned int cmd,
> >               unsigned long arg);
> > +int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd);
> > +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd);
> >  int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
> >
> >  extern const struct attribute_group *nvme_ns_id_attr_groups[];
> > --
> > 2.25.1
>
> On 5.10 with our version of this patch, I've seen that returning -EAGAIN to
> io_uring results in poisoned bios and crashed kernel threads (NULL current->mm)
> while constructing the async pass through request.  I looked at
> git://git.kernel.dk/linux-block and git://git.infradead.org/nvme.git
> and as best as I can tell, the same thing will  happen.

As pointed out above in the snippet, any error except -EIOCBQUEUED is just
posted in the CQE during submission itself. So I do not see why -EAGAIN
should cause trouble, at least in this patchset.
FWIW, I tested by forcefully returning -EAGAIN from nvme, and also the
tag-saturation case (which also returns -EAGAIN), and did not see that
sort of issue.

Please take this series for a spin.
Kernel: for-next in linux-block, on top of commit 9e9d83faa
("io_uring: Remove unneeded test in io_run_task_work_sig")
Christoph Hellwig March 11, 2022, 7:01 a.m. UTC | #3
On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote:
> +/*
> + * This overlays struct io_uring_cmd pdu.
> + * Expect build errors if this grows larger than that.
> + */
> +struct nvme_uring_cmd_pdu {
> +	u32 meta_len;
> +	union {
> +		struct bio *bio;
> +		struct request *req;
> +	};
> +	void *meta; /* kernel-resident buffer */
> +	void __user *meta_buffer;
> +} __packed;

Why is this marked __packed?

In general I'd be much happier if the meta elements were an
io_uring-level feature handled outside the driver and typesafe in
struct io_uring_cmd, with just a normal private data pointer for the
actual user, which would remove all the crazy casting.

> +static void nvme_end_async_pt(struct request *req, blk_status_t err)
> +{
> +	struct io_uring_cmd *ioucmd = req->end_io_data;
> +	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +	/* extract bio before reusing the same field for request */
> +	struct bio *bio = pdu->bio;
> +
> +	pdu->req = req;
> +	req->bio = bio;
> +	/* this takes care of setting up task-work */
> +	io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb);

This is a bit silly.  First we defer the actual request I/O completion
from the block layer to a different CPU or softirq, and then we have
another callback here.  I think it would be much more useful if we
could find a way to enhance blk_mq_complete_request so that it could
directly complete in a given task.  That would also be really nice for,
say, normal synchronous direct I/O.
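
Something along these lines, purely as a sketch of the idea (no such helper
exists today; name and signature are made up):

	/*
	 * Hypothetical: complete a request as task work on the submitting
	 * task instead of bouncing through an IPI/softirq, so the final
	 * completion callback runs in that task's context.
	 */
	void blk_mq_complete_request_in_task(struct request *rq,
					     struct task_struct *task);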

> +	if (ioucmd) { /* async dispatch */
> +		if (cmd->common.opcode == nvme_cmd_write ||
> +				cmd->common.opcode == nvme_cmd_read) {

No we can't just check for specific commands in the passthrough handler.

> +			nvme_setup_uring_cmd_data(req, ioucmd, meta, meta_buffer,
> +					meta_len);
> +			blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
> +			return 0;
> +		} else {
> +			/* support only read and write for now. */
> +			ret = -EINVAL;
> +			goto out_meta;
> +		}

Please always handle the error in the first branch, and don't bother with
an else after a goto or return.
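
I.e. something like this reshuffle of the hunk above (untested sketch):

	if (ioucmd) { /* async dispatch */
		/* support only read and write for now */
		if (cmd->common.opcode != nvme_cmd_read &&
		    cmd->common.opcode != nvme_cmd_write) {
			ret = -EINVAL;
			goto out_meta;
		}
		nvme_setup_uring_cmd_data(req, ioucmd, meta, meta_buffer,
					  meta_len);
		blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
		return 0;
	}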

> +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
> +{
> +	int ret;
> +
> +	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
> +
> +	switch (ioucmd->cmd_op) {
> +	case NVME_IOCTL_IO64_CMD:
> +		ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
> +		break;
> +	default:
> +		ret = -ENOTTY;
> +	}
> +
> +	if (ret >= 0)
> +		ret = -EIOCBQUEUED;

That's a weird way to handle the returns.  Just return -EIOCBQUEUED
directly from the handler (which as said before should be split from
the ioctl handler anyway).
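
A sketch of what that could look like once the async submission is split out
of the ioctl handler; the handler name nvme_uring_cmd_io() is made up here:

	/* the hypothetical split-out async handler returns -EIOCBQUEUED itself
	 * right after blk_execute_rq_nowait(), so the caller just propagates: */
	switch (ioucmd->cmd_op) {
	case NVME_IOCTL_IO64_CMD:
		return nvme_uring_cmd_io(ns->ctrl, ns, ioucmd);
	default:
		return -ENOTTY;
	}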
Luis Chamberlain March 11, 2022, 5:56 p.m. UTC | #4
On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote:
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index 5c9cd9695519..1df270b47af5 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  	return __nvme_ioctl(ns, cmd, (void __user *)arg);
>  }
>  
> +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
> +{
> +	int ret;
> +
> +	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
> +
> +	switch (ioucmd->cmd_op) {
> +	case NVME_IOCTL_IO64_CMD:
> +		ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
> +		break;
> +	default:
> +		ret = -ENOTTY;
> +	}
> +
> +	if (ret >= 0)
> +		ret = -EIOCBQUEUED;
> +	return ret;
> +}

And here I think we'll need something like this:

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index ddb7e5864be6..83529adf130d 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -5,6 +5,7 @@
  */
 #include <linux/ptrace.h>	/* for force_successful_syscall_return */
 #include <linux/nvme_ioctl.h>
+#include <linux/security.h>
 #include "nvme.h"
 
 /*
@@ -524,6 +525,11 @@ static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
 
 	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
 
+	ret = security_file_ioctl(ioucmd->file, ioucmd->cmd_op,
+				  (unsigned long) ioucmd->cmd);
+	if (ret)
+		return ret;
+
 	switch (ioucmd->cmd_op) {
 	case NVME_IOCTL_IO64_CMD:
 		ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
Paul Moore March 11, 2022, 6:53 p.m. UTC | #5
On Fri, Mar 11, 2022 at 12:56 PM Luis Chamberlain <mcgrof@kernel.org> wrote:
>
> On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote:
> > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> > index 5c9cd9695519..1df270b47af5 100644
> > --- a/drivers/nvme/host/ioctl.c
> > +++ b/drivers/nvme/host/ioctl.c
> > @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >       return __nvme_ioctl(ns, cmd, (void __user *)arg);
> >  }
> >
> > +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
> > +{
> > +     int ret;
> > +
> > +     BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
> > +
> > +     switch (ioucmd->cmd_op) {
> > +     case NVME_IOCTL_IO64_CMD:
> > +             ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
> > +             break;
> > +     default:
> > +             ret = -ENOTTY;
> > +     }
> > +
> > +     if (ret >= 0)
> > +             ret = -EIOCBQUEUED;
> > +     return ret;
> > +}
>
> And here I think we'll need something like this:

If we can promise that we will have an LSM hook for all of the
file_operations::async_cmd implementations that are security relevant,
we could skip the LSM passthrough hook at the io_uring layer.  It
would potentially make life easier in that we don't have to worry
about putting the passthrough op in the right context, but it risks
missing an LSM hook control point (it will happen at some point and
*boom* CVE).
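
For reference, the io_uring-layer alternative would amount to a single hook
called before dispatching any ->async_cmd(); no such hook exists in this
series, so the name below is hypothetical:

	/* hypothetical LSM hook, invoked by io_uring before ->async_cmd() */
	int security_uring_cmd(struct io_uring_cmd *ioucmd);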

> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index ddb7e5864be6..83529adf130d 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -5,6 +5,7 @@
>   */
>  #include <linux/ptrace.h>      /* for force_successful_syscall_return */
>  #include <linux/nvme_ioctl.h>
> +#include <linux/security.h>
>  #include "nvme.h"
>
>  /*
> @@ -524,6 +525,11 @@ static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
>
>         BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
>
> +       ret = security_file_ioctl(ioucmd->file, ioucmd->cmd_op,
> +                                 (unsigned long) ioucmd->cmd);
> +       if (ret)
> +               return ret;
> +
>         switch (ioucmd->cmd_op) {
>         case NVME_IOCTL_IO64_CMD:
>                 ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
Luis Chamberlain March 11, 2022, 9:02 p.m. UTC | #6
On Fri, Mar 11, 2022 at 01:53:03PM -0500, Paul Moore wrote:
> On Fri, Mar 11, 2022 at 12:56 PM Luis Chamberlain <mcgrof@kernel.org> wrote:
> >
> > On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote:
> > > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> > > index 5c9cd9695519..1df270b47af5 100644
> > > --- a/drivers/nvme/host/ioctl.c
> > > +++ b/drivers/nvme/host/ioctl.c
> > > @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > >       return __nvme_ioctl(ns, cmd, (void __user *)arg);
> > >  }
> > >
> > > +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
> > > +{
> > > +     int ret;
> > > +
> > > +     BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
> > > +
> > > +     switch (ioucmd->cmd_op) {
> > > +     case NVME_IOCTL_IO64_CMD:
> > > +             ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
> > > +             break;
> > > +     default:
> > > +             ret = -ENOTTY;
> > > +     }
> > > +
> > > +     if (ret >= 0)
> > > +             ret = -EIOCBQUEUED;
> > > +     return ret;
> > > +}
> >
> > And here I think we'll need something like this:
> 
> If we can promise that we will have a LSM hook for all of the
> file_operations::async_cmd implementations that are security relevant
> we could skip the LSM passthrough hook at the io_uring layer.

There is no way to ensure this unless perhaps we bake that into
the API somehow... Or have a registration system for setting the
respective file ops / LSM.

> It
> would potentially make life easier in that we don't have to worry
> about putting the passthrough op in the right context, but risks
> missing a LSM hook control point (it will happen at some point and
> *boom* CVE).

Precisely my concern. So we either open-code this and ask folks
to do this, or I think we do a registration and require both the ops
and the LSM hook at registration.

I think this should be enough information to get Kanchan rolling
on the LSM side.

  Luis
Sagi Grimberg March 13, 2022, 9:53 p.m. UTC | #7
> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
> +{
> +	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
> +	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
> +	int srcu_idx = srcu_read_lock(&head->srcu);
> +	struct nvme_ns *ns = nvme_find_path(head);
> +	int ret = -EWOULDBLOCK;
> +
> +	if (ns)
> +		ret = nvme_ns_async_ioctl(ns, ioucmd);
> +	srcu_read_unlock(&head->srcu, srcu_idx);
> +	return ret;
> +}

No one cares that this has no multipathing capabilities whatsoever,
despite being issued on the mpath device node?

I know we are not doing multipathing for userspace today, but this
feels like an alternative I/O interface for nvme, and it seems a bit crippled
with zero multipathing capabilities...
Kanchan Joshi March 14, 2022, 4:23 p.m. UTC | #8
On Fri, Mar 11, 2022 at 08:01:48AM +0100, Christoph Hellwig wrote:
>On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote:
>> +/*
>> + * This overlays struct io_uring_cmd pdu.
>> + * Expect build errors if this grows larger than that.
>> + */
>> +struct nvme_uring_cmd_pdu {
>> +	u32 meta_len;
>> +	union {
>> +		struct bio *bio;
>> +		struct request *req;
>> +	};
>> +	void *meta; /* kernel-resident buffer */
>> +	void __user *meta_buffer;
>> +} __packed;
>
>Why is this marked __packed?
Did not like doing it, but had to.
If not packed, this takes 32 bytes of space, while the driver pdu in struct
io_uring_cmd can take at most 30 bytes. Packing the nvme pdu brought it down
to 28 bytes, which fits and gives 2 bytes back.

For quick reference - 
struct io_uring_cmd {
        struct file *              file;                 /*     0     8 */
        void *                     cmd;                  /*     8     8 */
        union {
                void *             bio;                  /*    16     8 */
                void               (*driver_cb)(struct io_uring_cmd *); /*    16     8 */
        };                                               /*    16     8 */
        u32                        flags;                /*    24     4 */
        u32                        cmd_op;               /*    28     4 */
        u16                        cmd_len;              /*    32     2 */
        u16                        unused;               /*    34     2 */
        u8                         pdu[28];              /*    36    28 */

        /* size: 64, cachelines: 1, members: 8 */
};
The io_uring_cmd struct goes into the first cacheline of io_kiocb.
The last field is pdu, taking 28 bytes. It would be 30 if I dropped the
field above.
The nvme pdu after packing:
struct nvme_uring_cmd_pdu {
        u32                        meta_len;             /*     0     4 */
        union {
                struct bio *       bio;                  /*     4     8 */
                struct request *   req;                  /*     4     8 */
        };                                               /*     4     8 */
        void *                     meta;                 /*    12     8 */
        void *                     meta_buffer;          /*    20     8 */

        /* size: 28, cachelines: 1, members: 4 */
        /* last cacheline: 28 bytes */
} __attribute__((__packed__));

>In general I'd be much more happy if the meta elelements were a
>io_uring-level feature handled outside the driver and typesafe in
>struct io_uring_cmd, with just a normal private data pointer for the
>actual user, which would remove all the crazy casting.

Not sure if I got your point.

+static struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(struct io_uring_cmd *ioucmd)
+{
+       return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
+static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd)
+{
+       struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);

Do you mean crazy casting inside nvme_uring_cmd_pdu()?
Somehow this looks sane to me (perhaps because it used to be crazier
earlier).

And on moving the meta elements outside the driver, my worry is that it
reduces the scope of the uring-cmd infra and makes it nvme-passthru specific.
At this point uring-cmd is still a generic async ioctl/fsctl facility
which may find other users (than nvme-passthru) down the line.
The organization of fields within "struct io_uring_cmd" is around the rule
that a field is kept out (of the 28-byte pdu) only if it is accessed by both
io_uring and the driver.

>> +static void nvme_end_async_pt(struct request *req, blk_status_t err)
>> +{
>> +	struct io_uring_cmd *ioucmd = req->end_io_data;
>> +	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>> +	/* extract bio before reusing the same field for request */
>> +	struct bio *bio = pdu->bio;
>> +
>> +	pdu->req = req;
>> +	req->bio = bio;
>> +	/* this takes care of setting up task-work */
>> +	io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb);
>
>This is a bit silly.  First we defer the actual request I/O completion
>from the block layer to a different CPU or softirq and then we have
>another callback here.  I think it would be much more useful if we
>could find a way to enhance blk_mq_complete_request so that it could
>directly complet in a given task.  That would also be really nice for
>say normal synchronous direct I/O.

I see, so there is room for adding some efficiency.
Hope it will be ok if I carry this out as a separate effort.
Since this is about touching blk_mq_complete_request at its heart, and
improving sync direct I/O, this does not seem the best fit here and would
slow this series down.

FWIW, I ran the tests with counters inside blk_mq_complete_request_remote()

        if (blk_mq_complete_need_ipi(rq)) {
                blk_mq_complete_send_ipi(rq);
                return true;
        }

        if (rq->q->nr_hw_queues == 1) {
                blk_mq_raise_softirq(rq);
                return true;
        }
Deferring by IPI or softirq never occurred, neither for block nor for
char. Softirq is obvious since I was not running against scsi (or nvme with
a single queue). I could not spot whether this is really an overhead, at
least for nvme.


>> +	if (ioucmd) { /* async dispatch */
>> +		if (cmd->common.opcode == nvme_cmd_write ||
>> +				cmd->common.opcode == nvme_cmd_read) {
>
>No we can't just check for specific commands in the passthrough handler.

Right. This is for the inline-cmd approach.
The last two patches of the series undo this (for indirect-cmd).
I will do something about it.

>> +			nvme_setup_uring_cmd_data(req, ioucmd, meta, meta_buffer,
>> +					meta_len);
>> +			blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
>> +			return 0;
>> +		} else {
>> +			/* support only read and write for now. */
>> +			ret = -EINVAL;
>> +			goto out_meta;
>> +		}
>
>Pleae always handle error in the first branch and don't bother with an
>else after a goto or return.

Yes, that'll be better.
>> +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
>> +{
>> +	int ret;
>> +
>> +	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
>> +
>> +	switch (ioucmd->cmd_op) {
>> +	case NVME_IOCTL_IO64_CMD:
>> +		ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
>> +		break;
>> +	default:
>> +		ret = -ENOTTY;
>> +	}
>> +
>> +	if (ret >= 0)
>> +		ret = -EIOCBQUEUED;
>
>That's a weird way to handle the returns.  Just return -EIOCBQUEUED
>directly from the handler (which as said before should be split from
>the ioctl handler anyway).

Indeed. That will make it cleaner.
Kanchan Joshi March 14, 2022, 5:54 p.m. UTC | #9
On Mon, Mar 14, 2022 at 3:23 AM Sagi Grimberg <sagi@grimberg.me> wrote:
>
>
> > +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
> > +{
> > +     struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
> > +     struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
> > +     int srcu_idx = srcu_read_lock(&head->srcu);
> > +     struct nvme_ns *ns = nvme_find_path(head);
> > +     int ret = -EWOULDBLOCK;
> > +
> > +     if (ns)
> > +             ret = nvme_ns_async_ioctl(ns, ioucmd);
> > +     srcu_read_unlock(&head->srcu, srcu_idx);
> > +     return ret;
> > +}
>
> No one cares that this has no multipathing capabilities what-so-ever?
> despite being issued on the mpath device node?
>
> I know we are not doing multipathing for userspace today, but this
> feels like an alternative I/O interface for nvme, seems a bit cripled
> with zero multipathing capabilities...

Multipathing is on the radar, either in the first cut or in a
subsequent one. Thanks for bringing this up.
So the char node (/dev/ngX) will be exposed to the host if we enable
controller passthru on the target side. And then the host can send
commands using uring-passthru in the same way.

May I know what the other requirements are here.
Bit of a shame that I missed adding that in the LSF proposal, but it's
correctable.
Christoph Hellwig March 15, 2022, 8:54 a.m. UTC | #10
On Mon, Mar 14, 2022 at 09:53:56PM +0530, Kanchan Joshi wrote:
>>> +struct nvme_uring_cmd_pdu {
>>> +	u32 meta_len;
>>> +	union {
>>> +		struct bio *bio;
>>> +		struct request *req;
>>> +	};
>>> +	void *meta; /* kernel-resident buffer */
>>> +	void __user *meta_buffer;
>>> +} __packed;
>>
>> Why is this marked __packed?
> Did not like doing it, but had to.
> If not packed, this takes 32 bytes of space. While driver-pdu in struct
> io_uring_cmd can take max 30 bytes. Packing nvme-pdu brought it down to
> 28 bytes, which fits and gives 2 bytes back.

What if you move meta_len to the end?  Even if we still need the __packed,
that will avoid all the unaligned access to the pointers, which on some
architectures will crash the kernel.
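
I.e. something like (sketch):

	struct nvme_uring_cmd_pdu {
		union {
			struct bio *bio;
			struct request *req;
		};
		void *meta; /* kernel-resident buffer */
		void __user *meta_buffer;
		u32 meta_len;
	} __packed;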

> And on moving meta elements outside the driver, my worry is that it
> reduces scope of uring-cmd infra and makes it nvme passthru specific.
> At this point uring-cmd is still generic async ioctl/fsctl facility
> which may find other users (than nvme-passthru) down the line. Organization 
> of fields within "struct io_uring_cmd" is around the rule
> that a field is kept out (of 28 bytes pdu) only if is accessed by both
> io_uring and driver. 

We have plenty of other interfaces of that kind.  Sockets are one case
already, and regular read/write with protection information will be
another one.  So having some core infrastructure for "secondary data"
seems very useful.

> I see, so there is room for adding some efficiency.
> Hope it will be ok if I carry this out as a separate effort.
> Since this is about touching blk_mq_complete_request at its heart, and
> improving sync-direct-io, this does not seem best-fit and slow this
> series down.

I'd really rather do this properly, especially as the current effort
adds new exported interfaces.

> Deferring by ipi or softirq never occured. Neither for block nor for
> char. Softirq is obvious since I was not running against scsi (or nvme with
> single queue). I could not spot whether this is really a overhead, at
> least for nvme.

This tends to kick in if you have fewer queues than CPU cores.  Quite
common with either a high core count or a not very high-end nvme
controller.
Sagi Grimberg March 15, 2022, 9:02 a.m. UTC | #11
>>> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
>>> +{
>>> +     struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
>>> +     struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
>>> +     int srcu_idx = srcu_read_lock(&head->srcu);
>>> +     struct nvme_ns *ns = nvme_find_path(head);
>>> +     int ret = -EWOULDBLOCK;
>>> +
>>> +     if (ns)
>>> +             ret = nvme_ns_async_ioctl(ns, ioucmd);
>>> +     srcu_read_unlock(&head->srcu, srcu_idx);
>>> +     return ret;
>>> +}
>>
>> No one cares that this has no multipathing capabilities what-so-ever?
>> despite being issued on the mpath device node?
>>
>> I know we are not doing multipathing for userspace today, but this
>> feels like an alternative I/O interface for nvme, seems a bit cripled
>> with zero multipathing capabilities...
> 
> Multipathing is on the radar. Either in the first cut or in
> subsequent. Thanks for bringing this up.

Good to know...

> So the char-node (/dev/ngX) will be exposed to the host if we enable
> controller passthru on the target side. And then the host can send
> commands using uring-passthru in the same way.

Not sure I follow...

> May I know what are the other requirements here.

Again, not sure I follow... The fundamental capability is to
requeue/failover I/O if there is no I/O capable path available...

> Bit of a shame that I missed adding that in the LSF proposal, but it's
> correctible.
Kanchan Joshi March 16, 2022, 7:27 a.m. UTC | #12
On Tue, Mar 15, 2022 at 09:54:10AM +0100, Christoph Hellwig wrote:
>On Mon, Mar 14, 2022 at 09:53:56PM +0530, Kanchan Joshi wrote:
>>>> +struct nvme_uring_cmd_pdu {
>>>> +	u32 meta_len;
>>>> +	union {
>>>> +		struct bio *bio;
>>>> +		struct request *req;
>>>> +	};
>>>> +	void *meta; /* kernel-resident buffer */
>>>> +	void __user *meta_buffer;
>>>> +} __packed;
>>>
>>> Why is this marked __packed?
>> Did not like doing it, but had to.
>> If not packed, this takes 32 bytes of space. While driver-pdu in struct
>> io_uring_cmd can take max 30 bytes. Packing nvme-pdu brought it down to
>> 28 bytes, which fits and gives 2 bytes back.
>
>What if you move meta_len to the end?  Even if we need the __packed
>that will avoid all the unaligned access to pointers, which on some
>architectures will crash the kernel.
Ah, right. Will move that to the end.

>> And on moving meta elements outside the driver, my worry is that it
>> reduces scope of uring-cmd infra and makes it nvme passthru specific.
>> At this point uring-cmd is still a generic async ioctl/fsctl facility
>> which may find other users (than nvme-passthru) down the line. The
>> organization of fields within "struct io_uring_cmd" follows the rule
>> that a field is kept out of the 28-byte pdu only if it is accessed by
>> both io_uring and the driver.
>
>We have plenty of other interfaces of that kind.  Sockets are one case
>already, and regular read/write with protection information will be
>another one.  So having some core infrastructure for "secondary data"
>seems very useful.
So what is the picture that you have in mind for struct io_uring_cmd?
Moving meta fields out makes it look like this - 

@@ -28,7 +28,10 @@ struct io_uring_cmd {
        u32             cmd_op;
        u16             cmd_len;
        u16             unused;
-       u8              pdu[28]; /* available inline for free use */
+       void __user     *meta_buffer; /* nvme pt specific */
+       u32             meta_len; /* nvme pt specific */
+       u8              pdu[16]; /* available inline for free use */
+
 };
And corresponding nvme 16 byte pdu - 
 struct nvme_uring_cmd_pdu {
-       u32 meta_len;
        union {
                struct bio *bio;
                struct request *req;
        };
        void *meta; /* kernel-resident buffer */
-       void __user *meta_buffer;
 } __packed;

I do not understand how this helps. Only the generic space (28 bytes)
got reduced to 16 bytes.

>> I see, so there is room for adding some efficiency.
>> Hope it will be ok if I carry this out as a separate effort.
>> Since this is about touching blk_mq_complete_request at its heart, and
>> improving sync-direct-io, this does not seem the best fit and would slow
>> this series down.
>
>I'd really rather do this properly.  Especially as the current effort
>adds new exported interfaces.

Seems you are referring to io_uring_cmd_complete_in_task().

We would still need to use/export that even if we somehow manage to move
task-work trigger from nvme-function to blk_mq_complete_request.
io_uring's task-work infra is more baked than raw task-work infra.
It would not be good to repeat all that code elsewhere.
I tried the raw one in the first attempt, and Jens suggested moving to the
baked one. Here is the link that gave birth to this interface -
https://lore.kernel.org/linux-nvme/6d847f4a-65a5-bc62-1d36-52e222e3d142@kernel.dk/
 

>> Deferring by IPI or softirq never occurred. Neither for block nor for
>> char. Softirq is obvious since I was not running against scsi (or nvme with
>> single queue). I could not spot whether this is really an overhead, at
>> least for nvme.
>
>This tends to kick in if you have fewer queues than CPU cores.  Quite
>common with either a high core count or a not very high end nvme
>controller.
I will check that.
But switching (irq to task-work) is more generic and not about this series.
Triggering task-work anyway happens for regular read/write
completion too (in io_uring)...in the same return path involving
blk_mq_complete_request. For passthru, we are just triggering this
somewhat earlier in the completion path.
Kanchan Joshi March 16, 2022, 9:21 a.m. UTC | #13
On Tue, Mar 15, 2022 at 11:02:30AM +0200, Sagi Grimberg wrote:
>
>>>>+int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
>>>>+{
>>>>+     struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
>>>>+     struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
>>>>+     int srcu_idx = srcu_read_lock(&head->srcu);
>>>>+     struct nvme_ns *ns = nvme_find_path(head);
>>>>+     int ret = -EWOULDBLOCK;
>>>>+
>>>>+     if (ns)
>>>>+             ret = nvme_ns_async_ioctl(ns, ioucmd);
>>>>+     srcu_read_unlock(&head->srcu, srcu_idx);
>>>>+     return ret;
>>>>+}
>>>
>>>No one cares that this has no multipathing capabilities what-so-ever?
>>>despite being issued on the mpath device node?
>>>
>>>I know we are not doing multipathing for userspace today, but this
>>feels like an alternative I/O interface for nvme, seems a bit crippled
>>>with zero multipathing capabilities...
>>
>>Multipathing is on the radar. Either in the first cut or in
>>subsequent. Thanks for bringing this up.
>
>Good to know...
>
>>So the char-node (/dev/ngX) will be exposed to the host if we enable
>>controller passthru on the target side. And then the host can send
>>commands using uring-passthru in the same way.
>
>Not sure I follow...

Doing this on target side:
echo -n /dev/nvme0 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/device_path
echo 1 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/enable

>>May I know what are the other requirements here.
>
>Again, not sure I follow... The fundamental capability is to
>requeue/failover I/O if there is no I/O capable path available...

That is covered I think, with nvme_find_path() at places including the
one you highlighted above.
Sagi Grimberg March 16, 2022, 10:56 a.m. UTC | #14
On 3/16/22 11:21, Kanchan Joshi wrote:
> On Tue, Mar 15, 2022 at 11:02:30AM +0200, Sagi Grimberg wrote:
>>
>>>>> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
>>>>> +{
>>>>> +     struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
>>>>> +     struct nvme_ns_head *head = container_of(cdev, struct 
>>>>> nvme_ns_head, cdev);
>>>>> +     int srcu_idx = srcu_read_lock(&head->srcu);
>>>>> +     struct nvme_ns *ns = nvme_find_path(head);
>>>>> +     int ret = -EWOULDBLOCK;
>>>>> +
>>>>> +     if (ns)
>>>>> +             ret = nvme_ns_async_ioctl(ns, ioucmd);
>>>>> +     srcu_read_unlock(&head->srcu, srcu_idx);
>>>>> +     return ret;
>>>>> +}
>>>>
>>>> No one cares that this has no multipathing capabilities what-so-ever?
>>>> despite being issued on the mpath device node?
>>>>
>>>> I know we are not doing multipathing for userspace today, but this
>>>> feels like an alternative I/O interface for nvme, seems a bit crippled
>>>> with zero multipathing capabilities...
>>>
>>> Multipathing is on the radar. Either in the first cut or in
>>> subsequent. Thanks for bringing this up.
>>
>> Good to know...
>>
>>> So the char-node (/dev/ngX) will be exposed to the host if we enable
>>> controller passthru on the target side. And then the host can send
>>> commands using uring-passthru in the same way.
>>
>> Not sure I follow...
> 
> Doing this on target side:
> echo -n /dev/nvme0 > 
> /sys/kernel/config/nvmet/subsystems/testnqn/passthru/device_path
> echo 1 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/enable

Cool, what does that have to do with what I asked?

>>> May I know what are the other requirements here.
>>
>> Again, not sure I follow... The fundamental capability is to
>> requeue/failover I/O if there is no I/O capable path available...
> 
> That is covered I think, with nvme_find_path() at places including the
> one you highlighted above.

No, it isn't. nvme_find_path is a simple function that retrieves an I/O
capable path, which is not guaranteed to exist; it has nothing to do with
I/O requeue/failover.

Please see nvme_ns_head_submit_bio, nvme_failover_req,
nvme_requeue_work.
Kanchan Joshi March 16, 2022, 11:51 a.m. UTC | #15
> On 3/16/22 11:21, Kanchan Joshi wrote:
> > On Tue, Mar 15, 2022 at 11:02:30AM +0200, Sagi Grimberg wrote:
> >>
> >>>>> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
> >>>>> +{
> >>>>> +     struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
> >>>>> +     struct nvme_ns_head *head = container_of(cdev, struct
> >>>>> nvme_ns_head, cdev);
> >>>>> +     int srcu_idx = srcu_read_lock(&head->srcu);
> >>>>> +     struct nvme_ns *ns = nvme_find_path(head);
> >>>>> +     int ret = -EWOULDBLOCK;
> >>>>> +
> >>>>> +     if (ns)
> >>>>> +             ret = nvme_ns_async_ioctl(ns, ioucmd);
> >>>>> +     srcu_read_unlock(&head->srcu, srcu_idx);
> >>>>> +     return ret;
> >>>>> +}
> >>>>
> >>>> No one cares that this has no multipathing capabilities what-so-ever?
> >>>> despite being issued on the mpath device node?
> >>>>
> >>>> I know we are not doing multipathing for userspace today, but this
> >>>> feels like an alternative I/O interface for nvme, seems a bit crippled
> >>>> with zero multipathing capabilities...
> >>>
> >>> Multipathing is on the radar. Either in the first cut or in
> >>> subsequent. Thanks for bringing this up.
> >>
> >> Good to know...
> >>
> >>> So the char-node (/dev/ngX) will be exposed to the host if we enable
> >>> controller passthru on the target side. And then the host can send
> >>> commands using uring-passthru in the same way.
> >>
> >> Not sure I follow...
> >
> > Doing this on target side:
> > echo -n /dev/nvme0 >
> > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/device_path
> > echo 1 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/enable
>
> Cool, what does that have to do with what I asked?
Maybe nothing.
This was rather about how to set up nvmeof if block-interface does not
exist for the underlying nvme device.

> >>> May I know what are the other requirements here.
> >>
> >> Again, not sure I follow... The fundamental capability is to
> >> requeue/failover I/O if there is no I/O capable path available...
> >
> > That is covered I think, with nvme_find_path() at places including the
> > one you highlighted above.
>
> No it isn't. nvme_find_path is a simple function that retrieves an I/O
> capable path which is not guaranteed to exist, it has nothing to do with
> I/O requeue/failover.

Got it, thanks. Passthrough (sync or async) just returns the failure
to user-space if that fails.
No attempt to retry/requeue as the block path does.
Sagi Grimberg March 16, 2022, 1:52 p.m. UTC | #16
>>>>>> No one cares that this has no multipathing capabilities what-so-ever?
>>>>>> despite being issued on the mpath device node?
>>>>>>
>>>>>> I know we are not doing multipathing for userspace today, but this
>>>>>> feels like an alternative I/O interface for nvme, seems a bit crippled
>>>>>> with zero multipathing capabilities...

[...]

> Got it, thanks. Passthrough (sync or async) just returns the failure
> to user-space if that fails.
> No attempt to retry/requeue as the block path does.

I know, and that was my original question, no one cares that this
interface completely lacks this capability? Maybe it is fine, but
it is not a trivial assumption given that this is designed to be more
than an interface to send admin/vs commands to the controller...
Jens Axboe March 16, 2022, 2:35 p.m. UTC | #17
On 3/16/22 7:52 AM, Sagi Grimberg wrote:
> 
>>>>>>> No one cares that this has no multipathing capabilities what-so-ever?
>>>>>>> despite being issued on the mpath device node?
>>>>>>>
>>>>>>> I know we are not doing multipathing for userspace today, but this
>>>>>>> feels like an alternative I/O interface for nvme, seems a bit crippled
>>>>>>> with zero multipathing capabilities...
> 
> [...]
> 
>> Got it, thanks. Passthrough (sync or async) just returns the failure
>> to user-space if that fails.
>> No attempt to retry/requeue as the block path does.
> 
> I know, and that was my original question, no one cares that this
> interface completely lacks this capability? Maybe it is fine, but
> it is not a trivial assumption given that this is designed to be more
> than an interface to send admin/vs commands to the controller...

Most people don't really care about or use multipath, so it's not a
primary goal. For passthrough, most request types should hit the
exact target. I would suggest that if someone cares about multipath for
specific commands, they be flagged as such.
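Something along these lines, purely as a sketch (the flag, the helper, and
the pt_flags parameter below are made up for illustration; nothing here is
part of this series):

/* hypothetical opt-in bit in nvme_passthru_cmd64.flags; the series
 * currently rejects any non-zero flags */
#define NVME_URING_CMD_MPATH_RETRY	(1 << 0)

/* hypothetical helper: at completion time, decide whether a failed
 * passthru command should be retried on another path instead of having
 * its error completed straight back to user space */
static bool nvme_uring_cmd_may_failover(u32 pt_flags, blk_status_t err)
{
	/* pt_flags: user-supplied passthru flags stashed at submission */
	return blk_path_error(err) && (pt_flags & NVME_URING_CMD_MPATH_RETRY);
}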
Sagi Grimberg March 16, 2022, 2:50 p.m. UTC | #18
>> [...]
>>
>>> Got it, thanks. Passthrough (sync or async) just returns the failure
>>> to user-space if that fails.
>>> No attempt to retry/requeue as the block path does.
>>
>> I know, and that was my original question, no one cares that this
>> interface completely lacks this capability? Maybe it is fine, but
>> it is not a trivial assumption given that this is designed to be more
>> than an interface to send admin/vs commands to the controller...
> 
> Most people don't really care about or use multipath, so it's not a
> primary goal.

This statement is generally correct. However what application would be 
interested in speaking raw nvme to a device and gaining performance that
is even higher than the block layer (which is great to begin with)?

First thing that comes to mind is a high-end storage array, where
dual-ported drives are considered to be the standard. I could argue the
same for a high-end oracle appliance or something like that... Although
in a lot of cases, each nvme port will connect to a different host...

What are the use-cases that need this interface that are the target
here? Don't remember seeing this come up in the cover-letter or previous
iterations...

> For passthrough, most request types should hit the
> exact target. I would suggest that if someone cares about multipath for
> specific commands, they be flagged as such.
What do you mean by "they be flagged as such"?
Clay Mayers March 22, 2022, 3:18 p.m. UTC | #19
> From: Kanchan Joshi
> Sent: Tuesday, March 8, 2022 7:21 AM
> To: axboe@kernel.dk; hch@lst.de; kbusch@kernel.org;
> asml.silence@gmail.com
> Cc: io-uring@vger.kernel.org; linux-nvme@lists.infradead.org; linux-
> block@vger.kernel.org; sbates@raithlin.com; logang@deltatee.com;
> pankydev8@gmail.com; javier@javigon.com; mcgrof@kernel.org;
> a.manzanares@samsung.com; joshiiitr@gmail.com; anuj20.g@samsung.com
> Subject: [PATCH 05/17] nvme: wire-up support for async-passthru on char-
> device.
> 


<snip>
> +static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd)
> +{
> +	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +	struct request *req = pdu->req;
> +	int status;
> +	struct bio *bio = req->bio;
> +
> +	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> +		status = -EINTR;
> +	else
> +		status = nvme_req(req)->status;
> +
> +	/* we can free request */
> +	blk_mq_free_request(req);
> +	blk_rq_unmap_user(bio);
> +
> +	if (!status && pdu->meta_buffer) {
> +		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu-
> >meta_len))

This copy is incorrectly called for writes.

> +			status = -EFAULT;
> +	}
> +	kfree(pdu->meta);
> +
> +	io_uring_cmd_done(ioucmd, status);
> +}
</snip>
Kanchan Joshi March 22, 2022, 4:57 p.m. UTC | #20
> > +
> > +     if (!status && pdu->meta_buffer) {
> > +             if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu-
> > >meta_len))
>
> This copy is incorrectly called for writes.

Indeed. Will fix this up in v2, thanks.
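Roughly, the fix would be to skip the copy-back for writes, e.g. (a sketch
only; it assumes the pdu additionally records the data direction at
submission time, which it does not do in this version):

	if (!status && pdu->meta_buffer && !pdu->write) {
		/* metadata is copied back to user space only for reads;
		 * pdu->write would be a new flag set from nvme_is_write()
		 * when the command is set up */
		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu->meta_len))
			status = -EFAULT;
	}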
Christoph Hellwig March 24, 2022, 6:20 a.m. UTC | #21
On Wed, Mar 16, 2022 at 04:50:53PM +0200, Sagi Grimberg wrote:
>>> I know, and that was my original question, no one cares that this
>>> interface completely lacks this capability? Maybe it is fine, but
>>> it is not a trivial assumption given that this is designed to be more
>>> than an interface to send admin/vs commands to the controller...
>>
>> Most people don't really care about or use multipath, so it's not a
>> primary goal.
>
> This statement is generally correct. However what application would be 
> interested in speaking raw nvme to a device and gaining performance that
> is even higher than the block layer (which is great to begin with)?

If passthrough is faster than the block I/O path we're doing something
wrong.  At best it should be the same performance.

That being said multipathing is an integral part of the nvme driver
architecture, and the /dev/ngX devices.  If we want to support uring
async commands on /dev/ngX it will have to support multipath.
Christoph Hellwig March 24, 2022, 6:22 a.m. UTC | #22
On Wed, Mar 16, 2022 at 12:57:27PM +0530, Kanchan Joshi wrote:
> So what is the picture that you have in mind for struct io_uring_cmd?
> Moving meta fields out makes it look like this - 


> @@ -28,7 +28,10 @@ struct io_uring_cmd {
>        u32             cmd_op;
>        u16             cmd_len;
>        u16             unused;
> -       u8              pdu[28]; /* available inline for free use */
> +       void __user     *meta_buffer; /* nvme pt specific */
> +       u32             meta_len; /* nvme pt specific */
> +       u8              pdu[16]; /* available inline for free use */
> +
> };
> And corresponding nvme 16 byte pdu - struct nvme_uring_cmd_pdu {
> -       u32 meta_len;
>        union {
>                struct bio *bio;
>                struct request *req;
>        };
>        void *meta; /* kernel-resident buffer */
> -       void __user *meta_buffer;
> } __packed;

No, I'd also move the meta field (and call it meta_buffer) to
struct io_uring_cmd, and replace the pdu array with a simple
 
 	void *private;

> We would still need to use/export that even if we somehow manage to move
> task-work trigger from nvme-function to blk_mq_complete_request.
> io_uring's task-work infra is more baked than raw task-work infra.
> It would not be good to repeat all that code elsewhere.
> I tried raw one in the first attempt, and Jens suggested to move to baked
> one. Here is the link that gave birth to this interface -
> https://lore.kernel.org/linux-nvme/6d847f4a-65a5-bc62-1d36-52e222e3d142@kernel.dk/

Ok.  I can't say I'm a fan of where this is going.
Sagi Grimberg March 24, 2022, 10:42 a.m. UTC | #23
>>>> I know, and that was my original question, no one cares that this
>>>> interface completely lacks this capability? Maybe it is fine, but
>>>> it is not a trivial assumption given that this is designed to be more
>>>> than an interface to send admin/vs commands to the controller...
>>>
>>> Most people don't really care about or use multipath, so it's not a
>>> primary goal.
>>
>> This statement is generally correct. However what application would be
>> interested in speaking raw nvme to a device and gaining performance that
>> is even higher than the block layer (which is great to begin with)?
> 
> If passthrough is faster than the block I/O path we're doing something
> wrong.  At best it should be the same performance.

That is not what the changelog says.

> That being said multipathing is an integral part of the nvme driver
> architecture, and the /dev/ngX devices.  If we want to support uring
> async commands on /dev/ngX it will have to support multipath.

Couldn't agree more...
Kanchan Joshi March 24, 2022, 5:45 p.m. UTC | #24
On Thu, Mar 24, 2022 at 11:52 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Wed, Mar 16, 2022 at 12:57:27PM +0530, Kanchan Joshi wrote:
> > So what is the picture that you have in mind for struct io_uring_cmd?
> > Moving meta fields out makes it look like this -
>
>
> > @@ -28,7 +28,10 @@ struct io_uring_cmd {
> >        u32             cmd_op;
> >        u16             cmd_len;
> >        u16             unused;
> > -       u8              pdu[28]; /* available inline for free use */
> > +       void __user     *meta_buffer; /* nvme pt specific */
> > +       u32             meta_len; /* nvme pt specific */
> > +       u8              pdu[16]; /* available inline for free use */
> > +
> > };
> > And corresponding nvme 16 byte pdu - struct nvme_uring_cmd_pdu {
> > -       u32 meta_len;
> >        union {
> >                struct bio *bio;
> >                struct request *req;
> >        };
> >        void *meta; /* kernel-resident buffer */
> > -       void __user *meta_buffer;
> > } __packed;
>
> No, I'd also move the meta field (and call it meta_buffer) to
> struct io_uring_cmd, and replace the pdu array with a simple
>
>         void *private;

That clears it up. We can go that route, but the tradeoff is that
while we clean up one cast in nvme, we end up making async-cmd way
too nvme-passthrough specific.
People have talked about using async-cmd for other use cases; Darrick
mentioned using it for xfs-scrub, and Luis had some ideas (other than
nvme) too.

The pdu array of 28 bytes is being used to avoid fast path
allocations. It got reduced to 8 bytes, and that is fine for one
nvme-ioctl as we moved other fields out.
But for other use-cases, 8 bytes of generic space may not be enough to
help with fast-path allocations.
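To make the comparison concrete, that direction gives roughly the following
shape (a rough guess only; the existing io_uring_cmd members and their
order are approximations, the last four fields are the point):

struct io_uring_cmd {
	struct file	*file;
	const void	*cmd;		/* SQE-resident payload */
	u32		cmd_op;
	u32		flags;
	u16		cmd_len;
	u16		unused;
	void __user	*meta_buffer;	/* nvme pt: user metadata pointer */
	u32		meta_len;	/* nvme pt: user metadata length */
	void		*meta;		/* nvme pt: kernel-resident buffer */
	void		*private;	/* driver scratch, replacing pdu[] */
};

For nvme the remaining per-command state is just the bio/req union, which
could be stashed directly in ->private without any allocation; the concern
above is that other async-cmd users with more than pointer-sized state
would then need a fast-path allocation, which the inline pdu[] was meant
to avoid.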
diff mbox series

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 159944499c4f..3fe8f5901cd9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3667,6 +3667,7 @@  static const struct file_operations nvme_ns_chr_fops = {
 	.release	= nvme_ns_chr_release,
 	.unlocked_ioctl	= nvme_ns_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
+	.async_cmd	= nvme_ns_chr_async_cmd,
 };
 
 static int nvme_add_ns_cdev(struct nvme_ns *ns)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 5c9cd9695519..1df270b47af5 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -18,6 +18,76 @@  static void __user *nvme_to_user_ptr(uintptr_t ptrval)
 		ptrval = (compat_uptr_t)ptrval;
 	return (void __user *)ptrval;
 }
+/*
+ * This overlays struct io_uring_cmd pdu.
+ * Expect build errors if this grows larger than that.
+ */
+struct nvme_uring_cmd_pdu {
+	u32 meta_len;
+	union {
+		struct bio *bio;
+		struct request *req;
+	};
+	void *meta; /* kernel-resident buffer */
+	void __user *meta_buffer;
+} __packed;
+
+static struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(struct io_uring_cmd *ioucmd)
+{
+	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
+static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd)
+{
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+	struct request *req = pdu->req;
+	int status;
+	struct bio *bio = req->bio;
+
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+		status = -EINTR;
+	else
+		status = nvme_req(req)->status;
+
+	/* we can free request */
+	blk_mq_free_request(req);
+	blk_rq_unmap_user(bio);
+
+	if (!status && pdu->meta_buffer) {
+		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu->meta_len))
+			status = -EFAULT;
+	}
+	kfree(pdu->meta);
+
+	io_uring_cmd_done(ioucmd, status);
+}
+
+static void nvme_end_async_pt(struct request *req, blk_status_t err)
+{
+	struct io_uring_cmd *ioucmd = req->end_io_data;
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+	/* extract bio before reusing the same field for request */
+	struct bio *bio = pdu->bio;
+
+	pdu->req = req;
+	req->bio = bio;
+	/* this takes care of setting up task-work */
+	io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb);
+}
+
+static void nvme_setup_uring_cmd_data(struct request *rq,
+		struct io_uring_cmd *ioucmd, void *meta,
+		void __user *meta_buffer, u32 meta_len)
+{
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+
+	/* to free bio on completion, as req->bio will be null at that time */
+	pdu->bio = rq->bio;
+	pdu->meta = meta;
+	pdu->meta_buffer = meta_buffer;
+	pdu->meta_len = meta_len;
+	rq->end_io_data = ioucmd;
+}
 
 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 		unsigned len, u32 seed, bool write)
@@ -56,7 +126,8 @@  static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 static int nvme_submit_user_cmd(struct request_queue *q,
 		struct nvme_command *cmd, void __user *ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, u64 *result, unsigned timeout)
+		u32 meta_seed, u64 *result, unsigned timeout,
+		struct io_uring_cmd *ioucmd)
 {
 	bool write = nvme_is_write(cmd);
 	struct nvme_ns *ns = q->queuedata;
@@ -64,9 +135,15 @@  static int nvme_submit_user_cmd(struct request_queue *q,
 	struct request *req;
 	struct bio *bio = NULL;
 	void *meta = NULL;
+	unsigned int rq_flags = 0;
+	blk_mq_req_flags_t blk_flags = 0;
 	int ret;
 
-	req = nvme_alloc_request(q, cmd, 0, 0);
+	if (ioucmd && (ioucmd->flags & IO_URING_F_NONBLOCK)) {
+		rq_flags |= REQ_NOWAIT;
+		blk_flags |= BLK_MQ_REQ_NOWAIT;
+	}
+	req = nvme_alloc_request(q, cmd, blk_flags, rq_flags);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -92,6 +169,19 @@  static int nvme_submit_user_cmd(struct request_queue *q,
 			req->cmd_flags |= REQ_INTEGRITY;
 		}
 	}
+	if (ioucmd) { /* async dispatch */
+		if (cmd->common.opcode == nvme_cmd_write ||
+				cmd->common.opcode == nvme_cmd_read) {
+			nvme_setup_uring_cmd_data(req, ioucmd, meta, meta_buffer,
+					meta_len);
+			blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
+			return 0;
+		} else {
+			/* support only read and write for now. */
+			ret = -EINVAL;
+			goto out_meta;
+		}
+	}
 
 	ret = nvme_execute_passthru_rq(req);
 	if (result)
@@ -100,6 +190,7 @@  static int nvme_submit_user_cmd(struct request_queue *q,
 		if (copy_to_user(meta_buffer, meta, meta_len))
 			ret = -EFAULT;
 	}
+ out_meta:
 	kfree(meta);
  out_unmap:
 	if (bio)
@@ -170,7 +261,8 @@  static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
 	return nvme_submit_user_cmd(ns->queue, &c,
 			nvme_to_user_ptr(io.addr), length,
-			metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
+			metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
+			NULL);
 }
 
 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
@@ -224,7 +316,7 @@  static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 			nvme_to_user_ptr(cmd.addr), cmd.data_len,
 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
-			0, &result, timeout);
+			0, &result, timeout, NULL);
 
 	if (status >= 0) {
 		if (put_user(result, &ucmd->result))
@@ -235,45 +327,53 @@  static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 }
 
 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-			struct nvme_passthru_cmd64 __user *ucmd)
+			struct nvme_passthru_cmd64 __user *ucmd,
+			struct io_uring_cmd *ioucmd)
 {
-	struct nvme_passthru_cmd64 cmd;
+	struct nvme_passthru_cmd64 cmd, *cptr;
 	struct nvme_command c;
 	unsigned timeout = 0;
 	int status;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
-		return -EFAULT;
-	if (cmd.flags)
+	if (!ioucmd) {
+		if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+			return -EFAULT;
+		cptr = &cmd;
+	} else {
+		if (ioucmd->cmd_len != sizeof(struct nvme_passthru_cmd64))
+			return -EINVAL;
+		cptr = (struct nvme_passthru_cmd64 *)ioucmd->cmd;
+	}
+	if (cptr->flags)
 		return -EINVAL;
-	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
+	if (!nvme_validate_passthru_nsid(ctrl, ns, cptr->nsid))
 		return -EINVAL;
 
 	memset(&c, 0, sizeof(c));
-	c.common.opcode = cmd.opcode;
-	c.common.flags = cmd.flags;
-	c.common.nsid = cpu_to_le32(cmd.nsid);
-	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
-	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
-	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
-	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
-	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
-	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
-	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
-
-	if (cmd.timeout_ms)
-		timeout = msecs_to_jiffies(cmd.timeout_ms);
+	c.common.opcode = cptr->opcode;
+	c.common.flags = cptr->flags;
+	c.common.nsid = cpu_to_le32(cptr->nsid);
+	c.common.cdw2[0] = cpu_to_le32(cptr->cdw2);
+	c.common.cdw2[1] = cpu_to_le32(cptr->cdw3);
+	c.common.cdw10 = cpu_to_le32(cptr->cdw10);
+	c.common.cdw11 = cpu_to_le32(cptr->cdw11);
+	c.common.cdw12 = cpu_to_le32(cptr->cdw12);
+	c.common.cdw13 = cpu_to_le32(cptr->cdw13);
+	c.common.cdw14 = cpu_to_le32(cptr->cdw14);
+	c.common.cdw15 = cpu_to_le32(cptr->cdw15);
+
+	if (cptr->timeout_ms)
+		timeout = msecs_to_jiffies(cptr->timeout_ms);
 
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
-			nvme_to_user_ptr(cmd.addr), cmd.data_len,
-			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
-			0, &cmd.result, timeout);
+			nvme_to_user_ptr(cptr->addr), cptr->data_len,
+			nvme_to_user_ptr(cptr->metadata), cptr->metadata_len,
+			0, &cptr->result, timeout, ioucmd);
 
-	if (status >= 0) {
-		if (put_user(cmd.result, &ucmd->result))
+	if (!ioucmd && status >= 0) {
+		if (put_user(cptr->result, &ucmd->result))
 			return -EFAULT;
 	}
 
@@ -296,7 +396,7 @@  static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
 	case NVME_IOCTL_ADMIN_CMD:
 		return nvme_user_cmd(ctrl, NULL, argp);
 	case NVME_IOCTL_ADMIN64_CMD:
-		return nvme_user_cmd64(ctrl, NULL, argp);
+		return nvme_user_cmd64(ctrl, NULL, argp, NULL);
 	default:
 		return sed_ioctl(ctrl->opal_dev, cmd, argp);
 	}
@@ -340,7 +440,7 @@  static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
 	case NVME_IOCTL_SUBMIT_IO:
 		return nvme_submit_io(ns, argp);
 	case NVME_IOCTL_IO64_CMD:
-		return nvme_user_cmd64(ns->ctrl, ns, argp);
+		return nvme_user_cmd64(ns->ctrl, ns, argp, NULL);
 	default:
 		return -ENOTTY;
 	}
@@ -369,6 +469,33 @@  long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return __nvme_ioctl(ns, cmd, (void __user *)arg);
 }
 
+static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
+
+	switch (ioucmd->cmd_op) {
+	case NVME_IOCTL_IO64_CMD:
+		ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
+		break;
+	default:
+		ret = -ENOTTY;
+	}
+
+	if (ret >= 0)
+		ret = -EIOCBQUEUED;
+	return ret;
+}
+
+int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd)
+{
+	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
+			struct nvme_ns, cdev);
+
+	return nvme_ns_async_ioctl(ns, ioucmd);
+}
+
 #ifdef CONFIG_NVME_MULTIPATH
 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
 		void __user *argp, struct nvme_ns_head *head, int srcu_idx)
@@ -412,6 +539,20 @@  int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
 	return ret;
 }
 
+int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
+{
+	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
+	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
+	int srcu_idx = srcu_read_lock(&head->srcu);
+	struct nvme_ns *ns = nvme_find_path(head);
+	int ret = -EWOULDBLOCK;
+
+	if (ns)
+		ret = nvme_ns_async_ioctl(ns, ioucmd);
+	srcu_read_unlock(&head->srcu, srcu_idx);
+	return ret;
+}
+
 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg)
 {
@@ -480,7 +621,7 @@  long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 	case NVME_IOCTL_ADMIN_CMD:
 		return nvme_user_cmd(ctrl, NULL, argp);
 	case NVME_IOCTL_ADMIN64_CMD:
-		return nvme_user_cmd64(ctrl, NULL, argp);
+		return nvme_user_cmd64(ctrl, NULL, argp, NULL);
 	case NVME_IOCTL_IO_CMD:
 		return nvme_dev_user_cmd(ctrl, argp);
 	case NVME_IOCTL_RESET:
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f8bf6606eb2f..1d798d09456f 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -459,6 +459,7 @@  static const struct file_operations nvme_ns_head_chr_fops = {
 	.release	= nvme_ns_head_chr_release,
 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
+	.async_cmd	= nvme_ns_head_chr_async_cmd,
 };
 
 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index b32f4e2c68fd..e6a30543d7c8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@ 
 #include <linux/rcupdate.h>
 #include <linux/wait.h>
 #include <linux/t10-pi.h>
+#include <linux/io_uring.h>
 
 #include <trace/events/block.h>
 
@@ -752,6 +753,8 @@  long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
+int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd);
+int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd);
 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 extern const struct attribute_group *nvme_ns_id_attr_groups[];