Message ID | 20220308152105.309618-6-joshi.k@samsung.com
---|---
State | New, archived
Series | io_uring passthru over nvme
> From: Linux-nvme <linux-nvme-bounces@lists.infradead.org> On Behalf Of > Kanchan Joshi > Sent: Tuesday, March 8, 2022 7:21 AM > To: axboe@kernel.dk; hch@lst.de; kbusch@kernel.org; > asml.silence@gmail.com > Cc: io-uring@vger.kernel.org; linux-nvme@lists.infradead.org; linux- > block@vger.kernel.org; sbates@raithlin.com; logang@deltatee.com; > pankydev8@gmail.com; javier@javigon.com; mcgrof@kernel.org; > a.manzanares@samsung.com; joshiiitr@gmail.com; anuj20.g@samsung.com > Subject: [PATCH 05/17] nvme: wire-up support for async-passthru on char- > device. > > Introduce handler for fops->async_cmd(), implementing async passthru > on char device (/dev/ngX). The handler supports NVME_IOCTL_IO64_CMD > for > read and write commands. Returns failure for other commands. > This is low overhead path for processing the inline commands housed > inside io_uring's sqe. Neither the commmand is fetched via > copy_from_user, nor the result (inside passthru command) is updated via > put_user. > > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com> > Signed-off-by: Anuj Gupta <anuj20.g@samsung.com> > --- > drivers/nvme/host/core.c | 1 + > drivers/nvme/host/ioctl.c | 205 ++++++++++++++++++++++++++++------ > drivers/nvme/host/multipath.c | 1 + > drivers/nvme/host/nvme.h | 3 + > 4 files changed, 178 insertions(+), 32 deletions(-) > > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c > index 159944499c4f..3fe8f5901cd9 100644 > --- a/drivers/nvme/host/core.c > +++ b/drivers/nvme/host/core.c > @@ -3667,6 +3667,7 @@ static const struct file_operations > nvme_ns_chr_fops = { > .release = nvme_ns_chr_release, > .unlocked_ioctl = nvme_ns_chr_ioctl, > .compat_ioctl = compat_ptr_ioctl, > + .async_cmd = nvme_ns_chr_async_cmd, > }; > > static int nvme_add_ns_cdev(struct nvme_ns *ns) > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c > index 5c9cd9695519..1df270b47af5 100644 > --- a/drivers/nvme/host/ioctl.c > +++ b/drivers/nvme/host/ioctl.c > @@ -18,6 +18,76 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval) > ptrval = (compat_uptr_t)ptrval; > return (void __user *)ptrval; > } > +/* > + * This overlays struct io_uring_cmd pdu. > + * Expect build errors if this grows larger than that. 
> + */ > +struct nvme_uring_cmd_pdu { > + u32 meta_len; > + union { > + struct bio *bio; > + struct request *req; > + }; > + void *meta; /* kernel-resident buffer */ > + void __user *meta_buffer; > +} __packed; > + > +static struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(struct > io_uring_cmd *ioucmd) > +{ > + return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; > +} > + > +static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd) > +{ > + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); > + struct request *req = pdu->req; > + int status; > + struct bio *bio = req->bio; > + > + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) > + status = -EINTR; > + else > + status = nvme_req(req)->status; > + > + /* we can free request */ > + blk_mq_free_request(req); > + blk_rq_unmap_user(bio); > + > + if (!status && pdu->meta_buffer) { > + if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu- > >meta_len)) > + status = -EFAULT; > + } > + kfree(pdu->meta); > + > + io_uring_cmd_done(ioucmd, status); > +} > + > +static void nvme_end_async_pt(struct request *req, blk_status_t err) > +{ > + struct io_uring_cmd *ioucmd = req->end_io_data; > + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); > + /* extract bio before reusing the same field for request */ > + struct bio *bio = pdu->bio; > + > + pdu->req = req; > + req->bio = bio; > + /* this takes care of setting up task-work */ > + io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb); > +} > + > +static void nvme_setup_uring_cmd_data(struct request *rq, > + struct io_uring_cmd *ioucmd, void *meta, > + void __user *meta_buffer, u32 meta_len) > +{ > + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); > + > + /* to free bio on completion, as req->bio will be null at that time */ > + pdu->bio = rq->bio; > + pdu->meta = meta; > + pdu->meta_buffer = meta_buffer; > + pdu->meta_len = meta_len; > + rq->end_io_data = ioucmd; > +} > > static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, > unsigned len, u32 seed, bool write) > @@ -56,7 +126,8 @@ static void *nvme_add_user_metadata(struct bio > *bio, void __user *ubuf, > static int nvme_submit_user_cmd(struct request_queue *q, > struct nvme_command *cmd, void __user *ubuffer, > unsigned bufflen, void __user *meta_buffer, unsigned > meta_len, > - u32 meta_seed, u64 *result, unsigned timeout) > + u32 meta_seed, u64 *result, unsigned timeout, > + struct io_uring_cmd *ioucmd) > { > bool write = nvme_is_write(cmd); > struct nvme_ns *ns = q->queuedata; > @@ -64,9 +135,15 @@ static int nvme_submit_user_cmd(struct > request_queue *q, > struct request *req; > struct bio *bio = NULL; > void *meta = NULL; > + unsigned int rq_flags = 0; > + blk_mq_req_flags_t blk_flags = 0; > int ret; > > - req = nvme_alloc_request(q, cmd, 0, 0); > + if (ioucmd && (ioucmd->flags & IO_URING_F_NONBLOCK)) { > + rq_flags |= REQ_NOWAIT; > + blk_flags |= BLK_MQ_REQ_NOWAIT; > + } > + req = nvme_alloc_request(q, cmd, blk_flags, rq_flags); > if (IS_ERR(req)) > return PTR_ERR(req); > > @@ -92,6 +169,19 @@ static int nvme_submit_user_cmd(struct > request_queue *q, > req->cmd_flags |= REQ_INTEGRITY; > } > } > + if (ioucmd) { /* async dispatch */ > + if (cmd->common.opcode == nvme_cmd_write || > + cmd->common.opcode == nvme_cmd_read) { > + nvme_setup_uring_cmd_data(req, ioucmd, meta, > meta_buffer, > + meta_len); > + blk_execute_rq_nowait(req, 0, nvme_end_async_pt); > + return 0; > + } else { > + /* support only read and write for now. 
*/ > + ret = -EINVAL; > + goto out_meta; > + } > + } > > ret = nvme_execute_passthru_rq(req); > if (result) > @@ -100,6 +190,7 @@ static int nvme_submit_user_cmd(struct > request_queue *q, > if (copy_to_user(meta_buffer, meta, meta_len)) > ret = -EFAULT; > } > + out_meta: > kfree(meta); > out_unmap: > if (bio) > @@ -170,7 +261,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct > nvme_user_io __user *uio) > > return nvme_submit_user_cmd(ns->queue, &c, > nvme_to_user_ptr(io.addr), length, > - metadata, meta_len, lower_32_bits(io.slba), NULL, > 0); > + metadata, meta_len, lower_32_bits(io.slba), NULL, 0, > + NULL); > } > > static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, > @@ -224,7 +316,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, > struct nvme_ns *ns, > status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, > nvme_to_user_ptr(cmd.addr), cmd.data_len, > nvme_to_user_ptr(cmd.metadata), > cmd.metadata_len, > - 0, &result, timeout); > + 0, &result, timeout, NULL); > > if (status >= 0) { > if (put_user(result, &ucmd->result)) > @@ -235,45 +327,53 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, > struct nvme_ns *ns, > } > > static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, > - struct nvme_passthru_cmd64 __user *ucmd) > + struct nvme_passthru_cmd64 __user *ucmd, > + struct io_uring_cmd *ioucmd) > { > - struct nvme_passthru_cmd64 cmd; > + struct nvme_passthru_cmd64 cmd, *cptr; > struct nvme_command c; > unsigned timeout = 0; > int status; > > if (!capable(CAP_SYS_ADMIN)) > return -EACCES; > - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) > - return -EFAULT; > - if (cmd.flags) > + if (!ioucmd) { > + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) > + return -EFAULT; > + cptr = &cmd; > + } else { > + if (ioucmd->cmd_len != sizeof(struct nvme_passthru_cmd64)) > + return -EINVAL; > + cptr = (struct nvme_passthru_cmd64 *)ioucmd->cmd; > + } > + if (cptr->flags) > return -EINVAL; > - if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) > + if (!nvme_validate_passthru_nsid(ctrl, ns, cptr->nsid)) > return -EINVAL; > > memset(&c, 0, sizeof(c)); > - c.common.opcode = cmd.opcode; > - c.common.flags = cmd.flags; > - c.common.nsid = cpu_to_le32(cmd.nsid); > - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); > - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); > - c.common.cdw10 = cpu_to_le32(cmd.cdw10); > - c.common.cdw11 = cpu_to_le32(cmd.cdw11); > - c.common.cdw12 = cpu_to_le32(cmd.cdw12); > - c.common.cdw13 = cpu_to_le32(cmd.cdw13); > - c.common.cdw14 = cpu_to_le32(cmd.cdw14); > - c.common.cdw15 = cpu_to_le32(cmd.cdw15); > - > - if (cmd.timeout_ms) > - timeout = msecs_to_jiffies(cmd.timeout_ms); > + c.common.opcode = cptr->opcode; > + c.common.flags = cptr->flags; > + c.common.nsid = cpu_to_le32(cptr->nsid); > + c.common.cdw2[0] = cpu_to_le32(cptr->cdw2); > + c.common.cdw2[1] = cpu_to_le32(cptr->cdw3); > + c.common.cdw10 = cpu_to_le32(cptr->cdw10); > + c.common.cdw11 = cpu_to_le32(cptr->cdw11); > + c.common.cdw12 = cpu_to_le32(cptr->cdw12); > + c.common.cdw13 = cpu_to_le32(cptr->cdw13); > + c.common.cdw14 = cpu_to_le32(cptr->cdw14); > + c.common.cdw15 = cpu_to_le32(cptr->cdw15); > + > + if (cptr->timeout_ms) > + timeout = msecs_to_jiffies(cptr->timeout_ms); > > status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, > - nvme_to_user_ptr(cmd.addr), cmd.data_len, > - nvme_to_user_ptr(cmd.metadata), > cmd.metadata_len, > - 0, &cmd.result, timeout); > + nvme_to_user_ptr(cptr->addr), cptr->data_len, > + nvme_to_user_ptr(cptr->metadata), cptr- > >metadata_len, > + 0, &cptr->result, timeout, ioucmd); > > - if (status >= 0) { > - if (put_user(cmd.result, &ucmd->result)) > + if (!ioucmd && status >= 0) { > + if (put_user(cptr->result, &ucmd->result)) > return -EFAULT; > } > > @@ -296,7 +396,7 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, > unsigned int cmd, > case NVME_IOCTL_ADMIN_CMD: > return nvme_user_cmd(ctrl, NULL, argp); > case NVME_IOCTL_ADMIN64_CMD: > - return nvme_user_cmd64(ctrl, NULL, argp); > + return nvme_user_cmd64(ctrl, NULL, argp, NULL); > default: > return sed_ioctl(ctrl->opal_dev, cmd, argp); > } > @@ -340,7 +440,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, > unsigned int cmd, > case NVME_IOCTL_SUBMIT_IO: > return nvme_submit_io(ns, argp); > case NVME_IOCTL_IO64_CMD: > - return nvme_user_cmd64(ns->ctrl, ns, argp); > + return nvme_user_cmd64(ns->ctrl, ns, argp, NULL); > default: > return -ENOTTY; > } > @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int > cmd, unsigned long arg) > return __nvme_ioctl(ns, cmd, (void __user *)arg); > } > > +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd > *ioucmd) > +{ > + int ret; > + > + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > > sizeof(ioucmd->pdu)); > + > + switch (ioucmd->cmd_op) { > + case NVME_IOCTL_IO64_CMD: > + ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd); > + break; > + default: > + ret = -ENOTTY; > + } > + > + if (ret >= 0) > + ret = -EIOCBQUEUED; > + return ret; > +} ret can equal -EAGAIN, which will cause io_uring to reissue the cmd from a worker thread. This can happen when ioucmd->flags has IO_URING_F_NONBLOCK set causing nvme_alloc_request() to return -EAGAIN when there are no tags available. Either -EAGAIN needs to be remapped or force set REQ_F_NOWAIT in the io_uring cmd request in patch 3 (the 2nd option is untested). > + > +int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd) > +{ > + struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, > + struct nvme_ns, cdev); > + > + return nvme_ns_async_ioctl(ns, ioucmd); > +} > + > #ifdef CONFIG_NVME_MULTIPATH > static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, > void __user *argp, struct nvme_ns_head *head, int srcu_idx) > @@ -412,6 +539,20 @@ int nvme_ns_head_ioctl(struct block_device *bdev, > fmode_t mode, > return ret; > } > > +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd) > +{ > + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; > + struct nvme_ns_head *head = container_of(cdev, struct > nvme_ns_head, cdev); > + int srcu_idx = srcu_read_lock(&head->srcu); > + struct nvme_ns *ns = nvme_find_path(head); > + int ret = -EWOULDBLOCK; -EWOULDBLOCK has the same value as -EAGAIN so the same issue Is here as with nvme_ns_async_ioctl() returning it. 
> + > + if (ns) > + ret = nvme_ns_async_ioctl(ns, ioucmd); > + srcu_read_unlock(&head->srcu, srcu_idx); > + return ret; > +} > + > long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, > unsigned long arg) > { > @@ -480,7 +621,7 @@ long nvme_dev_ioctl(struct file *file, unsigned int > cmd, > case NVME_IOCTL_ADMIN_CMD: > return nvme_user_cmd(ctrl, NULL, argp); > case NVME_IOCTL_ADMIN64_CMD: > - return nvme_user_cmd64(ctrl, NULL, argp); > + return nvme_user_cmd64(ctrl, NULL, argp, NULL); > case NVME_IOCTL_IO_CMD: > return nvme_dev_user_cmd(ctrl, argp); > case NVME_IOCTL_RESET: > diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c > index f8bf6606eb2f..1d798d09456f 100644 > --- a/drivers/nvme/host/multipath.c > +++ b/drivers/nvme/host/multipath.c > @@ -459,6 +459,7 @@ static const struct file_operations > nvme_ns_head_chr_fops = { > .release = nvme_ns_head_chr_release, > .unlocked_ioctl = nvme_ns_head_chr_ioctl, > .compat_ioctl = compat_ptr_ioctl, > + .async_cmd = nvme_ns_head_chr_async_cmd, > }; > > static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h > index b32f4e2c68fd..e6a30543d7c8 100644 > --- a/drivers/nvme/host/nvme.h > +++ b/drivers/nvme/host/nvme.h > @@ -16,6 +16,7 @@ > #include <linux/rcupdate.h> > #include <linux/wait.h> > #include <linux/t10-pi.h> > +#include <linux/io_uring.h> > > #include <trace/events/block.h> > > @@ -752,6 +753,8 @@ long nvme_ns_head_chr_ioctl(struct file *file, > unsigned int cmd, > unsigned long arg); > long nvme_dev_ioctl(struct file *file, unsigned int cmd, > unsigned long arg); > +int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd); > +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd); > int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); > > extern const struct attribute_group *nvme_ns_id_attr_groups[]; > -- > 2.25.1 On 5.10 with our version of this patch, I've seen that returning -EAGAIN to io_uring results in poisoned bios and crashed kernel threads (NULL current->mm) while constructing the async pass through request. I looked at git://git.kernel.dk/linux-block and git://git.infradead.org/nvme.git and as best as I can tell, the same thing will happen.
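
For reference, the first of the two options raised above (remapping -EAGAIN inside the nvme handler so io_uring's reissue logic never sees it) could look roughly like the sketch below. It is illustrative only and not part of the posted series; the choice of -EBUSY as the substitute errno is an assumption.

/*
 * Sketch only: remap -EAGAIN so io_uring never treats the failure as a
 * "retry from a worker" case and the error reaches userspace instead.
 */
static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
{
	int ret;

	switch (ioucmd->cmd_op) {
	case NVME_IOCTL_IO64_CMD:
		ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
		break;
	default:
		ret = -ENOTTY;
	}

	if (ret == -EAGAIN)	/* no tag free with BLK_MQ_REQ_NOWAIT set */
		ret = -EBUSY;	/* assumption: any errno other than -EAGAIN would do */
	if (ret >= 0)
		ret = -EIOCBQUEUED;
	return ret;
}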
On Thu, Mar 10, 2022 at 5:32 AM Clay Mayers <Clay.Mayers@kioxia.com> wrote:
>
> > Subject: [PATCH 05/17] nvme: wire-up support for async-passthru on char-
> > device.
>
> [...]
>
> > +	if (ret >= 0)
> > +		ret = -EIOCBQUEUED;
> > +	return ret;
> > +}
>
> ret can equal -EAGAIN, which will cause io_uring to reissue the cmd
> from a worker thread. This can happen when ioucmd->flags has
> IO_URING_F_NONBLOCK set causing nvme_alloc_request() to return
> -EAGAIN when there are no tags available.
>
> Either -EAGAIN needs to be remapped or force set REQ_F_NOWAIT in the
> io_uring cmd request in patch 3 (the 2nd option is untested).

This patch already sets REQ_F_NOWAIT for the IO_URING_F_NONBLOCK flag. Here:

+	if (ioucmd && (ioucmd->flags & IO_URING_F_NONBLOCK)) {
+		rq_flags |= REQ_NOWAIT;
+		blk_flags |= BLK_MQ_REQ_NOWAIT;
+	}
+	req = nvme_alloc_request(q, cmd, blk_flags, rq_flags);

And if -EAGAIN goes to io_uring, we don't try to set up the worker and
instead return the error to userspace for retry. Here is the relevant
fragment from Patch 3:

+	ioucmd->flags |= issue_flags;
+	ret = file->f_op->async_cmd(ioucmd);
+	/* queued async, consumer will call io_uring_cmd_done() when complete */
+	if (ret == -EIOCBQUEUED)
+		return 0;
+	io_uring_cmd_done(ioucmd, ret);

> > +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
> > +{
> [...]
> > +	int ret = -EWOULDBLOCK;
>
> -EWOULDBLOCK has the same value as -EAGAIN so the same issue
> is here as with nvme_ns_async_ioctl() returning it.

Same as above.

> [...]
>
> On 5.10 with our version of this patch, I've seen that returning -EAGAIN to
> io_uring results in poisoned bios and crashed kernel threads (NULL current->mm)
> while constructing the async pass through request. I looked at
> git://git.kernel.dk/linux-block and git://git.infradead.org/nvme.git
> and as best as I can tell, the same thing will happen.

As pointed out above in the snippet, any error except -EIOCBQUEUED is just
posted in the CQE during submission itself. So I do not see why -EAGAIN
should cause trouble, at least in this patchset.

FWIW, I tested by forcefully returning -EAGAIN from nvme, and also the
tag-saturation case (which also returns -EAGAIN), and did not see that
sort of issue.

Please take this series for a spin. Kernel: for-next in linux-block, on top
of commit 9e9d83faa ("io_uring: Remove unneeded test in io_run_task_work_sig").
On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote: > +/* > + * This overlays struct io_uring_cmd pdu. > + * Expect build errors if this grows larger than that. > + */ > +struct nvme_uring_cmd_pdu { > + u32 meta_len; > + union { > + struct bio *bio; > + struct request *req; > + }; > + void *meta; /* kernel-resident buffer */ > + void __user *meta_buffer; > +} __packed; Why is this marked __packed? In general I'd be much more happy if the meta elelements were a io_uring-level feature handled outside the driver and typesafe in struct io_uring_cmd, with just a normal private data pointer for the actual user, which would remove all the crazy casting. > +static void nvme_end_async_pt(struct request *req, blk_status_t err) > +{ > + struct io_uring_cmd *ioucmd = req->end_io_data; > + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); > + /* extract bio before reusing the same field for request */ > + struct bio *bio = pdu->bio; > + > + pdu->req = req; > + req->bio = bio; > + /* this takes care of setting up task-work */ > + io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb); This is a bit silly. First we defer the actual request I/O completion from the block layer to a different CPU or softirq and then we have another callback here. I think it would be much more useful if we could find a way to enhance blk_mq_complete_request so that it could directly complet in a given task. That would also be really nice for say normal synchronous direct I/O. > + if (ioucmd) { /* async dispatch */ > + if (cmd->common.opcode == nvme_cmd_write || > + cmd->common.opcode == nvme_cmd_read) { No we can't just check for specific commands in the passthrough handler. > + nvme_setup_uring_cmd_data(req, ioucmd, meta, meta_buffer, > + meta_len); > + blk_execute_rq_nowait(req, 0, nvme_end_async_pt); > + return 0; > + } else { > + /* support only read and write for now. */ > + ret = -EINVAL; > + goto out_meta; > + } Pleae always handle error in the first branch and don't bother with an else after a goto or return. > +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd) > +{ > + int ret; > + > + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); > + > + switch (ioucmd->cmd_op) { > + case NVME_IOCTL_IO64_CMD: > + ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd); > + break; > + default: > + ret = -ENOTTY; > + } > + > + if (ret >= 0) > + ret = -EIOCBQUEUED; That's a weird way to handle the returns. Just return -EIOCBQUEUED directly from the handler (which as said before should be split from the ioctl handler anyway).
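
A rough sketch of the split asked for at the end of this review, with the handler returning -EIOCBQUEUED directly instead of translating "ret >= 0" after the fact; the helper name nvme_uring_cmd_io() is hypothetical and stands in for the passthru submission path split out of the ioctl handler:

/*
 * Sketch only: uring-cmd entry point separated from the ioctl handler.
 * nvme_uring_cmd_io() is assumed to queue the request and return
 * -EIOCBQUEUED itself on success.
 */
static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
{
	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));

	switch (ioucmd->cmd_op) {
	case NVME_IOCTL_IO64_CMD:
		return nvme_uring_cmd_io(ns->ctrl, ns, ioucmd);
	default:
		return -ENOTTY;
	}
}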
On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote: > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c > index 5c9cd9695519..1df270b47af5 100644 > --- a/drivers/nvme/host/ioctl.c > +++ b/drivers/nvme/host/ioctl.c > @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) > return __nvme_ioctl(ns, cmd, (void __user *)arg); > } > > +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd) > +{ > + int ret; > + > + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); > + > + switch (ioucmd->cmd_op) { > + case NVME_IOCTL_IO64_CMD: > + ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd); > + break; > + default: > + ret = -ENOTTY; > + } > + > + if (ret >= 0) > + ret = -EIOCBQUEUED; > + return ret; > +} And here I think we'll need something like this: diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index ddb7e5864be6..83529adf130d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -5,6 +5,7 @@ */ #include <linux/ptrace.h> /* for force_successful_syscall_return */ #include <linux/nvme_ioctl.h> +#include <linux/security.h> #include "nvme.h" /* @@ -524,6 +525,11 @@ static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd) BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); + ret = security_file_ioctl(ioucmd->file, ioucmd->cmd_op, + (unsigned long) ioucmd->cmd); + if (ret) + return ret; + switch (ioucmd->cmd_op) { case NVME_IOCTL_IO64_CMD: ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
On Fri, Mar 11, 2022 at 12:56 PM Luis Chamberlain <mcgrof@kernel.org> wrote: > > On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote: > > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c > > index 5c9cd9695519..1df270b47af5 100644 > > --- a/drivers/nvme/host/ioctl.c > > +++ b/drivers/nvme/host/ioctl.c > > @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) > > return __nvme_ioctl(ns, cmd, (void __user *)arg); > > } > > > > +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd) > > +{ > > + int ret; > > + > > + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); > > + > > + switch (ioucmd->cmd_op) { > > + case NVME_IOCTL_IO64_CMD: > > + ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd); > > + break; > > + default: > > + ret = -ENOTTY; > > + } > > + > > + if (ret >= 0) > > + ret = -EIOCBQUEUED; > > + return ret; > > +} > > And here I think we'll need something like this: If we can promise that we will have a LSM hook for all of the file_operations::async_cmd implementations that are security relevant we could skip the LSM passthrough hook at the io_uring layer. It would potentially make life easier in that we don't have to worry about putting the passthrough op in the right context, but risks missing a LSM hook control point (it will happen at some point and *boom* CVE). > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c > index ddb7e5864be6..83529adf130d 100644 > --- a/drivers/nvme/host/ioctl.c > +++ b/drivers/nvme/host/ioctl.c > @@ -5,6 +5,7 @@ > */ > #include <linux/ptrace.h> /* for force_successful_syscall_return */ > #include <linux/nvme_ioctl.h> > +#include <linux/security.h> > #include "nvme.h" > > /* > @@ -524,6 +525,11 @@ static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd) > > BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); > > + ret = security_file_ioctl(ioucmd->file, ioucmd->cmd_op, > + (unsigned long) ioucmd->cmd); > + if (ret) > + return ret; > + > switch (ioucmd->cmd_op) { > case NVME_IOCTL_IO64_CMD: > ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
On Fri, Mar 11, 2022 at 01:53:03PM -0500, Paul Moore wrote: > On Fri, Mar 11, 2022 at 12:56 PM Luis Chamberlain <mcgrof@kernel.org> wrote: > > > > On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote: > > > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c > > > index 5c9cd9695519..1df270b47af5 100644 > > > --- a/drivers/nvme/host/ioctl.c > > > +++ b/drivers/nvme/host/ioctl.c > > > @@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) > > > return __nvme_ioctl(ns, cmd, (void __user *)arg); > > > } > > > > > > +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd) > > > +{ > > > + int ret; > > > + > > > + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); > > > + > > > + switch (ioucmd->cmd_op) { > > > + case NVME_IOCTL_IO64_CMD: > > > + ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd); > > > + break; > > > + default: > > > + ret = -ENOTTY; > > > + } > > > + > > > + if (ret >= 0) > > > + ret = -EIOCBQUEUED; > > > + return ret; > > > +} > > > > And here I think we'll need something like this: > > If we can promise that we will have a LSM hook for all of the > file_operations::async_cmd implementations that are security relevant > we could skip the LSM passthrough hook at the io_uring layer. There is no way to ensure this unless perhaps we cake that into the API somehow... Or have a registration system for setting the respctive file ops / LSM. > It > would potentially make life easier in that we don't have to worry > about putting the passthrough op in the right context, but risks > missing a LSM hook control point (it will happen at some point and > *boom* CVE). Precicely my concern. So we either open code this and ask folks to do this or I think we do a registration and require both ops and the the LSM hook at registration. I think this should be enough information to get Kanchan rolling on the LSM side. Luis
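
To illustrate the alternative being weighed here, a single LSM control point in the generic io_uring dispatch path rather than one per driver, a sketch could look like the following. The dispatch function shown is hypothetical; only security_file_ioctl() is taken from the hunk above.

/*
 * Hypothetical: take the LSM check once where io_uring invokes
 * file->f_op->async_cmd(), so individual drivers cannot miss it.
 * io_uring_dispatch_async_cmd() is not a real io_uring function.
 */
static int io_uring_dispatch_async_cmd(struct io_uring_cmd *ioucmd)
{
	int ret;

	ret = security_file_ioctl(ioucmd->file, ioucmd->cmd_op,
				  (unsigned long)ioucmd->cmd);
	if (ret)
		return ret;

	return ioucmd->file->f_op->async_cmd(ioucmd);
}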
> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd) > +{ > + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; > + struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); > + int srcu_idx = srcu_read_lock(&head->srcu); > + struct nvme_ns *ns = nvme_find_path(head); > + int ret = -EWOULDBLOCK; > + > + if (ns) > + ret = nvme_ns_async_ioctl(ns, ioucmd); > + srcu_read_unlock(&head->srcu, srcu_idx); > + return ret; > +} No one cares that this has no multipathing capabilities what-so-ever? despite being issued on the mpath device node? I know we are not doing multipathing for userspace today, but this feels like an alternative I/O interface for nvme, seems a bit cripled with zero multipathing capabilities...
On Fri, Mar 11, 2022 at 08:01:48AM +0100, Christoph Hellwig wrote: >On Tue, Mar 08, 2022 at 08:50:53PM +0530, Kanchan Joshi wrote: >> +/* >> + * This overlays struct io_uring_cmd pdu. >> + * Expect build errors if this grows larger than that. >> + */ >> +struct nvme_uring_cmd_pdu { >> + u32 meta_len; >> + union { >> + struct bio *bio; >> + struct request *req; >> + }; >> + void *meta; /* kernel-resident buffer */ >> + void __user *meta_buffer; >> +} __packed; > >Why is this marked __packed? Did not like doing it, but had to. If not packed, this takes 32 bytes of space. While driver-pdu in struct io_uring_cmd can take max 30 bytes. Packing nvme-pdu brought it down to 28 bytes, which fits and gives 2 bytes back. For quick reference - struct io_uring_cmd { struct file * file; /* 0 8 */ void * cmd; /* 8 8 */ union { void * bio; /* 16 8 */ void (*driver_cb)(struct io_uring_cmd *); /* 16 8 */ }; /* 16 8 */ u32 flags; /* 24 4 */ u32 cmd_op; /* 28 4 */ u16 cmd_len; /* 32 2 */ u16 unused; /* 34 2 */ u8 pdu[28]; /* 36 28 */ /* size: 64, cachelines: 1, members: 8 */ }; io_uring_cmd struct goes into the first cacheline of io_kiocb. Last field is pdu, taking 28 bytes. Will be 30 if I evaporate above field. nvme-pdu after packing: struct nvme_uring_cmd_pdu { u32 meta_len; /* 0 4 */ union { struct bio * bio; /* 4 8 */ struct request * req; /* 4 8 */ }; /* 4 8 */ void * meta; /* 12 8 */ void * meta_buffer; /* 20 8 */ /* size: 28, cachelines: 1, members: 4 */ /* last cacheline: 28 bytes */ } __attribute__((__packed__)); >In general I'd be much more happy if the meta elelements were a >io_uring-level feature handled outside the driver and typesafe in >struct io_uring_cmd, with just a normal private data pointer for the >actual user, which would remove all the crazy casting. Not sure if I got your point. +static struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(struct io_uring_cmd *ioucmd) +{ + return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; +} + +static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); Do you mean crazy casting inside nvme_uring_cmd_pdu()? Somehow this looks sane to me (perhaps because it used to be crazier earlier). And on moving meta elements outside the driver, my worry is that it reduces scope of uring-cmd infra and makes it nvme passthru specific. At this point uring-cmd is still generic async ioctl/fsctl facility which may find other users (than nvme-passthru) down the line. Organization of fields within "struct io_uring_cmd" is around the rule that a field is kept out (of 28 bytes pdu) only if is accessed by both io_uring and driver. >> +static void nvme_end_async_pt(struct request *req, blk_status_t err) >> +{ >> + struct io_uring_cmd *ioucmd = req->end_io_data; >> + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); >> + /* extract bio before reusing the same field for request */ >> + struct bio *bio = pdu->bio; >> + >> + pdu->req = req; >> + req->bio = bio; >> + /* this takes care of setting up task-work */ >> + io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb); > >This is a bit silly. First we defer the actual request I/O completion >from the block layer to a different CPU or softirq and then we have >another callback here. I think it would be much more useful if we >could find a way to enhance blk_mq_complete_request so that it could >directly complet in a given task. That would also be really nice for >say normal synchronous direct I/O. I see, so there is room for adding some efficiency. 
Hope it will be ok if I carry this out as a separate effort. Since this is about touching blk_mq_complete_request at its heart, and improving sync-direct-io, this does not seem best-fit and slow this series down. FWIW, I ran the tests with counters inside blk_mq_complete_request_remote() if (blk_mq_complete_need_ipi(rq)) { blk_mq_complete_send_ipi(rq); return true; } if (rq->q->nr_hw_queues == 1) { blk_mq_raise_softirq(rq); return true; } Deferring by ipi or softirq never occured. Neither for block nor for char. Softirq is obvious since I was not running against scsi (or nvme with single queue). I could not spot whether this is really a overhead, at least for nvme. >> + if (ioucmd) { /* async dispatch */ >> + if (cmd->common.opcode == nvme_cmd_write || >> + cmd->common.opcode == nvme_cmd_read) { > >No we can't just check for specific commands in the passthrough handler. Right. This is for inline-cmd approach. Last two patches of the series undo this (for indirect-cmd). I will do something about it. >> + nvme_setup_uring_cmd_data(req, ioucmd, meta, meta_buffer, >> + meta_len); >> + blk_execute_rq_nowait(req, 0, nvme_end_async_pt); >> + return 0; >> + } else { >> + /* support only read and write for now. */ >> + ret = -EINVAL; >> + goto out_meta; >> + } > >Pleae always handle error in the first branch and don't bother with an >else after a goto or return. Yes, that'll be better. >> +static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd) >> +{ >> + int ret; >> + >> + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); >> + >> + switch (ioucmd->cmd_op) { >> + case NVME_IOCTL_IO64_CMD: >> + ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd); >> + break; >> + default: >> + ret = -ENOTTY; >> + } >> + >> + if (ret >= 0) >> + ret = -EIOCBQUEUED; > >That's a weird way to handle the returns. Just return -EIOCBQUEUED >directly from the handler (which as said before should be split from >the ioctl handler anyway). Indeed. That will make it cleaner.
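
For reference, the error-first shape agreed on above (leaving aside the separate objection to opcode-specific checks) could read like the sketch below; it carries the same logic as the posted async-dispatch branch, only reordered:

	if (ioucmd) {	/* async dispatch */
		/* handle the unsupported case first; no else after a goto */
		if (cmd->common.opcode != nvme_cmd_read &&
		    cmd->common.opcode != nvme_cmd_write) {
			ret = -EINVAL;
			goto out_meta;
		}
		nvme_setup_uring_cmd_data(req, ioucmd, meta, meta_buffer,
					  meta_len);
		blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
		return 0;
	}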
On Mon, Mar 14, 2022 at 3:23 AM Sagi Grimberg <sagi@grimberg.me> wrote: > > > > +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd) > > +{ > > + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; > > + struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); > > + int srcu_idx = srcu_read_lock(&head->srcu); > > + struct nvme_ns *ns = nvme_find_path(head); > > + int ret = -EWOULDBLOCK; > > + > > + if (ns) > > + ret = nvme_ns_async_ioctl(ns, ioucmd); > > + srcu_read_unlock(&head->srcu, srcu_idx); > > + return ret; > > +} > > No one cares that this has no multipathing capabilities what-so-ever? > despite being issued on the mpath device node? > > I know we are not doing multipathing for userspace today, but this > feels like an alternative I/O interface for nvme, seems a bit cripled > with zero multipathing capabilities... Multipathing is on the radar. Either in the first cut or in subsequent. Thanks for bringing this up. So the char-node (/dev/ngX) will be exposed to the host if we enable controller passthru on the target side. And then the host can send commands using uring-passthru in the same way. May I know what are the other requirements here. Bit of a shame that I missed adding that in the LSF proposal, but it's correctible.
On Mon, Mar 14, 2022 at 09:53:56PM +0530, Kanchan Joshi wrote: >>> +struct nvme_uring_cmd_pdu { >>> + u32 meta_len; >>> + union { >>> + struct bio *bio; >>> + struct request *req; >>> + }; >>> + void *meta; /* kernel-resident buffer */ >>> + void __user *meta_buffer; >>> +} __packed; >> >> Why is this marked __packed? > Did not like doing it, but had to. > If not packed, this takes 32 bytes of space. While driver-pdu in struct > io_uring_cmd can take max 30 bytes. Packing nvme-pdu brought it down to > 28 bytes, which fits and gives 2 bytes back. What if you move meta_len to the end? Even if we need the __packed that will avoid all the unaligned access to pointers, which on some architectures will crash the kernel. > And on moving meta elements outside the driver, my worry is that it > reduces scope of uring-cmd infra and makes it nvme passthru specific. > At this point uring-cmd is still generic async ioctl/fsctl facility > which may find other users (than nvme-passthru) down the line. Organization > of fields within "struct io_uring_cmd" is around the rule > that a field is kept out (of 28 bytes pdu) only if is accessed by both > io_uring and driver. We have plenty of other interfaces of that kind. Sockets are one case already, and regular read/write with protection information will be another one. So having some core infrastrucure for "secondary data" seems very useful. > I see, so there is room for adding some efficiency. > Hope it will be ok if I carry this out as a separate effort. > Since this is about touching blk_mq_complete_request at its heart, and > improving sync-direct-io, this does not seem best-fit and slow this > series down. I really rather to this properly. Especially as the current effort adds new exported interfaces. > Deferring by ipi or softirq never occured. Neither for block nor for > char. Softirq is obvious since I was not running against scsi (or nvme with > single queue). I could not spot whether this is really a overhead, at > least for nvme. This tends to kick in if you have less queues than cpu cores. Quite command with either a high core cound or a not very high end nvme controller.
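
Concretely, the reordering suggested here would look like the sketch below; the members are unchanged from the posted pdu, only the order differs, so the pointer fields sit at naturally aligned offsets within the structure:

/* Sketch of the suggested layout: meta_len moved to the end. */
struct nvme_uring_cmd_pdu {
	union {
		struct bio *bio;
		struct request *req;
	};
	void *meta;			/* kernel-resident buffer */
	void __user *meta_buffer;
	u32 meta_len;
} __packed;				/* still 28 bytes */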
>>> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd) >>> +{ >>> + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; >>> + struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); >>> + int srcu_idx = srcu_read_lock(&head->srcu); >>> + struct nvme_ns *ns = nvme_find_path(head); >>> + int ret = -EWOULDBLOCK; >>> + >>> + if (ns) >>> + ret = nvme_ns_async_ioctl(ns, ioucmd); >>> + srcu_read_unlock(&head->srcu, srcu_idx); >>> + return ret; >>> +} >> >> No one cares that this has no multipathing capabilities what-so-ever? >> despite being issued on the mpath device node? >> >> I know we are not doing multipathing for userspace today, but this >> feels like an alternative I/O interface for nvme, seems a bit cripled >> with zero multipathing capabilities... > > Multipathing is on the radar. Either in the first cut or in > subsequent. Thanks for bringing this up. Good to know... > So the char-node (/dev/ngX) will be exposed to the host if we enable > controller passthru on the target side. And then the host can send > commands using uring-passthru in the same way. Not sure I follow... > May I know what are the other requirements here. Again, not sure I follow... The fundamental capability is to requeue/failover I/O if there is no I/O capable path available... > Bit of a shame that I missed adding that in the LSF proposal, but it's > correctible.
On Tue, Mar 15, 2022 at 09:54:10AM +0100, Christoph Hellwig wrote: >On Mon, Mar 14, 2022 at 09:53:56PM +0530, Kanchan Joshi wrote: >>>> +struct nvme_uring_cmd_pdu { >>>> + u32 meta_len; >>>> + union { >>>> + struct bio *bio; >>>> + struct request *req; >>>> + }; >>>> + void *meta; /* kernel-resident buffer */ >>>> + void __user *meta_buffer; >>>> +} __packed; >>> >>> Why is this marked __packed? >> Did not like doing it, but had to. >> If not packed, this takes 32 bytes of space. While driver-pdu in struct >> io_uring_cmd can take max 30 bytes. Packing nvme-pdu brought it down to >> 28 bytes, which fits and gives 2 bytes back. > >What if you move meta_len to the end? Even if we need the __packed >that will avoid all the unaligned access to pointers, which on some >architectures will crash the kernel. ah, right. Will move that to the end. >> And on moving meta elements outside the driver, my worry is that it >> reduces scope of uring-cmd infra and makes it nvme passthru specific. >> At this point uring-cmd is still generic async ioctl/fsctl facility >> which may find other users (than nvme-passthru) down the line. Organization >> of fields within "struct io_uring_cmd" is around the rule >> that a field is kept out (of 28 bytes pdu) only if is accessed by both >> io_uring and driver. > >We have plenty of other interfaces of that kind. Sockets are one case >already, and regular read/write with protection information will be >another one. So having some core infrastrucure for "secondary data" >seems very useful. So what is the picture that you have in mind for struct io_uring_cmd? Moving meta fields out makes it look like this - @@ -28,7 +28,10 @@ struct io_uring_cmd { u32 cmd_op; u16 cmd_len; u16 unused; - u8 pdu[28]; /* available inline for free use */ + void __user *meta_buffer; /* nvme pt specific */ + u32 meta_len; /* nvme pt specific */ + u8 pdu[16]; /* available inline for free use */ + }; And corresponding nvme 16 byte pdu - struct nvme_uring_cmd_pdu { - u32 meta_len; union { struct bio *bio; struct request *req; }; void *meta; /* kernel-resident buffer */ - void __user *meta_buffer; } __packed; I do not understand how this helps. Only the generic space (28 bytes) got reduced to 16 bytes. >> I see, so there is room for adding some efficiency. >> Hope it will be ok if I carry this out as a separate effort. >> Since this is about touching blk_mq_complete_request at its heart, and >> improving sync-direct-io, this does not seem best-fit and slow this >> series down. > >I really rather to this properly. Especially as the current effort >adds new exported interfaces. Seems you are referring to io_uring_cmd_complete_in_task(). We would still need to use/export that even if we somehow manage to move task-work trigger from nvme-function to blk_mq_complete_request. io_uring's task-work infra is more baked than raw task-work infra. It would not be good to repeat all that code elsewhere. I tried raw one in the first attempt, and Jens suggested to move to baked one. Here is the link that gave birth to this interface - https://lore.kernel.org/linux-nvme/6d847f4a-65a5-bc62-1d36-52e222e3d142@kernel.dk/ >> Deferring by ipi or softirq never occured. Neither for block nor for >> char. Softirq is obvious since I was not running against scsi (or nvme with >> single queue). I could not spot whether this is really a overhead, at >> least for nvme. > >This tends to kick in if you have less queues than cpu cores. Quite >command with either a high core cound or a not very high end nvme >controller. 
I will check that. But swtiching (irq to task-work) is more generic and not about this series. Triggering task-work anyway happens for regular read/write completion too (in io_uring)...in the same return path involving blk_mq_complete_request. For passthru, we are just triggering this somewhat earlier in the completion path.
On Tue, Mar 15, 2022 at 11:02:30AM +0200, Sagi Grimberg wrote: > >>>>+int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd) >>>>+{ >>>>+ struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; >>>>+ struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); >>>>+ int srcu_idx = srcu_read_lock(&head->srcu); >>>>+ struct nvme_ns *ns = nvme_find_path(head); >>>>+ int ret = -EWOULDBLOCK; >>>>+ >>>>+ if (ns) >>>>+ ret = nvme_ns_async_ioctl(ns, ioucmd); >>>>+ srcu_read_unlock(&head->srcu, srcu_idx); >>>>+ return ret; >>>>+} >>> >>>No one cares that this has no multipathing capabilities what-so-ever? >>>despite being issued on the mpath device node? >>> >>>I know we are not doing multipathing for userspace today, but this >>>feels like an alternative I/O interface for nvme, seems a bit cripled >>>with zero multipathing capabilities... >> >>Multipathing is on the radar. Either in the first cut or in >>subsequent. Thanks for bringing this up. > >Good to know... > >>So the char-node (/dev/ngX) will be exposed to the host if we enable >>controller passthru on the target side. And then the host can send >>commands using uring-passthru in the same way. > >Not sure I follow... Doing this on target side: echo -n /dev/nvme0 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/device_path echo 1 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/enable >>May I know what are the other requirements here. > >Again, not sure I follow... The fundamental capability is to >requeue/failover I/O if there is no I/O capable path available... That is covered I think, with nvme_find_path() at places including the one you highlighted above.
On 3/16/22 11:21, Kanchan Joshi wrote: > On Tue, Mar 15, 2022 at 11:02:30AM +0200, Sagi Grimberg wrote: >> >>>>> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd) >>>>> +{ >>>>> + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; >>>>> + struct nvme_ns_head *head = container_of(cdev, struct >>>>> nvme_ns_head, cdev); >>>>> + int srcu_idx = srcu_read_lock(&head->srcu); >>>>> + struct nvme_ns *ns = nvme_find_path(head); >>>>> + int ret = -EWOULDBLOCK; >>>>> + >>>>> + if (ns) >>>>> + ret = nvme_ns_async_ioctl(ns, ioucmd); >>>>> + srcu_read_unlock(&head->srcu, srcu_idx); >>>>> + return ret; >>>>> +} >>>> >>>> No one cares that this has no multipathing capabilities what-so-ever? >>>> despite being issued on the mpath device node? >>>> >>>> I know we are not doing multipathing for userspace today, but this >>>> feels like an alternative I/O interface for nvme, seems a bit cripled >>>> with zero multipathing capabilities... >>> >>> Multipathing is on the radar. Either in the first cut or in >>> subsequent. Thanks for bringing this up. >> >> Good to know... >> >>> So the char-node (/dev/ngX) will be exposed to the host if we enable >>> controller passthru on the target side. And then the host can send >>> commands using uring-passthru in the same way. >> >> Not sure I follow... > > Doing this on target side: > echo -n /dev/nvme0 > > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/device_path > echo 1 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/enable Cool, what does that have to do with what I asked? >>> May I know what are the other requirements here. >> >> Again, not sure I follow... The fundamental capability is to >> requeue/failover I/O if there is no I/O capable path available... > > That is covered I think, with nvme_find_path() at places including the > one you highlighted above. No it isn't. nvme_find_path is a simple function that retrieves an I/O capable path which is not guaranteed to exist, it has nothing to do with I/O requeue/failover. Please see nvme_ns_head_submit_bio, nvme_failover_req, nvme_requeue_work.
> On 3/16/22 11:21, Kanchan Joshi wrote:
> > On Tue, Mar 15, 2022 at 11:02:30AM +0200, Sagi Grimberg wrote:
> >>
> >>>>> +int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
> >>>>> +{
> >>>>> +	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
> >>>>> +	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
> >>>>> +	int srcu_idx = srcu_read_lock(&head->srcu);
> >>>>> +	struct nvme_ns *ns = nvme_find_path(head);
> >>>>> +	int ret = -EWOULDBLOCK;
> >>>>> +
> >>>>> +	if (ns)
> >>>>> +		ret = nvme_ns_async_ioctl(ns, ioucmd);
> >>>>> +	srcu_read_unlock(&head->srcu, srcu_idx);
> >>>>> +	return ret;
> >>>>> +}
> >>>>
> >>>> No one cares that this has no multipathing capabilities whatsoever,
> >>>> despite being issued on the mpath device node?
> >>>>
> >>>> I know we are not doing multipathing for userspace today, but this
> >>>> feels like an alternative I/O interface for nvme, seems a bit crippled
> >>>> with zero multipathing capabilities...
> >>>
> >>> Multipathing is on the radar. Either in the first cut or in
> >>> subsequent. Thanks for bringing this up.
> >>
> >> Good to know...
> >>
> >>> So the char-node (/dev/ngX) will be exposed to the host if we enable
> >>> controller passthru on the target side. And then the host can send
> >>> commands using uring-passthru in the same way.
> >>
> >> Not sure I follow...
> >
> > Doing this on target side:
> > echo -n /dev/nvme0 >
> > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/device_path
> > echo 1 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/enable
>
> Cool, what does that have to do with what I asked?

Maybe nothing. That was rather about how to set up nvmeof when the
block interface does not exist for the underlying nvme device.

> >>> May I know what are the other requirements here.
> >>
> >> Again, not sure I follow... The fundamental capability is to
> >> requeue/failover I/O if there is no I/O capable path available...
> >
> > That is covered I think, with nvme_find_path() at places including the
> > one you highlighted above.
>
> No it isn't. nvme_find_path is a simple function that retrieves an I/O
> capable path which is not guaranteed to exist; it has nothing to do with
> I/O requeue/failover.

Got it, thanks. Passthrough (sync or async) just returns the failure
to user-space if that fails.
No attempt to retry/requeue as the block path does.
>>>>>> No one cares that this has no multipathing capabilities whatsoever,
>>>>>> despite being issued on the mpath device node?
>>>>>>
>>>>>> I know we are not doing multipathing for userspace today, but this
>>>>>> feels like an alternative I/O interface for nvme, seems a bit crippled
>>>>>> with zero multipathing capabilities...

[...]

> Got it, thanks. Passthrough (sync or async) just returns the failure
> to user-space if that fails.
> No attempt to retry/requeue as the block path does.

I know, and that was my original question, no one cares that this
interface completely lacks this capability? Maybe it is fine, but
it is not a trivial assumption given that this is designed to be more
than an interface to send admin/vs commands to the controller...
On 3/16/22 7:52 AM, Sagi Grimberg wrote:
>
>>>>>>> No one cares that this has no multipathing capabilities whatsoever,
>>>>>>> despite being issued on the mpath device node?
>>>>>>>
>>>>>>> I know we are not doing multipathing for userspace today, but this
>>>>>>> feels like an alternative I/O interface for nvme, seems a bit crippled
>>>>>>> with zero multipathing capabilities...
>
> [...]
>
>> Got it, thanks. Passthrough (sync or async) just returns the failure
>> to user-space if that fails.
>> No attempt to retry/requeue as the block path does.
>
> I know, and that was my original question, no one cares that this
> interface completely lacks this capability? Maybe it is fine, but
> it is not a trivial assumption given that this is designed to be more
> than an interface to send admin/vs commands to the controller...

Most people don't really care about or use multipath, so it's not a
primary goal. For passthrough, most request types should hit the exact
target; I would suggest that if someone cares about multipath for
specific commands, they be flagged as such.
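One possible reading of "they be flagged as such", purely as an illustration (neither the flag nor the requeue helper below exists in this series or upstream): let the submitter opt individual passthru commands into path-aware handling, and take a failover/requeue detour only for those commands.

#define NVME_URING_F_MPATH_RETRY	(1 << 0)	/* hypothetical opt-in flag */

int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
{
	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
	int srcu_idx = srcu_read_lock(&head->srcu);
	struct nvme_ns *ns = nvme_find_path(head);
	int ret = -EWOULDBLOCK;

	if (ns)
		ret = nvme_ns_async_ioctl(ns, ioucmd);
	else if (ioucmd->flags & NVME_URING_F_MPATH_RETRY)
		ret = nvme_uring_cmd_requeue(head, ioucmd);	/* hypothetical helper */
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

How such opted-in commands would actually be requeued and resubmitted is left open here.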
>> [...]
>>
>>> Got it, thanks. Passthrough (sync or async) just returns the failure
>>> to user-space if that fails.
>>> No attempt to retry/requeue as the block path does.
>>
>> I know, and that was my original question, no one cares that this
>> interface completely lacks this capability? Maybe it is fine, but
>> it is not a trivial assumption given that this is designed to be more
>> than an interface to send admin/vs commands to the controller...
>
> Most people don't really care about or use multipath, so it's not a
> primary goal.

This statement is generally correct. However, what application would be
interested in speaking raw nvme to a device and gaining performance that
is even higher than the block layer (which is great to begin with)?

First thing that comes to mind is a high-end storage array, where
dual-ported drives are considered to be the standard. I could argue the
same for a high-end oracle appliance or something like that... Although
in a lot of cases, each nvme port will connect to a different host...

What are the use-cases that need this interface that are the target
here? Don't remember seeing this come up in the cover-letter or previous
iterations...

> For passthrough, most request types should hit the
> exact target; I would suggest that if someone cares about multipath for
> specific commands, they be flagged as such.

What do you mean by "they be flagged as such"?
> From: Kanchan Joshi
> Sent: Tuesday, March 8, 2022 7:21 AM
> To: axboe@kernel.dk; hch@lst.de; kbusch@kernel.org;
> asml.silence@gmail.com
> Cc: io-uring@vger.kernel.org; linux-nvme@lists.infradead.org; linux-
> block@vger.kernel.org; sbates@raithlin.com; logang@deltatee.com;
> pankydev8@gmail.com; javier@javigon.com; mcgrof@kernel.org;
> a.manzanares@samsung.com; joshiiitr@gmail.com; anuj20.g@samsung.com
> Subject: [PATCH 05/17] nvme: wire-up support for async-passthru on char-
> device.
>
<snip>
> +static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd)
> +{
> +	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +	struct request *req = pdu->req;
> +	int status;
> +	struct bio *bio = req->bio;
> +
> +	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> +		status = -EINTR;
> +	else
> +		status = nvme_req(req)->status;
> +
> +	/* we can free request */
> +	blk_mq_free_request(req);
> +	blk_rq_unmap_user(bio);
> +
> +	if (!status && pdu->meta_buffer) {
> +		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu->meta_len))

This copy is incorrectly called for writes.

> +			status = -EFAULT;
> +	}
> +	kfree(pdu->meta);
> +
> +	io_uring_cmd_done(ioucmd, status);
> +}
</snip>
> > +
> > +	if (!status && pdu->meta_buffer) {
> > +		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu->meta_len))
>
> This copy is incorrectly called for writes.

Indeed. Will fix this up in v2, thanks.
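A minimal sketch of the kind of v2 change being promised here, assuming the data direction gets recorded in the pdu at setup time ("write" is an illustrative field name, not something present in this series):

-	if (!status && pdu->meta_buffer) {
+	if (!status && pdu->meta_buffer && !pdu->write) {
 		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu->meta_len))
 			status = -EFAULT;
 	}

The same information could instead be derived from the command opcode before the request is freed; either way the metadata copy-back should only happen for reads.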
On Wed, Mar 16, 2022 at 04:50:53PM +0200, Sagi Grimberg wrote:
>>> I know, and that was my original question, no one cares that this
>>> interface completely lacks this capability? Maybe it is fine, but
>>> it is not a trivial assumption given that this is designed to be more
>>> than an interface to send admin/vs commands to the controller...
>>
>> Most people don't really care about or use multipath, so it's not a
>> primary goal.
>
> This statement is generally correct. However, what application would be
> interested in speaking raw nvme to a device and gaining performance that
> is even higher than the block layer (which is great to begin with)?

If passthrough is faster than the block I/O path, we're doing something
wrong. At best it should be the same performance.

That being said, multipathing is an integral part of the nvme driver
architecture, and the /dev/ngX devices. If we want to support uring
async commands on /dev/ngX it will have to support multipath.
On Wed, Mar 16, 2022 at 12:57:27PM +0530, Kanchan Joshi wrote:
> So what is the picture that you have in mind for struct io_uring_cmd?
> Moving meta fields out makes it look like this -
>
> @@ -28,7 +28,10 @@ struct io_uring_cmd {
>  	u32	cmd_op;
>  	u16	cmd_len;
>  	u16	unused;
> -	u8	pdu[28]; /* available inline for free use */
> +	void __user *meta_buffer; /* nvme pt specific */
> +	u32	meta_len; /* nvme pt specific */
> +	u8	pdu[16]; /* available inline for free use */
> +
> };
>
> And corresponding nvme 16 byte pdu -
> struct nvme_uring_cmd_pdu {
> -	u32 meta_len;
> 	union {
> 		struct bio *bio;
> 		struct request *req;
> 	};
> 	void *meta; /* kernel-resident buffer */
> -	void __user *meta_buffer;
> } __packed;

No, I'd also move the meta field (and call it meta_buffer) to
struct io_uring_cmd, and replace the pdu array with a simple

void *private;

> We would still need to use/export that even if we somehow manage to move
> task-work trigger from nvme-function to blk_mq_complete_request.
> io_uring's task-work infra is more baked than raw task-work infra.
> It would not be good to repeat all that code elsewhere.
> I tried raw one in the first attempt, and Jens suggested to move to baked
> one. Here is the link that gave birth to this interface -
> https://lore.kernel.org/linux-nvme/6d847f4a-65a5-bc62-1d36-52e222e3d142@kernel.dk/

Ok. I can't say I'm a fan of where this is going.
>>>> I know, and that was my original question, no one cares that this
>>>> interface completely lacks this capability? Maybe it is fine, but
>>>> it is not a trivial assumption given that this is designed to be more
>>>> than an interface to send admin/vs commands to the controller...
>>>
>>> Most people don't really care about or use multipath, so it's not a
>>> primary goal.
>>
>> This statement is generally correct. However, what application would be
>> interested in speaking raw nvme to a device and gaining performance that
>> is even higher than the block layer (which is great to begin with)?
>
> If passthrough is faster than the block I/O path, we're doing something
> wrong. At best it should be the same performance.

That is not what the changelog says.

> That being said, multipathing is an integral part of the nvme driver
> architecture, and the /dev/ngX devices. If we want to support uring
> async commands on /dev/ngX it will have to support multipath.

Couldn't agree more...
On Thu, Mar 24, 2022 at 11:52 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Wed, Mar 16, 2022 at 12:57:27PM +0530, Kanchan Joshi wrote:
> > So what is the picture that you have in mind for struct io_uring_cmd?
> > Moving meta fields out makes it look like this -
> >
> > @@ -28,7 +28,10 @@ struct io_uring_cmd {
> >  	u32	cmd_op;
> >  	u16	cmd_len;
> >  	u16	unused;
> > -	u8	pdu[28]; /* available inline for free use */
> > +	void __user *meta_buffer; /* nvme pt specific */
> > +	u32	meta_len; /* nvme pt specific */
> > +	u8	pdu[16]; /* available inline for free use */
> > +
> > };
> >
> > And corresponding nvme 16 byte pdu -
> > struct nvme_uring_cmd_pdu {
> > -	u32 meta_len;
> > 	union {
> > 		struct bio *bio;
> > 		struct request *req;
> > 	};
> > 	void *meta; /* kernel-resident buffer */
> > -	void __user *meta_buffer;
> > } __packed;
>
> No, I'd also move the meta field (and call it meta_buffer) to
> struct io_uring_cmd, and replace the pdu array with a simple
>
> void *private;

That clears it up. Can go that route, but the tradeoff is that while we
clean up one cast in nvme, we end up making async-cmd way too
nvme-passthrough specific.

People have talked about using async-cmd for other use cases; Darrick
mentioned using it for xfs-scrub, and Luis had some ideas (other than
nvme) too.

The pdu array of 28 bytes is being used to avoid fast-path allocations.
With this change it gets reduced to 8 bytes, which is fine for this
nvme ioctl since we moved the other fields out. But for other use-cases,
8 bytes of generic space may not be enough to avoid allocations in the
fast path.
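To make the tradeoff concrete, a rough illustration (names and layout are assumptions drawn from the snippets quoted above, not from an actual patch): an inline pdu is just a cast into space io_uring has already set aside, while a bare "void *private" forces a per-command allocation as soon as the driver's state no longer fits in a pointer.

/* (a) inline pdu, as in this series: no allocation on the submission path */
static int nvme_async_cmd_prep_inline(struct io_uring_cmd *ioucmd)
{
	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); /* cast only */

	pdu->bio = NULL;	/* state filled in at submission time */
	return 0;
}

/* (b) hypothetical "void *private" variant: state larger than a pointer
 * has to be allocated (and later freed) for every command */
static int nvme_async_cmd_prep_private(struct io_uring_cmd *ioucmd)
{
	struct nvme_uring_cmd_pdu *pdu = kmalloc(sizeof(*pdu), GFP_KERNEL);

	if (!pdu)
		return -ENOMEM;
	ioucmd->private = pdu;	/* freed in the completion path */
	return 0;
}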
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 159944499c4f..3fe8f5901cd9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3667,6 +3667,7 @@ static const struct file_operations nvme_ns_chr_fops = {
 	.release	= nvme_ns_chr_release,
 	.unlocked_ioctl	= nvme_ns_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
+	.async_cmd	= nvme_ns_chr_async_cmd,
 };
 
 static int nvme_add_ns_cdev(struct nvme_ns *ns)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 5c9cd9695519..1df270b47af5 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -18,6 +18,76 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
 		ptrval = (compat_uptr_t)ptrval;
 	return (void __user *)ptrval;
 }
+/*
+ * This overlays struct io_uring_cmd pdu.
+ * Expect build errors if this grows larger than that.
+ */
+struct nvme_uring_cmd_pdu {
+	u32 meta_len;
+	union {
+		struct bio *bio;
+		struct request *req;
+	};
+	void *meta; /* kernel-resident buffer */
+	void __user *meta_buffer;
+} __packed;
+
+static struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(struct io_uring_cmd *ioucmd)
+{
+	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
+static void nvme_pt_task_cb(struct io_uring_cmd *ioucmd)
+{
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+	struct request *req = pdu->req;
+	int status;
+	struct bio *bio = req->bio;
+
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+		status = -EINTR;
+	else
+		status = nvme_req(req)->status;
+
+	/* we can free request */
+	blk_mq_free_request(req);
+	blk_rq_unmap_user(bio);
+
+	if (!status && pdu->meta_buffer) {
+		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu->meta_len))
+			status = -EFAULT;
+	}
+	kfree(pdu->meta);
+
+	io_uring_cmd_done(ioucmd, status);
+}
+
+static void nvme_end_async_pt(struct request *req, blk_status_t err)
+{
+	struct io_uring_cmd *ioucmd = req->end_io_data;
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+	/* extract bio before reusing the same field for request */
+	struct bio *bio = pdu->bio;
+
+	pdu->req = req;
+	req->bio = bio;
+	/* this takes care of setting up task-work */
+	io_uring_cmd_complete_in_task(ioucmd, nvme_pt_task_cb);
+}
+
+static void nvme_setup_uring_cmd_data(struct request *rq,
+		struct io_uring_cmd *ioucmd, void *meta,
+		void __user *meta_buffer, u32 meta_len)
+{
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+
+	/* to free bio on completion, as req->bio will be null at that time */
+	pdu->bio = rq->bio;
+	pdu->meta = meta;
+	pdu->meta_buffer = meta_buffer;
+	pdu->meta_len = meta_len;
+	rq->end_io_data = ioucmd;
+}
 
 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 		unsigned len, u32 seed, bool write)
@@ -56,7 +126,8 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 static int nvme_submit_user_cmd(struct request_queue *q,
 		struct nvme_command *cmd, void __user *ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, u64 *result, unsigned timeout)
+		u32 meta_seed, u64 *result, unsigned timeout,
+		struct io_uring_cmd *ioucmd)
 {
 	bool write = nvme_is_write(cmd);
 	struct nvme_ns *ns = q->queuedata;
@@ -64,9 +135,15 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	struct request *req;
 	struct bio *bio = NULL;
 	void *meta = NULL;
+	unsigned int rq_flags = 0;
+	blk_mq_req_flags_t blk_flags = 0;
 	int ret;
 
-	req = nvme_alloc_request(q, cmd, 0, 0);
+	if (ioucmd && (ioucmd->flags & IO_URING_F_NONBLOCK)) {
+		rq_flags |= REQ_NOWAIT;
+		blk_flags |= BLK_MQ_REQ_NOWAIT;
+	}
+	req = nvme_alloc_request(q, cmd, blk_flags, rq_flags);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -92,6 +169,19 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 			req->cmd_flags |= REQ_INTEGRITY;
 		}
 	}
+	if (ioucmd) { /* async dispatch */
+		if (cmd->common.opcode == nvme_cmd_write ||
+				cmd->common.opcode == nvme_cmd_read) {
+			nvme_setup_uring_cmd_data(req, ioucmd, meta, meta_buffer,
+					meta_len);
+			blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
+			return 0;
+		} else {
+			/* support only read and write for now. */
+			ret = -EINVAL;
+			goto out_meta;
+		}
+	}
 
 	ret = nvme_execute_passthru_rq(req);
 	if (result)
@@ -100,6 +190,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		if (copy_to_user(meta_buffer, meta, meta_len))
 			ret = -EFAULT;
 	}
+ out_meta:
 	kfree(meta);
  out_unmap:
 	if (bio)
@@ -170,7 +261,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
 	return nvme_submit_user_cmd(ns->queue, &c,
 			nvme_to_user_ptr(io.addr), length,
-			metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
+			metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
+			NULL);
 }
 
 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
@@ -224,7 +316,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 			nvme_to_user_ptr(cmd.addr), cmd.data_len,
 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
-			0, &result, timeout);
+			0, &result, timeout, NULL);
 
 	if (status >= 0) {
 		if (put_user(result, &ucmd->result))
@@ -235,45 +327,53 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 }
 
 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-			struct nvme_passthru_cmd64 __user *ucmd)
+			struct nvme_passthru_cmd64 __user *ucmd,
+			struct io_uring_cmd *ioucmd)
 {
-	struct nvme_passthru_cmd64 cmd;
+	struct nvme_passthru_cmd64 cmd, *cptr;
 	struct nvme_command c;
 	unsigned timeout = 0;
 	int status;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
-		return -EFAULT;
-	if (cmd.flags)
+	if (!ioucmd) {
+		if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+			return -EFAULT;
+		cptr = &cmd;
+	} else {
+		if (ioucmd->cmd_len != sizeof(struct nvme_passthru_cmd64))
+			return -EINVAL;
+		cptr = (struct nvme_passthru_cmd64 *)ioucmd->cmd;
+	}
+	if (cptr->flags)
 		return -EINVAL;
-	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
+	if (!nvme_validate_passthru_nsid(ctrl, ns, cptr->nsid))
 		return -EINVAL;
 
 	memset(&c, 0, sizeof(c));
-	c.common.opcode = cmd.opcode;
-	c.common.flags = cmd.flags;
-	c.common.nsid = cpu_to_le32(cmd.nsid);
-	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
-	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
-	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
-	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
-	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
-	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
-	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
-
-	if (cmd.timeout_ms)
-		timeout = msecs_to_jiffies(cmd.timeout_ms);
+	c.common.opcode = cptr->opcode;
+	c.common.flags = cptr->flags;
+	c.common.nsid = cpu_to_le32(cptr->nsid);
+	c.common.cdw2[0] = cpu_to_le32(cptr->cdw2);
+	c.common.cdw2[1] = cpu_to_le32(cptr->cdw3);
+	c.common.cdw10 = cpu_to_le32(cptr->cdw10);
+	c.common.cdw11 = cpu_to_le32(cptr->cdw11);
+	c.common.cdw12 = cpu_to_le32(cptr->cdw12);
+	c.common.cdw13 = cpu_to_le32(cptr->cdw13);
+	c.common.cdw14 = cpu_to_le32(cptr->cdw14);
+	c.common.cdw15 = cpu_to_le32(cptr->cdw15);
+
+	if (cptr->timeout_ms)
+		timeout = msecs_to_jiffies(cptr->timeout_ms);
 
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
-			nvme_to_user_ptr(cmd.addr), cmd.data_len,
-			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
-			0, &cmd.result, timeout);
+			nvme_to_user_ptr(cptr->addr), cptr->data_len,
+			nvme_to_user_ptr(cptr->metadata), cptr->metadata_len,
+			0, &cptr->result, timeout, ioucmd);
 
-	if (status >= 0) {
-		if (put_user(cmd.result, &ucmd->result))
+	if (!ioucmd && status >= 0) {
+		if (put_user(cptr->result, &ucmd->result))
 			return -EFAULT;
 	}
 
@@ -296,7 +396,7 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
 	case NVME_IOCTL_ADMIN_CMD:
 		return nvme_user_cmd(ctrl, NULL, argp);
 	case NVME_IOCTL_ADMIN64_CMD:
-		return nvme_user_cmd64(ctrl, NULL, argp);
+		return nvme_user_cmd64(ctrl, NULL, argp, NULL);
 	default:
 		return sed_ioctl(ctrl->opal_dev, cmd, argp);
 	}
@@ -340,7 +440,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
 	case NVME_IOCTL_SUBMIT_IO:
 		return nvme_submit_io(ns, argp);
 	case NVME_IOCTL_IO64_CMD:
-		return nvme_user_cmd64(ns->ctrl, ns, argp);
+		return nvme_user_cmd64(ns->ctrl, ns, argp, NULL);
 	default:
 		return -ENOTTY;
 	}
@@ -369,6 +469,33 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return __nvme_ioctl(ns, cmd, (void __user *)arg);
 }
 
+static int nvme_ns_async_ioctl(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
+
+	switch (ioucmd->cmd_op) {
+	case NVME_IOCTL_IO64_CMD:
+		ret = nvme_user_cmd64(ns->ctrl, ns, NULL, ioucmd);
+		break;
+	default:
+		ret = -ENOTTY;
+	}
+
+	if (ret >= 0)
+		ret = -EIOCBQUEUED;
+	return ret;
+}
+
+int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd)
+{
+	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
+			struct nvme_ns, cdev);
+
+	return nvme_ns_async_ioctl(ns, ioucmd);
+}
+
 #ifdef CONFIG_NVME_MULTIPATH
 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
 		void __user *argp, struct nvme_ns_head *head, int srcu_idx)
@@ -412,6 +539,20 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
 	return ret;
 }
 
+int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd)
+{
+	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
+	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
+	int srcu_idx = srcu_read_lock(&head->srcu);
+	struct nvme_ns *ns = nvme_find_path(head);
+	int ret = -EWOULDBLOCK;
+
+	if (ns)
+		ret = nvme_ns_async_ioctl(ns, ioucmd);
+	srcu_read_unlock(&head->srcu, srcu_idx);
+	return ret;
+}
+
 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg)
 {
@@ -480,7 +621,7 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 	case NVME_IOCTL_ADMIN_CMD:
 		return nvme_user_cmd(ctrl, NULL, argp);
 	case NVME_IOCTL_ADMIN64_CMD:
-		return nvme_user_cmd64(ctrl, NULL, argp);
+		return nvme_user_cmd64(ctrl, NULL, argp, NULL);
 	case NVME_IOCTL_IO_CMD:
 		return nvme_dev_user_cmd(ctrl, argp);
 	case NVME_IOCTL_RESET:
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f8bf6606eb2f..1d798d09456f 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -459,6 +459,7 @@ static const struct file_operations nvme_ns_head_chr_fops = {
 	.release	= nvme_ns_head_chr_release,
 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
+	.async_cmd	= nvme_ns_head_chr_async_cmd,
 };
 
 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index b32f4e2c68fd..e6a30543d7c8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@
 #include <linux/rcupdate.h>
 #include <linux/wait.h>
 #include <linux/t10-pi.h>
+#include <linux/io_uring.h>
 
 #include <trace/events/block.h>
 
@@ -752,6 +753,8 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
+int nvme_ns_chr_async_cmd(struct io_uring_cmd *ioucmd);
+int nvme_ns_head_chr_async_cmd(struct io_uring_cmd *ioucmd);
 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 extern const struct attribute_group *nvme_ns_id_attr_groups[];